
[v10,2/5] mm: page_isolation: check specified range for unmovable pages

Message ID 20220406151858.3149821-3-zi.yan@sent.com (mailing list archive)
State New
Series Use pageblock_order for cma and alloc_contig_range alignment.

Commit Message

Zi Yan April 6, 2022, 3:18 p.m. UTC
From: Zi Yan <ziy@nvidia.com>

Enable set_migratetype_isolate() to check only the specified sub-range for
unmovable pages during isolation. Page isolation is done at
MAX_ORDER_NR_PAGES granularity, but not all pages within that granularity
are intended to be isolated. For example, alloc_contig_range(), which uses
page isolation, allows ranges without alignment. This commit makes the
unmovable page check look only at the pages of interest, so that page
isolation can succeed for any non-overlapping ranges.

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 mm/page_alloc.c     | 16 ++--------
 mm/page_isolation.c | 78 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 57 insertions(+), 37 deletions(-)
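
A minimal sketch of the idea (not an actual hunk; the clamping variables
below are illustrative only): isolation still operates on whole pageblocks,
but the unmovable-page check is clamped to the part of each pageblock that
the caller actually asked for.

/* pageblock currently being isolated */
unsigned long pageblock_start = ALIGN_DOWN(pfn, pageblock_nr_pages);
unsigned long pageblock_end   = pageblock_start + pageblock_nr_pages;

/* check only the intersection with the requested range */
unsigned long check_start = max(pageblock_start, start_pfn);
unsigned long check_end   = min(pageblock_end, end_pfn);
struct page *unmovable = has_unmovable_pages(check_start, check_end,
					     migratetype, isol_flags);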

Comments

David Hildenbrand April 12, 2022, 1:10 p.m. UTC | #1
On 06.04.22 17:18, Zi Yan wrote:
> From: Zi Yan <ziy@nvidia.com>
> 
> Enable set_migratetype_isolate() to check specified sub-range for
> unmovable pages during isolation. Page isolation is done
> at MAX_ORDER_NR_PAEGS granularity, but not all pages within that
> granularity are intended to be isolated. For example,
> alloc_contig_range(), which uses page isolation, allows ranges without
> alignment. This commit makes unmovable page check only look for
> interesting pages, so that page isolation can succeed for any
> non-overlapping ranges.
> 
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---

[...]

>  /*
> - * This function checks whether pageblock includes unmovable pages or not.
> + * This function checks whether the range [start_pfn, end_pfn) includes
> + * unmovable pages or not. The range must fall into a single pageblock and
> + * consequently belong to a single zone.
>   *
>   * PageLRU check without isolation or lru_lock could race so that
>   * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
> @@ -28,12 +30,14 @@
>   * cannot get removed (e.g., via memory unplug) concurrently.
>   *
>   */
> -static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
> -				 int migratetype, int flags)
> +static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
> +				int migratetype, int flags)
>  {
> -	unsigned long iter = 0;
> -	unsigned long pfn = page_to_pfn(page);
> -	unsigned long offset = pfn % pageblock_nr_pages;
> +	unsigned long pfn = start_pfn;
> +	struct page *page = pfn_to_page(pfn);


Just do

struct page *page = pfn_to_page(start_pfn);
struct zone *zone = page_zone(page);

here. No need to look up the zone again in the loop because, as you
document, the range "must ... belong to a single zone".

Then there is also no need to initialize "pfn" here; doing it in the loop
header is sufficient.
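
A sketch of the resulting prologue (only the prologue and the loop header
change; the rest of the function body stays as in the patch):

static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
					int migratetype, int flags)
{
	struct page *page = pfn_to_page(start_pfn);
	struct zone *zone = page_zone(page);
	unsigned long pfn;

	...

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		page = pfn_to_page(pfn);
		/* no per-iteration page_zone() lookup needed */
		...
	}
}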

> +
> +	VM_BUG_ON(ALIGN_DOWN(start_pfn, pageblock_nr_pages) !=
> +		  ALIGN_DOWN(end_pfn - 1, pageblock_nr_pages));
>  
>  	if (is_migrate_cma_page(page)) {
>  		/*
> @@ -47,8 +51,11 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>  		return page;
>  	}
>  
> -	for (; iter < pageblock_nr_pages - offset; iter++) {
> -		page = pfn_to_page(pfn + iter);
> +	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
> +		struct zone *zone;
> +
> +		page = pfn_to_page(pfn);
> +		zone = page_zone(page);
>  
>  		/*
>  		 * Both, bootmem allocations and memory holes are marked
> @@ -85,7 +92,7 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>  			}
>  
>  			skip_pages = compound_nr(head) - (page - head);
> -			iter += skip_pages - 1;
> +			pfn += skip_pages - 1;
>  			continue;
>  		}
>  
> @@ -97,7 +104,7 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>  		 */
>  		if (!page_ref_count(page)) {
>  			if (PageBuddy(page))
> -				iter += (1 << buddy_order(page)) - 1;
> +				pfn += (1 << buddy_order(page)) - 1;
>  			continue;
>  		}
>  
> @@ -134,11 +141,18 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>  	return NULL;
>  }
>  
> -static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
> +/*
> + * This function set pageblock migratetype to isolate if no unmovable page is
> + * present in [start_pfn, end_pfn). The pageblock must intersect with
> + * [start_pfn, end_pfn).
> + */
> +static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags,
> +			unsigned long start_pfn, unsigned long end_pfn)

I think we might be able to do better, eventually not passing start_pfn at
all. Hmm.

I think we want to pull out the
start_isolate_page_range()/undo_isolate_page_range() interface change
into a separate patch.

Let me try to give it a shot, I'll try hacking something up real quick
to see if we can do better.
Zi Yan April 12, 2022, 2:07 p.m. UTC | #2
On 12 Apr 2022, at 9:10, David Hildenbrand wrote:

> On 06.04.22 17:18, Zi Yan wrote:
>> From: Zi Yan <ziy@nvidia.com>
>>
>> Enable set_migratetype_isolate() to check specified sub-range for
>> unmovable pages during isolation. Page isolation is done
>> at MAX_ORDER_NR_PAEGS granularity, but not all pages within that
>> granularity are intended to be isolated. For example,
>> alloc_contig_range(), which uses page isolation, allows ranges without
>> alignment. This commit makes unmovable page check only look for
>> interesting pages, so that page isolation can succeed for any
>> non-overlapping ranges.
>>
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> ---
>
> [...]
>
>>  /*
>> - * This function checks whether pageblock includes unmovable pages or not.
>> + * This function checks whether the range [start_pfn, end_pfn) includes
>> + * unmovable pages or not. The range must fall into a single pageblock and
>> + * consequently belong to a single zone.
>>   *
>>   * PageLRU check without isolation or lru_lock could race so that
>>   * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
>> @@ -28,12 +30,14 @@
>>   * cannot get removed (e.g., via memory unplug) concurrently.
>>   *
>>   */
>> -static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>> -				 int migratetype, int flags)
>> +static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
>> +				int migratetype, int flags)
>>  {
>> -	unsigned long iter = 0;
>> -	unsigned long pfn = page_to_pfn(page);
>> -	unsigned long offset = pfn % pageblock_nr_pages;
>> +	unsigned long pfn = start_pfn;
>> +	struct page *page = pfn_to_page(pfn);
>
>
> Just do
>
> struct page *page = pfn_to_page(start_pfn);
> struct zone *zone = page_zone(page);
>
> here. No need to lookup the zone again in the loop because, as you
> document "must ... belong to a single zone.".
>
> Then, there is also no need to initialize "pfn" here. In the loop header
> is sufficient.
>

Sure.

>> +
>> +	VM_BUG_ON(ALIGN_DOWN(start_pfn, pageblock_nr_pages) !=
>> +		  ALIGN_DOWN(end_pfn - 1, pageblock_nr_pages));
>>
>>  	if (is_migrate_cma_page(page)) {
>>  		/*
>> @@ -47,8 +51,11 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>  		return page;
>>  	}
>>
>> -	for (; iter < pageblock_nr_pages - offset; iter++) {
>> -		page = pfn_to_page(pfn + iter);
>> +	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>> +		struct zone *zone;
>> +
>> +		page = pfn_to_page(pfn);
>> +		zone = page_zone(page);
>>
>>  		/*
>>  		 * Both, bootmem allocations and memory holes are marked
>> @@ -85,7 +92,7 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>  			}
>>
>>  			skip_pages = compound_nr(head) - (page - head);
>> -			iter += skip_pages - 1;
>> +			pfn += skip_pages - 1;
>>  			continue;
>>  		}
>>
>> @@ -97,7 +104,7 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>  		 */
>>  		if (!page_ref_count(page)) {
>>  			if (PageBuddy(page))
>> -				iter += (1 << buddy_order(page)) - 1;
>> +				pfn += (1 << buddy_order(page)) - 1;
>>  			continue;
>>  		}
>>
>> @@ -134,11 +141,18 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>  	return NULL;
>>  }
>>
>> -static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
>> +/*
>> + * This function set pageblock migratetype to isolate if no unmovable page is
>> + * present in [start_pfn, end_pfn). The pageblock must intersect with
>> + * [start_pfn, end_pfn).
>> + */
>> +static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags,
>> +			unsigned long start_pfn, unsigned long end_pfn)
>
> I think we might be able do better, eventually not passing start_pfn at
> all. Hmm.

IMHO, having start_pfn and end_pfn in the parameter list would make the
interface easier to understand. Otherwise, if we remove start_pfn, the
caller needs to adjust @page to be within the range [start_pfn, end_pfn).

>
> I think we want to pull out the
> start_isolate_page_range()/undo_isolate_page_range() interface change
> into a separate patch.

You mean a patch just adding

unsigned long isolate_start = pfn_max_align_down(start_pfn);
unsigned long isolate_end = pfn_max_align_up(end_pfn);

in start_isolate_page_range()/undo_isolate_page_range()?

Yes I can do that.
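
For reference, a tiny self-contained illustration of what that alignment
does, assuming MAX_ORDER_NR_PAGES == 1024 (a common x86-64 default); the
numbers are only an example:

#include <stdio.h>

#define MAX_ORDER_NR_PAGES	1024UL
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))
#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long start_pfn = 1000, end_pfn = 5000;

	/* requested range vs. the range that actually gets isolated */
	printf("request [%lu, %lu)\n", start_pfn, end_pfn);
	printf("isolate [%lu, %lu)\n",
	       ALIGN_DOWN(start_pfn, MAX_ORDER_NR_PAGES),
	       ALIGN(end_pfn, MAX_ORDER_NR_PAGES));
	/* prints: request [1000, 5000), isolate [0, 5120) */
	return 0;
}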

>
> Let me try to give it a shot, I'll try hacking something up real quick
> to see if we can do better.

Sure. Thanks.

--
Best Regards,
Yan, Zi
David Hildenbrand April 12, 2022, 2:49 p.m. UTC | #3
On 12.04.22 16:07, Zi Yan wrote:
> On 12 Apr 2022, at 9:10, David Hildenbrand wrote:
> 
>> On 06.04.22 17:18, Zi Yan wrote:
>>> From: Zi Yan <ziy@nvidia.com>
>>>
>>> Enable set_migratetype_isolate() to check specified sub-range for
>>> unmovable pages during isolation. Page isolation is done
>>> at MAX_ORDER_NR_PAEGS granularity, but not all pages within that
>>> granularity are intended to be isolated. For example,
>>> alloc_contig_range(), which uses page isolation, allows ranges without
>>> alignment. This commit makes unmovable page check only look for
>>> interesting pages, so that page isolation can succeed for any
>>> non-overlapping ranges.
>>>
>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>> ---
>>
>> [...]
>>
>>>  /*
>>> - * This function checks whether pageblock includes unmovable pages or not.
>>> + * This function checks whether the range [start_pfn, end_pfn) includes
>>> + * unmovable pages or not. The range must fall into a single pageblock and
>>> + * consequently belong to a single zone.
>>>   *
>>>   * PageLRU check without isolation or lru_lock could race so that
>>>   * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
>>> @@ -28,12 +30,14 @@
>>>   * cannot get removed (e.g., via memory unplug) concurrently.
>>>   *
>>>   */
>>> -static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>> -				 int migratetype, int flags)
>>> +static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
>>> +				int migratetype, int flags)
>>>  {
>>> -	unsigned long iter = 0;
>>> -	unsigned long pfn = page_to_pfn(page);
>>> -	unsigned long offset = pfn % pageblock_nr_pages;
>>> +	unsigned long pfn = start_pfn;
>>> +	struct page *page = pfn_to_page(pfn);
>>
>>
>> Just do
>>
>> struct page *page = pfn_to_page(start_pfn);
>> struct zone *zone = page_zone(page);
>>
>> here. No need to lookup the zone again in the loop because, as you
>> document "must ... belong to a single zone.".
>>
>> Then, there is also no need to initialize "pfn" here. In the loop header
>> is sufficient.
>>
> 
> Sure.
> 
>>> +
>>> +	VM_BUG_ON(ALIGN_DOWN(start_pfn, pageblock_nr_pages) !=
>>> +		  ALIGN_DOWN(end_pfn - 1, pageblock_nr_pages));
>>>
>>>  	if (is_migrate_cma_page(page)) {
>>>  		/*
>>> @@ -47,8 +51,11 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>  		return page;
>>>  	}
>>>
>>> -	for (; iter < pageblock_nr_pages - offset; iter++) {
>>> -		page = pfn_to_page(pfn + iter);
>>> +	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>>> +		struct zone *zone;
>>> +
>>> +		page = pfn_to_page(pfn);
>>> +		zone = page_zone(page);
>>>
>>>  		/*
>>>  		 * Both, bootmem allocations and memory holes are marked
>>> @@ -85,7 +92,7 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>  			}
>>>
>>>  			skip_pages = compound_nr(head) - (page - head);
>>> -			iter += skip_pages - 1;
>>> +			pfn += skip_pages - 1;
>>>  			continue;
>>>  		}
>>>
>>> @@ -97,7 +104,7 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>  		 */
>>>  		if (!page_ref_count(page)) {
>>>  			if (PageBuddy(page))
>>> -				iter += (1 << buddy_order(page)) - 1;
>>> +				pfn += (1 << buddy_order(page)) - 1;
>>>  			continue;
>>>  		}
>>>
>>> @@ -134,11 +141,18 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>  	return NULL;
>>>  }
>>>
>>> -static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
>>> +/*
>>> + * This function set pageblock migratetype to isolate if no unmovable page is
>>> + * present in [start_pfn, end_pfn). The pageblock must intersect with
>>> + * [start_pfn, end_pfn).
>>> + */
>>> +static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags,
>>> +			unsigned long start_pfn, unsigned long end_pfn)
>>
>> I think we might be able do better, eventually not passing start_pfn at
>> all. Hmm.
> 
> IMHO, having start_pfn and end_pfn in the parameter list would make the
> interface easier to understand. Otherwise if we remove start_pfn,
> the caller needs to adjust @page to be within the range of [start_pfn,
> end_pfn)
> 
>>
>> I think we want to pull out the
>> start_isolate_page_range()/undo_isolate_page_range() interface change
>> into a separate patch.
> 
> You mean a patch just adding
> 
> unsigned long isolate_start = pfn_max_align_down(start_pfn);
> unsigned long isolate_end = pfn_max_align_up(end_pfn);
> 
> in start_isolate_page_range()/undo_isolate_page_range()?
> 
> Yes I can do that.

I think we have to be careful with memory onlining/offlining. There are
corner cases where we get called with only pageblock alignment and
must not adjust the range.


Something like this as a base for the next cleanups/extensions:


From 18d29b53600d6d0d6ac87cdc6b7517e989fa3dac Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 12 Apr 2022 15:51:50 +0200
Subject: [PATCH] mm: page-isolation: Move alignment logic into
 start_isolate_page_range()/undo_isolate_page_range()

For ordinary range allocations, we actually have to isolate all pageblocks
in a MAX_ORDER - 1 range. Only memory onlining/offlining is special: it
knows exactly which pageblocks to isolate/unisolate and we must not mess
with the pageblocks to isolate (memory onlining/offlining always passes
MAX_ORDER - 1 - aligned ranges, unless we're dealing with vmemmap
located on hotplugged memory, whereby the start pfn might only be
pageblock aligned).

Further, for ordinary allocations, we'll want to know the exact range
we want to allocate -- to check only that range for unmovable pages.
Right now we lose that information.

So let's move the alignment logic into start_isolate_page_range() /
undo_isolate_page_range(), such that we have the actual range of
interest available and the alignment logic contained in there.

Provide start_isolate_pageblocks()/undo_isolate_pageblocks() for memory
onlining/offlining.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 include/linux/page-isolation.h | 23 ++++-----
 mm/memory_hotplug.c            |  8 ++--
 mm/page_alloc.c                | 15 +-----
 mm/page_isolation.c            | 85 ++++++++++++++++++++++++++--------
 4 files changed, 81 insertions(+), 50 deletions(-)

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index e14eddf6741a..8e9e9e80ba67 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -37,20 +37,15 @@ void set_pageblock_migratetype(struct page *page, int migratetype);
 int move_freepages_block(struct zone *zone, struct page *page,
 				int migratetype, int *num_movable);
 
-/*
- * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
- */
-int
-start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			 unsigned migratetype, int flags);
-
-/*
- * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
- * target range is [start_pfn, end_pfn)
- */
-void
-undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			unsigned migratetype);
+int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
+			     unsigned migratetype, int flags);
+int start_isolate_pageblocks(unsigned long start_pfn, unsigned long end_pfn,
+			     unsigned migratetype, int flags);
+
+void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
+			     unsigned migratetype);
+void undo_isolate_pageblocks(unsigned long start_pfn, unsigned long end_pfn,
+			     unsigned migratetype);
 
 /*
  * Test all pages in [start_pfn, end_pfn) are isolated or not.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 416b38ca8def..fb7f63c800d1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1089,7 +1089,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 
 	/*
 	 * Fixup the number of isolated pageblocks before marking the sections
-	 * onlining, such that undo_isolate_page_range() works correctly.
+	 * onlining, such that undo_isolate_pageblocks() works correctly.
 	 */
 	spin_lock_irqsave(&zone->lock, flags);
 	zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
@@ -1113,7 +1113,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 		build_all_zonelists(NULL);
 
 	/* Basic onlining is complete, allow allocation of onlined pages. */
-	undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
+	undo_isolate_pageblocks(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
 
 	/*
 	 * Freshly onlined pages aren't shuffled (e.g., all pages are placed to
@@ -1834,7 +1834,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	lru_cache_disable();
 
 	/* set above range as isolated */
-	ret = start_isolate_page_range(start_pfn, end_pfn,
+	ret = start_isolate_pageblocks(start_pfn, end_pfn,
 				       MIGRATE_MOVABLE,
 				       MEMORY_OFFLINE | REPORT_FAILURE);
 	if (ret) {
@@ -1937,7 +1937,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 
 failed_removal_isolated:
 	/* pushback to free area */
-	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+	undo_isolate_pageblocks(start_pfn, end_pfn, MIGRATE_MOVABLE);
 	memory_notify(MEM_CANCEL_OFFLINE, &arg);
 failed_removal_pcplists_disabled:
 	lru_cache_enable();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index badc05fc1b2f..76f8c19e37a2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8950,15 +8950,6 @@ void *__init alloc_large_system_hash(const char *tablename,
 }
 
 #ifdef CONFIG_CONTIG_ALLOC
-static unsigned long pfn_max_align_down(unsigned long pfn)
-{
-	return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
-}
-
-static unsigned long pfn_max_align_up(unsigned long pfn)
-{
-	return ALIGN(pfn, MAX_ORDER_NR_PAGES);
-}
 
 #if defined(CONFIG_DYNAMIC_DEBUG) || \
 	(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
@@ -9104,8 +9095,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	 * put back to page allocator so that buddy can use them.
 	 */
 
-	ret = start_isolate_page_range(pfn_max_align_down(start),
-				       pfn_max_align_up(end), migratetype, 0);
+	ret = start_isolate_page_range(start, end, migratetype, 0);
 	if (ret)
 		return ret;
 
@@ -9186,8 +9176,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 		free_contig_range(end, outer_end - end);
 
 done:
-	undo_isolate_page_range(pfn_max_align_down(start),
-				pfn_max_align_up(end), migratetype);
+	undo_isolate_page_range(start, end, migratetype);
 	return ret;
 }
 EXPORT_SYMBOL(alloc_contig_range);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index b34f1310aeaa..7c1f7724c5bb 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -15,6 +15,16 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/page_isolation.h>
 
+static unsigned long pfn_max_align_down(unsigned long pfn)
+{
+	return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
+}
+
+static unsigned long pfn_max_align_up(unsigned long pfn)
+{
+	return ALIGN(pfn, MAX_ORDER_NR_PAGES);
+}
+
 /*
  * This function checks whether pageblock includes unmovable pages or not.
  *
@@ -262,12 +272,40 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
 	return NULL;
 }
 
+/*
+ * Make page-allocation-type of pageblocks to be MIGRATE_ISOLATE.
+ *
+ * Most users should actually use start_isolate_page_range(). Only memory
+ * onlining/offlining that knows exactly what it's doing in regard to
+ * isolating only some pageblocks of MAX_ORDER - 1 pages (for the vmemmap)
+ * should use this interface.
+ */
+int start_isolate_pageblocks(unsigned long start_pfn, unsigned long end_pfn,
+			     unsigned migratetype, int flags)
+{
+	unsigned long pfn;
+	struct page *page;
+
+	BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
+	BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
+
+	for (pfn = start_pfn;
+	     pfn < end_pfn;
+	     pfn += pageblock_nr_pages) {
+		page = __first_valid_page(pfn, pageblock_nr_pages);
+		if (page && set_migratetype_isolate(page, migratetype, flags)) {
+			undo_isolate_pageblocks(start_pfn, pfn, migratetype);
+			return -EBUSY;
+		}
+	}
+	return 0;
+}
+
 /**
  * start_isolate_page_range() - make page-allocation-type of range of pages to
  * be MIGRATE_ISOLATE.
  * @start_pfn:		The lower PFN of the range to be isolated.
  * @end_pfn:		The upper PFN of the range to be isolated.
- *			start_pfn/end_pfn must be aligned to pageblock_order.
  * @migratetype:	Migrate type to set in error recovery.
  * @flags:		The following flags are allowed (they can be combined in
  *			a bit mask)
@@ -306,29 +344,23 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
 int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			     unsigned migratetype, int flags)
 {
-	unsigned long pfn;
-	struct page *page;
-
-	BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
-	BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
+	start_pfn = pfn_max_align_down(start_pfn);
+	end_pfn = pfn_max_align_up(end_pfn);
 
-	for (pfn = start_pfn;
-	     pfn < end_pfn;
-	     pfn += pageblock_nr_pages) {
-		page = __first_valid_page(pfn, pageblock_nr_pages);
-		if (page && set_migratetype_isolate(page, migratetype, flags)) {
-			undo_isolate_page_range(start_pfn, pfn, migratetype);
-			return -EBUSY;
-		}
-	}
-	return 0;
+	return start_isolate_pageblocks(start_pfn, end_pfn, migratetype, flags);
 }
 
 /*
- * Make isolated pages available again.
+ * Make isolated pageblocks, isolated via start_isolate_pageblocks, available
+ * again.
+ *
+ * Most users should actually use undo_isolate_page_range(). Only memory
+ * onlining/offlining that knows exactly what it's doing in regard to
+ * isolating only some pageblocks of MAX_ORDER - 1 pages (for the vmemmap)
+ * should use this interface.
  */
-void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			    unsigned migratetype)
+void undo_isolate_pageblocks(unsigned long start_pfn, unsigned long end_pfn,
+			     unsigned migratetype)
 {
 	unsigned long pfn;
 	struct page *page;
@@ -345,6 +377,21 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 		unset_migratetype_isolate(page, migratetype);
 	}
 }
+
+/*
+ * Make isolated pageblocks, isolated via start_isolate_page_range(), available
+ * again. The pageblock isolation range will be extended just like for
+ * start_isolate_page_range().
+ */
+void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
+			     unsigned migratetype)
+{
+	start_pfn = pfn_max_align_down(start_pfn);
+	end_pfn = pfn_max_align_up(end_pfn);
+
+	return undo_isolate_pageblocks(start_pfn, end_pfn, migratetype);
+}
+
 /*
  * Test all pages in the range is free(means isolated) or not.
  * all pages in [start_pfn...end_pfn) must be in the same zone.
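
For the callers, the split then looks roughly like this (a sketch distilled
from the hunks above, not additional code):

/* range allocators (alloc_contig_range()): pass the exact range of
 * interest; alignment to MAX_ORDER_NR_PAGES happens internally */
ret = start_isolate_page_range(start, end, migratetype, 0);
...
undo_isolate_page_range(start, end, migratetype);

/* memory onlining/offlining: ranges are already pageblock-aligned
 * and must not be widened */
ret = start_isolate_pageblocks(start_pfn, end_pfn, MIGRATE_MOVABLE,
			       MEMORY_OFFLINE | REPORT_FAILURE);
...
undo_isolate_pageblocks(start_pfn, end_pfn, MIGRATE_MOVABLE);
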
Zi Yan April 12, 2022, 3:01 p.m. UTC | #4
On 12 Apr 2022, at 10:49, David Hildenbrand wrote:

> On 12.04.22 16:07, Zi Yan wrote:
>> On 12 Apr 2022, at 9:10, David Hildenbrand wrote:
>>
>>> On 06.04.22 17:18, Zi Yan wrote:
>>>> From: Zi Yan <ziy@nvidia.com>
>>>>
>>>> Enable set_migratetype_isolate() to check specified sub-range for
>>>> unmovable pages during isolation. Page isolation is done
>>>> at MAX_ORDER_NR_PAEGS granularity, but not all pages within that
>>>> granularity are intended to be isolated. For example,
>>>> alloc_contig_range(), which uses page isolation, allows ranges without
>>>> alignment. This commit makes unmovable page check only look for
>>>> interesting pages, so that page isolation can succeed for any
>>>> non-overlapping ranges.
>>>>
>>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>>> ---
>>>
>>> [...]
>>>
>>>>  /*
>>>> - * This function checks whether pageblock includes unmovable pages or not.
>>>> + * This function checks whether the range [start_pfn, end_pfn) includes
>>>> + * unmovable pages or not. The range must fall into a single pageblock and
>>>> + * consequently belong to a single zone.
>>>>   *
>>>>   * PageLRU check without isolation or lru_lock could race so that
>>>>   * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
>>>> @@ -28,12 +30,14 @@
>>>>   * cannot get removed (e.g., via memory unplug) concurrently.
>>>>   *
>>>>   */
>>>> -static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>> -				 int migratetype, int flags)
>>>> +static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
>>>> +				int migratetype, int flags)
>>>>  {
>>>> -	unsigned long iter = 0;
>>>> -	unsigned long pfn = page_to_pfn(page);
>>>> -	unsigned long offset = pfn % pageblock_nr_pages;
>>>> +	unsigned long pfn = start_pfn;
>>>> +	struct page *page = pfn_to_page(pfn);
>>>
>>>
>>> Just do
>>>
>>> struct page *page = pfn_to_page(start_pfn);
>>> struct zone *zone = page_zone(page);
>>>
>>> here. No need to lookup the zone again in the loop because, as you
>>> document "must ... belong to a single zone.".
>>>
>>> Then, there is also no need to initialize "pfn" here. In the loop header
>>> is sufficient.
>>>
>>
>> Sure.
>>
>>>> +
>>>> +	VM_BUG_ON(ALIGN_DOWN(start_pfn, pageblock_nr_pages) !=
>>>> +		  ALIGN_DOWN(end_pfn - 1, pageblock_nr_pages));
>>>>
>>>>  	if (is_migrate_cma_page(page)) {
>>>>  		/*
>>>> @@ -47,8 +51,11 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>  		return page;
>>>>  	}
>>>>
>>>> -	for (; iter < pageblock_nr_pages - offset; iter++) {
>>>> -		page = pfn_to_page(pfn + iter);
>>>> +	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>>>> +		struct zone *zone;
>>>> +
>>>> +		page = pfn_to_page(pfn);
>>>> +		zone = page_zone(page);
>>>>
>>>>  		/*
>>>>  		 * Both, bootmem allocations and memory holes are marked
>>>> @@ -85,7 +92,7 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>  			}
>>>>
>>>>  			skip_pages = compound_nr(head) - (page - head);
>>>> -			iter += skip_pages - 1;
>>>> +			pfn += skip_pages - 1;
>>>>  			continue;
>>>>  		}
>>>>
>>>> @@ -97,7 +104,7 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>  		 */
>>>>  		if (!page_ref_count(page)) {
>>>>  			if (PageBuddy(page))
>>>> -				iter += (1 << buddy_order(page)) - 1;
>>>> +				pfn += (1 << buddy_order(page)) - 1;
>>>>  			continue;
>>>>  		}
>>>>
>>>> @@ -134,11 +141,18 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>  	return NULL;
>>>>  }
>>>>
>>>> -static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
>>>> +/*
>>>> + * This function set pageblock migratetype to isolate if no unmovable page is
>>>> + * present in [start_pfn, end_pfn). The pageblock must intersect with
>>>> + * [start_pfn, end_pfn).
>>>> + */
>>>> +static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags,
>>>> +			unsigned long start_pfn, unsigned long end_pfn)
>>>
>>> I think we might be able do better, eventually not passing start_pfn at
>>> all. Hmm.
>>
>> IMHO, having start_pfn and end_pfn in the parameter list would make the
>> interface easier to understand. Otherwise if we remove start_pfn,
>> the caller needs to adjust @page to be within the range of [start_pfn,
>> end_pfn)
>>
>>>
>>> I think we want to pull out the
>>> start_isolate_page_range()/undo_isolate_page_range() interface change
>>> into a separate patch.
>>
>> You mean a patch just adding
>>
>> unsigned long isolate_start = pfn_max_align_down(start_pfn);
>> unsigned long isolate_end = pfn_max_align_up(end_pfn);
>>
>> in start_isolate_page_range()/undo_isolate_page_range()?
>>
>> Yes I can do that.
>
> I think we have to be careful with memory onlining/offlining. There are
> corner cases where we get called with only pageblock alignment and
> must not adjust the range.

In the patch below, you added a new pair of start_isolate_pageblocks()
and undo_isolate_pageblocks(), where start_isolate_pageblocks() still
calls set_migratetype_isolate(), and you noted that their range should not
be adjusted. But in my patch, set_migratetype_isolate() adjusts the range
for the has_unmovable_pages() check. Do you mean start_isolate_pageblocks()
should call a different version of set_migratetype_isolate() without range
adjustment? That could be done with an additional parameter in
start_isolate_page_range(), like bool strict, right?


>
>
> Something like this as a base for the next cleanups/extensions:
>
>
> From 18d29b53600d6d0d6ac87cdc6b7517e989fa3dac Mon Sep 17 00:00:00 2001
> From: David Hildenbrand <david@redhat.com>
> Date: Tue, 12 Apr 2022 15:51:50 +0200
> Subject: [PATCH] mm: page-isolation: Move alignment logic into
>  start_isolate_page_range()/undo_isolate_page_range()
>
> For ordinary range allocations, we actually have to isolate all pageblocks
> in a MAX_ORDER - 1 range. Only memory onlining/offlining is special: it
> knows exactly which pageblocks to isolate/unisolate and we must not mess
> with the pageblocks to isolate (memory onlining/offlining alwayes passed
> MAX_ORDER - 1 - aligned ranges, unless we're dealing with vmemmap
> located on hotplugged memory, whereby the start pfn might only be
> pageblock aligned).
>
> Further, for ordinary allcoations, we'll want to know the exact range
> we want to allocate -- to check only that range for unmovable pages.
> Right now we lose that information.
>
> So let's move the alignment logic into start_isolate_page_range() /
> undo_isolate_page_range(), such that we have the actual range of
> interest available and the alignment logic contained in there.
>
> Provide start_isolate_pageblocks()/undo_isolate_pageblocks() for memory
> onlining/offlining.
>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  include/linux/page-isolation.h | 23 ++++-----
>  mm/memory_hotplug.c            |  8 ++--
>  mm/page_alloc.c                | 15 +-----
>  mm/page_isolation.c            | 85 ++++++++++++++++++++++++++--------
>  4 files changed, 81 insertions(+), 50 deletions(-)
>
> diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
> index e14eddf6741a..8e9e9e80ba67 100644
> --- a/include/linux/page-isolation.h
> +++ b/include/linux/page-isolation.h
> @@ -37,20 +37,15 @@ void set_pageblock_migratetype(struct page *page, int migratetype);
>  int move_freepages_block(struct zone *zone, struct page *page,
>  				int migratetype, int *num_movable);
>
> -/*
> - * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
> - */
> -int
> -start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> -			 unsigned migratetype, int flags);
> -
> -/*
> - * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
> - * target range is [start_pfn, end_pfn)
> - */
> -void
> -undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> -			unsigned migratetype);
> +int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> +			     unsigned migratetype, int flags);
> +int start_isolate_pageblocks(unsigned long start_pfn, unsigned long end_pfn,
> +			     unsigned migratetype, int flags);
> +
> +void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> +			     unsigned migratetype);
> +void undo_isolate_pageblocks(unsigned long start_pfn, unsigned long end_pfn,
> +			     unsigned migratetype);
>
>  /*
>   * Test all pages in [start_pfn, end_pfn) are isolated or not.
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 416b38ca8def..fb7f63c800d1 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1089,7 +1089,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
>
>  	/*
>  	 * Fixup the number of isolated pageblocks before marking the sections
> -	 * onlining, such that undo_isolate_page_range() works correctly.
> +	 * onlining, such that undo_isolate_pageblocks() works correctly.
>  	 */
>  	spin_lock_irqsave(&zone->lock, flags);
>  	zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
> @@ -1113,7 +1113,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
>  		build_all_zonelists(NULL);
>
>  	/* Basic onlining is complete, allow allocation of onlined pages. */
> -	undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
> +	undo_isolate_pageblocks(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
>
>  	/*
>  	 * Freshly onlined pages aren't shuffled (e.g., all pages are placed to
> @@ -1834,7 +1834,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>  	lru_cache_disable();
>
>  	/* set above range as isolated */
> -	ret = start_isolate_page_range(start_pfn, end_pfn,
> +	ret = start_isolate_pageblocks(start_pfn, end_pfn,
>  				       MIGRATE_MOVABLE,
>  				       MEMORY_OFFLINE | REPORT_FAILURE);
>  	if (ret) {
> @@ -1937,7 +1937,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>
>  failed_removal_isolated:
>  	/* pushback to free area */
> -	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
> +	undo_isolate_pageblocks(start_pfn, end_pfn, MIGRATE_MOVABLE);
>  	memory_notify(MEM_CANCEL_OFFLINE, &arg);
>  failed_removal_pcplists_disabled:
>  	lru_cache_enable();
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index badc05fc1b2f..76f8c19e37a2 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -8950,15 +8950,6 @@ void *__init alloc_large_system_hash(const char *tablename,
>  }
>
>  #ifdef CONFIG_CONTIG_ALLOC
> -static unsigned long pfn_max_align_down(unsigned long pfn)
> -{
> -	return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
> -}
> -
> -static unsigned long pfn_max_align_up(unsigned long pfn)
> -{
> -	return ALIGN(pfn, MAX_ORDER_NR_PAGES);
> -}
>
>  #if defined(CONFIG_DYNAMIC_DEBUG) || \
>  	(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
> @@ -9104,8 +9095,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>  	 * put back to page allocator so that buddy can use them.
>  	 */
>
> -	ret = start_isolate_page_range(pfn_max_align_down(start),
> -				       pfn_max_align_up(end), migratetype, 0);
> +	ret = start_isolate_page_range(start, end, migratetype, 0);
>  	if (ret)
>  		return ret;
>
> @@ -9186,8 +9176,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>  		free_contig_range(end, outer_end - end);
>
>  done:
> -	undo_isolate_page_range(pfn_max_align_down(start),
> -				pfn_max_align_up(end), migratetype);
> +	undo_isolate_page_range(start, end, migratetype);
>  	return ret;
>  }
>  EXPORT_SYMBOL(alloc_contig_range);
> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
> index b34f1310aeaa..7c1f7724c5bb 100644
> --- a/mm/page_isolation.c
> +++ b/mm/page_isolation.c
> @@ -15,6 +15,16 @@
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/page_isolation.h>
>
> +static unsigned long pfn_max_align_down(unsigned long pfn)
> +{
> +	return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
> +}
> +
> +static unsigned long pfn_max_align_up(unsigned long pfn)
> +{
> +	return ALIGN(pfn, MAX_ORDER_NR_PAGES);
> +}
> +
>  /*
>   * This function checks whether pageblock includes unmovable pages or not.
>   *
> @@ -262,12 +272,40 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>  	return NULL;
>  }
>
> +/*
> + * Make page-allocation-type of pageblocks to be MIGRATE_ISOLATE.
> + *
> + * Most users should actually use start_isolate_page_range(). Only memory
> + * onlining/offlining that knows exactly what it's doing in regard to
> + * isolating only some pageblocks of MAX_ORDER - 1 pages (for the vmemmap)
> + * should use this interface.
> + */
> +int start_isolate_pageblocks(unsigned long start_pfn, unsigned long end_pfn,
> +			     unsigned migratetype, int flags)
> +{
> +	unsigned long pfn;
> +	struct page *page;
> +
> +	BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
> +	BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
> +
> +	for (pfn = start_pfn;
> +	     pfn < end_pfn;
> +	     pfn += pageblock_nr_pages) {
> +		page = __first_valid_page(pfn, pageblock_nr_pages);
> +		if (page && set_migratetype_isolate(page, migratetype, flags)) {
> +			undo_isolate_pageblocks(start_pfn, pfn, migratetype);
> +			return -EBUSY;
> +		}
> +	}
> +	return 0;
> +}
> +
>  /**
>   * start_isolate_page_range() - make page-allocation-type of range of pages to
>   * be MIGRATE_ISOLATE.
>   * @start_pfn:		The lower PFN of the range to be isolated.
>   * @end_pfn:		The upper PFN of the range to be isolated.
> - *			start_pfn/end_pfn must be aligned to pageblock_order.
>   * @migratetype:	Migrate type to set in error recovery.
>   * @flags:		The following flags are allowed (they can be combined in
>   *			a bit mask)
> @@ -306,29 +344,23 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>  int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>  			     unsigned migratetype, int flags)
>  {
> -	unsigned long pfn;
> -	struct page *page;
> -
> -	BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
> -	BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
> +	start_pfn = = pfn_max_align_down(start_pfn);
> +	end_pfn = pfn_max_align_up(end_pfn);
>
> -	for (pfn = start_pfn;
> -	     pfn < end_pfn;
> -	     pfn += pageblock_nr_pages) {
> -		page = __first_valid_page(pfn, pageblock_nr_pages);
> -		if (page && set_migratetype_isolate(page, migratetype, flags)) {
> -			undo_isolate_page_range(start_pfn, pfn, migratetype);
> -			return -EBUSY;
> -		}
> -	}
> -	return 0;
> +	return start_isolate_pageblocks(start_pfn, end_pfn, migratetype, flags);
>  }
>
>  /*
> - * Make isolated pages available again.
> + * Make isolated pageblocks, isolated via start_isolate_pageblocks, available
> + * again.
> + *
> + * Most users should actually use undo_isolate_page_range(). Only memory
> + * onlining/offlining that knows exactly what it's doing in regard to
> + * isolating only some pageblocks of MAX_ORDER - 1 pages (for the vmemmap)
> + * should use this interface.
>   */
> -void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> -			    unsigned migratetype)
> +void undo_isolate_pageblocks(unsigned long start_pfn, unsigned long end_pfn,
> +			     unsigned migratetype)
>  {
>  	unsigned long pfn;
>  	struct page *page;
> @@ -345,6 +377,21 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>  		unset_migratetype_isolate(page, migratetype);
>  	}
>  }
> +
> +/*
> + * Make isolated pageblocks, isolated via start_isolate_page_range(), available
> + * again. The pageblock isolation range will be extended just like for
> + * start_isolate_page_range().
> + */
> +void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> +			     unsigned migratetype)
> +{
> +	start_pfn = = pfn_max_align_down(start_pfn);
> +	end_pfn = pfn_max_align_up(end_pfn);
> +
> +	return undo_isolate_pageblocks(start_pfn, end_pfn, migratetype);
> +}
> +
>  /*
>   * Test all pages in the range is free(means isolated) or not.
>   * all pages in [start_pfn...end_pfn) must be in the same zone.
> -- 
> 2.35.1
>
>
>
> -- 
> Thanks,
>
> David / dhildenb


--
Best Regards,
Yan, Zi
David Hildenbrand April 12, 2022, 3:06 p.m. UTC | #5
On 12.04.22 17:01, Zi Yan wrote:
> On 12 Apr 2022, at 10:49, David Hildenbrand wrote:
> 
>> On 12.04.22 16:07, Zi Yan wrote:
>>> On 12 Apr 2022, at 9:10, David Hildenbrand wrote:
>>>
>>>> On 06.04.22 17:18, Zi Yan wrote:
>>>>> From: Zi Yan <ziy@nvidia.com>
>>>>>
>>>>> Enable set_migratetype_isolate() to check specified sub-range for
>>>>> unmovable pages during isolation. Page isolation is done
>>>>> at MAX_ORDER_NR_PAEGS granularity, but not all pages within that
>>>>> granularity are intended to be isolated. For example,
>>>>> alloc_contig_range(), which uses page isolation, allows ranges without
>>>>> alignment. This commit makes unmovable page check only look for
>>>>> interesting pages, so that page isolation can succeed for any
>>>>> non-overlapping ranges.
>>>>>
>>>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>>>> ---
>>>>
>>>> [...]
>>>>
>>>>>  /*
>>>>> - * This function checks whether pageblock includes unmovable pages or not.
>>>>> + * This function checks whether the range [start_pfn, end_pfn) includes
>>>>> + * unmovable pages or not. The range must fall into a single pageblock and
>>>>> + * consequently belong to a single zone.
>>>>>   *
>>>>>   * PageLRU check without isolation or lru_lock could race so that
>>>>>   * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
>>>>> @@ -28,12 +30,14 @@
>>>>>   * cannot get removed (e.g., via memory unplug) concurrently.
>>>>>   *
>>>>>   */
>>>>> -static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>> -				 int migratetype, int flags)
>>>>> +static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
>>>>> +				int migratetype, int flags)
>>>>>  {
>>>>> -	unsigned long iter = 0;
>>>>> -	unsigned long pfn = page_to_pfn(page);
>>>>> -	unsigned long offset = pfn % pageblock_nr_pages;
>>>>> +	unsigned long pfn = start_pfn;
>>>>> +	struct page *page = pfn_to_page(pfn);
>>>>
>>>>
>>>> Just do
>>>>
>>>> struct page *page = pfn_to_page(start_pfn);
>>>> struct zone *zone = page_zone(page);
>>>>
>>>> here. No need to lookup the zone again in the loop because, as you
>>>> document "must ... belong to a single zone.".
>>>>
>>>> Then, there is also no need to initialize "pfn" here. In the loop header
>>>> is sufficient.
>>>>
>>>
>>> Sure.
>>>
>>>>> +
>>>>> +	VM_BUG_ON(ALIGN_DOWN(start_pfn, pageblock_nr_pages) !=
>>>>> +		  ALIGN_DOWN(end_pfn - 1, pageblock_nr_pages));
>>>>>
>>>>>  	if (is_migrate_cma_page(page)) {
>>>>>  		/*
>>>>> @@ -47,8 +51,11 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>>  		return page;
>>>>>  	}
>>>>>
>>>>> -	for (; iter < pageblock_nr_pages - offset; iter++) {
>>>>> -		page = pfn_to_page(pfn + iter);
>>>>> +	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>>>>> +		struct zone *zone;
>>>>> +
>>>>> +		page = pfn_to_page(pfn);
>>>>> +		zone = page_zone(page);
>>>>>
>>>>>  		/*
>>>>>  		 * Both, bootmem allocations and memory holes are marked
>>>>> @@ -85,7 +92,7 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>>  			}
>>>>>
>>>>>  			skip_pages = compound_nr(head) - (page - head);
>>>>> -			iter += skip_pages - 1;
>>>>> +			pfn += skip_pages - 1;
>>>>>  			continue;
>>>>>  		}
>>>>>
>>>>> @@ -97,7 +104,7 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>>  		 */
>>>>>  		if (!page_ref_count(page)) {
>>>>>  			if (PageBuddy(page))
>>>>> -				iter += (1 << buddy_order(page)) - 1;
>>>>> +				pfn += (1 << buddy_order(page)) - 1;
>>>>>  			continue;
>>>>>  		}
>>>>>
>>>>> @@ -134,11 +141,18 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>>  	return NULL;
>>>>>  }
>>>>>
>>>>> -static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
>>>>> +/*
>>>>> + * This function set pageblock migratetype to isolate if no unmovable page is
>>>>> + * present in [start_pfn, end_pfn). The pageblock must intersect with
>>>>> + * [start_pfn, end_pfn).
>>>>> + */
>>>>> +static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags,
>>>>> +			unsigned long start_pfn, unsigned long end_pfn)
>>>>
>>>> I think we might be able do better, eventually not passing start_pfn at
>>>> all. Hmm.
>>>
>>> IMHO, having start_pfn and end_pfn in the parameter list would make the
>>> interface easier to understand. Otherwise if we remove start_pfn,
>>> the caller needs to adjust @page to be within the range of [start_pfn,
>>> end_pfn)
>>>
>>>>
>>>> I think we want to pull out the
>>>> start_isolate_page_range()/undo_isolate_page_range() interface change
>>>> into a separate patch.
>>>
>>> You mean a patch just adding
>>>
>>> unsigned long isolate_start = pfn_max_align_down(start_pfn);
>>> unsigned long isolate_end = pfn_max_align_up(end_pfn);
>>>
>>> in start_isolate_page_range()/undo_isolate_page_range()?
>>>
>>> Yes I can do that.
>>
>> I think we have to be careful with memory onlining/offlining. There are
>> corner cases where we get called with only pageblock alignment and
>> must not adjust the range.
> 
> In the patch below, you added a new set of start_isolate_pageblocks()
> and undo_isolate_pageblocks(), where start_isolate_pageblocks() still
> calls set_migratetype_isolate() and noted their range should not be
> adjusted. But in my patch, set_migratetype_isolate() adjusts
> the range for has_unmovable_pages() check. Do you mean

Right, that's broken in your patch. Memory onlining/offlining behavior
changed recently when "vmemmap on memory" was added. The start range
might only be aligned to pageblocks but not to MAX_ORDER - 1 -- and we must
not adjust that range.

The core thing is that there are two types of users: memory offlining,
which knows what it's doing when it aligns to less than MAX_ORDER - 1,
and range allocators, which just pass in the range of interest.

> start_isolate_pageblocks() should call a different version of
> set_migratetype_isolate() without range adjustment? That can be done
> with an additional parameter in start_isolate_page_range(), like
> bool strict, right?

Random boolean flags are in general frowned upon ;)
Zi Yan April 12, 2022, 5:41 p.m. UTC | #6
On 12 Apr 2022, at 11:06, David Hildenbrand wrote:

> On 12.04.22 17:01, Zi Yan wrote:
>> On 12 Apr 2022, at 10:49, David Hildenbrand wrote:
>>
>>> On 12.04.22 16:07, Zi Yan wrote:
>>>> On 12 Apr 2022, at 9:10, David Hildenbrand wrote:
>>>>
>>>>> On 06.04.22 17:18, Zi Yan wrote:
>>>>>> From: Zi Yan <ziy@nvidia.com>
>>>>>>
>>>>>> Enable set_migratetype_isolate() to check specified sub-range for
>>>>>> unmovable pages during isolation. Page isolation is done
>>>>>> at MAX_ORDER_NR_PAEGS granularity, but not all pages within that
>>>>>> granularity are intended to be isolated. For example,
>>>>>> alloc_contig_range(), which uses page isolation, allows ranges without
>>>>>> alignment. This commit makes unmovable page check only look for
>>>>>> interesting pages, so that page isolation can succeed for any
>>>>>> non-overlapping ranges.
>>>>>>
>>>>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>>>>> ---
>>>>>
>>>>> [...]
>>>>>
>>>>>>  /*
>>>>>> - * This function checks whether pageblock includes unmovable pages or not.
>>>>>> + * This function checks whether the range [start_pfn, end_pfn) includes
>>>>>> + * unmovable pages or not. The range must fall into a single pageblock and
>>>>>> + * consequently belong to a single zone.
>>>>>>   *
>>>>>>   * PageLRU check without isolation or lru_lock could race so that
>>>>>>   * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
>>>>>> @@ -28,12 +30,14 @@
>>>>>>   * cannot get removed (e.g., via memory unplug) concurrently.
>>>>>>   *
>>>>>>   */
>>>>>> -static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>>> -				 int migratetype, int flags)
>>>>>> +static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
>>>>>> +				int migratetype, int flags)
>>>>>>  {
>>>>>> -	unsigned long iter = 0;
>>>>>> -	unsigned long pfn = page_to_pfn(page);
>>>>>> -	unsigned long offset = pfn % pageblock_nr_pages;
>>>>>> +	unsigned long pfn = start_pfn;
>>>>>> +	struct page *page = pfn_to_page(pfn);
>>>>>
>>>>>
>>>>> Just do
>>>>>
>>>>> struct page *page = pfn_to_page(start_pfn);
>>>>> struct zone *zone = page_zone(page);
>>>>>
>>>>> here. No need to lookup the zone again in the loop because, as you
>>>>> document "must ... belong to a single zone.".
>>>>>
>>>>> Then, there is also no need to initialize "pfn" here. In the loop header
>>>>> is sufficient.
>>>>>
>>>>
>>>> Sure.
>>>>
>>>>>> +
>>>>>> +	VM_BUG_ON(ALIGN_DOWN(start_pfn, pageblock_nr_pages) !=
>>>>>> +		  ALIGN_DOWN(end_pfn - 1, pageblock_nr_pages));
>>>>>>
>>>>>>  	if (is_migrate_cma_page(page)) {
>>>>>>  		/*
>>>>>> @@ -47,8 +51,11 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>>>  		return page;
>>>>>>  	}
>>>>>>
>>>>>> -	for (; iter < pageblock_nr_pages - offset; iter++) {
>>>>>> -		page = pfn_to_page(pfn + iter);
>>>>>> +	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>>>>>> +		struct zone *zone;
>>>>>> +
>>>>>> +		page = pfn_to_page(pfn);
>>>>>> +		zone = page_zone(page);
>>>>>>
>>>>>>  		/*
>>>>>>  		 * Both, bootmem allocations and memory holes are marked
>>>>>> @@ -85,7 +92,7 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>>>  			}
>>>>>>
>>>>>>  			skip_pages = compound_nr(head) - (page - head);
>>>>>> -			iter += skip_pages - 1;
>>>>>> +			pfn += skip_pages - 1;
>>>>>>  			continue;
>>>>>>  		}
>>>>>>
>>>>>> @@ -97,7 +104,7 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>>>  		 */
>>>>>>  		if (!page_ref_count(page)) {
>>>>>>  			if (PageBuddy(page))
>>>>>> -				iter += (1 << buddy_order(page)) - 1;
>>>>>> +				pfn += (1 << buddy_order(page)) - 1;
>>>>>>  			continue;
>>>>>>  		}
>>>>>>
>>>>>> @@ -134,11 +141,18 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>>>  	return NULL;
>>>>>>  }
>>>>>>
>>>>>> -static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
>>>>>> +/*
>>>>>> + * This function set pageblock migratetype to isolate if no unmovable page is
>>>>>> + * present in [start_pfn, end_pfn). The pageblock must intersect with
>>>>>> + * [start_pfn, end_pfn).
>>>>>> + */
>>>>>> +static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags,
>>>>>> +			unsigned long start_pfn, unsigned long end_pfn)
>>>>>
>>>>> I think we might be able do better, eventually not passing start_pfn at
>>>>> all. Hmm.
>>>>
>>>> IMHO, having start_pfn and end_pfn in the parameter list would make the
>>>> interface easier to understand. Otherwise if we remove start_pfn,
>>>> the caller needs to adjust @page to be within the range of [start_pfn,
>>>> end_pfn)
>>>>
>>>>>
>>>>> I think we want to pull out the
>>>>> start_isolate_page_range()/undo_isolate_page_range() interface change
>>>>> into a separate patch.
>>>>
>>>> You mean a patch just adding
>>>>
>>>> unsigned long isolate_start = pfn_max_align_down(start_pfn);
>>>> unsigned long isolate_end = pfn_max_align_up(end_pfn);
>>>>
>>>> in start_isolate_page_range()/undo_isolate_page_range()?
>>>>
>>>> Yes I can do that.
>>>
>>> I think we have to be careful with memory onlining/offlining. There are
>>> corner cases where we get called with only pageblock alignment and
>>> must not adjust the range.
>>
>> In the patch below, you added a new set of start_isolate_pageblocks()
>> and undo_isolate_pageblocks(), where start_isolate_pageblocks() still
>> calls set_migratetype_isolate() and noted their range should not be
>> adjusted. But in my patch, set_migratetype_isolate() adjusts
>> the range for has_unmovable_pages() check. Do you mean
>
> Right, that's broken in your patch. Memory onlining/offlining behavior
> changed recently when "vmemmap on memory" was added. The start range
> might only be aligned to pageblocks but not MAX_ORDER -1 -- and we must
> not align u..
>
> The core thing is that there are two types of users: memory offlining
> that knows what it's doing when it aligns to less then MAX_ORDER -1 ,
> and range allocators, that just pass in the range of interest.

Oh, you mean the pfn_max_align_down() and pfn_max_align_up() are wrong
for memory onlining/offlining callers. Got it. And in patch 3, this is
not a concern any more, since we move to pageblock_nr_pages alignment.
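
Concretely, the adjustment in start_isolate_page_range() /
undo_isolate_page_range() would go from MAX_ORDER_NR_PAGES granularity

	isolate_start = pfn_max_align_down(start_pfn);
	isolate_end = pfn_max_align_up(end_pfn);

to pageblock granularity in patch 3, roughly

	isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
	isolate_end = ALIGN(end_pfn, pageblock_nr_pages);

(a sketch of the direction only, not the actual patch 3 hunk).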

>
>> start_isolate_pageblocks() should call a different version of
>> set_migratetype_isolate() without range adjustment? That can be done
>> with an additional parameter in start_isolate_page_range(), like
>> bool strict, right?
>
> Random boolean flags are in general frowned upon ;)

I misunderstood the alignment issue. No need for the additional
parameter.

Thanks. Will take your patch and adjust this one based on yours.

I will wait for your reviews on patch 3 and onwards before sending
out a new version.

--
Best Regards,
Yan, Zi
Zi Yan April 12, 2022, 7:34 p.m. UTC | #7
On 12 Apr 2022, at 13:41, Zi Yan wrote:

> On 12 Apr 2022, at 11:06, David Hildenbrand wrote:
>
>> On 12.04.22 17:01, Zi Yan wrote:
>>> On 12 Apr 2022, at 10:49, David Hildenbrand wrote:
>>>
>>>> On 12.04.22 16:07, Zi Yan wrote:
>>>>> On 12 Apr 2022, at 9:10, David Hildenbrand wrote:
>>>>>
>>>>>> On 06.04.22 17:18, Zi Yan wrote:
>>>>>>> From: Zi Yan <ziy@nvidia.com>
>>>>>>>
>>>>>>> Enable set_migratetype_isolate() to check specified sub-range for
>>>>>>> unmovable pages during isolation. Page isolation is done
>>>>>>> at MAX_ORDER_NR_PAEGS granularity, but not all pages within that
>>>>>>> granularity are intended to be isolated. For example,
>>>>>>> alloc_contig_range(), which uses page isolation, allows ranges without
>>>>>>> alignment. This commit makes unmovable page check only look for
>>>>>>> interesting pages, so that page isolation can succeed for any
>>>>>>> non-overlapping ranges.
>>>>>>>
>>>>>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>>>>>> ---
>>>>>>
>>>>>> [...]
>>>>>>
>>>>>>>  /*
>>>>>>> - * This function checks whether pageblock includes unmovable pages or not.
>>>>>>> + * This function checks whether the range [start_pfn, end_pfn) includes
>>>>>>> + * unmovable pages or not. The range must fall into a single pageblock and
>>>>>>> + * consequently belong to a single zone.
>>>>>>>   *
>>>>>>>   * PageLRU check without isolation or lru_lock could race so that
>>>>>>>   * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
>>>>>>> @@ -28,12 +30,14 @@
>>>>>>>   * cannot get removed (e.g., via memory unplug) concurrently.
>>>>>>>   *
>>>>>>>   */
>>>>>>> -static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>>>> -				 int migratetype, int flags)
>>>>>>> +static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
>>>>>>> +				int migratetype, int flags)
>>>>>>>  {
>>>>>>> -	unsigned long iter = 0;
>>>>>>> -	unsigned long pfn = page_to_pfn(page);
>>>>>>> -	unsigned long offset = pfn % pageblock_nr_pages;
>>>>>>> +	unsigned long pfn = start_pfn;
>>>>>>> +	struct page *page = pfn_to_page(pfn);
>>>>>>
>>>>>>
>>>>>> Just do
>>>>>>
>>>>>> struct page *page = pfn_to_page(start_pfn);
>>>>>> struct zone *zone = page_zone(page);
>>>>>>
>>>>>> here. No need to lookup the zone again in the loop because, as you
>>>>>> document "must ... belong to a single zone.".
>>>>>>
>>>>>> Then, there is also no need to initialize "pfn" here. In the loop header
>>>>>> is sufficient.
>>>>>>
>>>>>
>>>>> Sure.
>>>>>
>>>>>>> +
>>>>>>> +	VM_BUG_ON(ALIGN_DOWN(start_pfn, pageblock_nr_pages) !=
>>>>>>> +		  ALIGN_DOWN(end_pfn - 1, pageblock_nr_pages));
>>>>>>>
>>>>>>>  	if (is_migrate_cma_page(page)) {
>>>>>>>  		/*
>>>>>>> @@ -47,8 +51,11 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>>>>  		return page;
>>>>>>>  	}
>>>>>>>
>>>>>>> -	for (; iter < pageblock_nr_pages - offset; iter++) {
>>>>>>> -		page = pfn_to_page(pfn + iter);
>>>>>>> +	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>>>>>>> +		struct zone *zone;
>>>>>>> +
>>>>>>> +		page = pfn_to_page(pfn);
>>>>>>> +		zone = page_zone(page);
>>>>>>>
>>>>>>>  		/*
>>>>>>>  		 * Both, bootmem allocations and memory holes are marked
>>>>>>> @@ -85,7 +92,7 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>>>>  			}
>>>>>>>
>>>>>>>  			skip_pages = compound_nr(head) - (page - head);
>>>>>>> -			iter += skip_pages - 1;
>>>>>>> +			pfn += skip_pages - 1;
>>>>>>>  			continue;
>>>>>>>  		}
>>>>>>>
>>>>>>> @@ -97,7 +104,7 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>>>>  		 */
>>>>>>>  		if (!page_ref_count(page)) {
>>>>>>>  			if (PageBuddy(page))
>>>>>>> -				iter += (1 << buddy_order(page)) - 1;
>>>>>>> +				pfn += (1 << buddy_order(page)) - 1;
>>>>>>>  			continue;
>>>>>>>  		}
>>>>>>>
>>>>>>> @@ -134,11 +141,18 @@ static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>>>>>  	return NULL;
>>>>>>>  }
>>>>>>>
>>>>>>> -static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
>>>>>>> +/*
>>>>>>> + * This function sets pageblock migratetype to isolate if no unmovable page is
>>>>>>> + * present in [start_pfn, end_pfn). The pageblock must intersect with
>>>>>>> + * [start_pfn, end_pfn).
>>>>>>> + */
>>>>>>> +static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags,
>>>>>>> +			unsigned long start_pfn, unsigned long end_pfn)
>>>>>>
>>>>>> I think we might be able do better, eventually not passing start_pfn at
>>>>>> all. Hmm.
>>>>>
>>>>> IMHO, having start_pfn and end_pfn in the parameter list would make the
>>>>> interface easier to understand. Otherwise if we remove start_pfn,
>>>>> the caller needs to adjust @page to be within the range of [start_pfn,
>>>>> end_pfn)
>>>>>
>>>>>>
>>>>>> I think we want to pull out the
>>>>>> start_isolate_page_range()/undo_isolate_page_range() interface change
>>>>>> into a separate patch.
>>>>>
>>>>> You mean a patch just adding
>>>>>
>>>>> unsigned long isolate_start = pfn_max_align_down(start_pfn);
>>>>> unsigned long isolate_end = pfn_max_align_up(end_pfn);
>>>>>
>>>>> in start_isolate_page_range()/undo_isolate_page_range()?
>>>>>
>>>>> Yes I can do that.
>>>>
>>>> I think we have to be careful with memory onlining/offlining. There are
>>>> corner cases where we get called with only pageblock alignment and
>>>> must not adjust the range.
>>>
>>> In the patch below, you added a new set of start_isolate_pageblocks()
>>> and undo_isolate_pageblocks(), where start_isolate_pageblocks() still
>>> calls set_migratetype_isolate() and noted their range should not be
>>> adjusted. But in my patch, set_migratetype_isolate() adjusts
>>> the range for has_unmovable_pages() check. Do you mean
>>
>> Right, that's broken in your patch. Memory onlining/offlining behavior
>> changed recently when "vmemmap on memory" was added. The start range
>> might only be aligned to pageblocks but not MAX_ORDER - 1 -- and we must
>> not align up.
>>
>> The core thing is that there are two types of users: memory offlining,
>> which knows what it's doing when it aligns to less than MAX_ORDER - 1,
>> and range allocators, which just pass in the range of interest.
>
> Oh, you mean the pfn_max_align_down() and pfn_max_align_up() are wrong
> for memory onlining/offlining callers. Got it. And in patch 3, this is
> not a concern any more, since we move to pageblock_nr_pages alignment.
>
>>
>>> start_isolate_pageblocks() should call a different version of
>>> set_migratetype_isolate() without range adjustment? That can be done
>>> with an additional parameter in start_isolate_page_range(), like
>>> bool strict, right?
>>
>> Random boolean flags are in general frowned upon ;)
>
> I misunderstood the alignment issue. No need for this additional
> parameter.
>
> Thanks. Will take your patch and adjust this one based on yours.
>
> I will wait for your reviews on patch 3 and onwards before sending
> out a new version.

As I am editing my patches 2 and 3 based on your patch, it becomes redundant
to have start_isolate_pageblocks() in patch 3, since start_isolate_page_range()
will only require pageblock-size alignment after patch 3; at that point,
start_isolate_pageblocks() and start_isolate_page_range() do the same thing.

Since moving the MAX_ORDER-1 alignment into start_isolate_page_range() is wrong
in patch 2, I think it is better to move it in patch 3, when
start_isolate_page_range() is ready for pageblock-size alignment. Patch 2
will then have no functional change, since the range does not change, and
has_unmovable_pages() will make it in with patch 3 eventually.
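
To make that concrete, here is a rough standalone sketch of the alignment
arithmetic (userspace C, not kernel code; 512 and 1024 are only the typical
x86-64 values of pageblock_nr_pages and MAX_ORDER_NR_PAGES). It shows why
rounding the isolation range to MAX_ORDER granularity is wrong for a
pageblock-aligned caller such as memory offlining, while pageblock rounding
leaves the requested range untouched:

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES	512UL	/* assumed: pageblock_order == 9 */
#define MAX_ORDER_NR_PAGES	1024UL	/* assumed: MAX_ORDER == 11 */

#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))
#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* A pageblock-aligned range, e.g. what memory offlining may pass in. */
	unsigned long start_pfn = 4608, end_pfn = 5120;

	/* MAX_ORDER rounding widens the range to [4096, 5120). */
	printf("MAX_ORDER rounding: [%lu, %lu)\n",
	       ALIGN_DOWN(start_pfn, MAX_ORDER_NR_PAGES),
	       ALIGN(end_pfn, MAX_ORDER_NR_PAGES));

	/* Pageblock rounding keeps it at [4608, 5120). */
	printf("pageblock rounding: [%lu, %lu)\n",
	       ALIGN_DOWN(start_pfn, PAGEBLOCK_NR_PAGES),
	       ALIGN(end_pfn, PAGEBLOCK_NR_PAGES));
	return 0;
}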

If you think moving the range alignment adjustment should be a separate
patch, I can make it patch 4, after patch 3, when all the related
functions are ready for the new pageblock-size alignment.
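
Independent of where that alignment change lands, the patch below already
clamps the unmovable-page scan to the caller's range. A standalone sketch of
that clamping (made-up pfn values, userspace reimplementations of the kernel
macros, not the actual kernel code):

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES	512UL	/* assumed pageblock size */

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
#define MAX(a, b)	((a) > (b) ? (a) : (b))
#define MIN(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned long start_pfn = 1100, end_pfn = 1400;	/* caller's range */
	unsigned long block_start = 1024;	/* first pfn of the pageblock */

	/* Intersection of [start_pfn, end_pfn) with the pageblock. */
	unsigned long check_start = MAX(block_start, start_pfn);
	unsigned long check_end = MIN(ALIGN(block_start + 1, PAGEBLOCK_NR_PAGES),
				      end_pfn);

	/* Scans [1100, 1400) instead of the whole pageblock [1024, 1536). */
	printf("check [%lu, %lu)\n", check_start, check_end);
	return 0;
}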

--
Best Regards,
Yan, Zi
diff mbox series

Patch

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dafef8099acd..cacb2a16145e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8873,16 +8873,6 @@  void *__init alloc_large_system_hash(const char *tablename,
 }
 
 #ifdef CONFIG_CONTIG_ALLOC
-static unsigned long pfn_max_align_down(unsigned long pfn)
-{
-	return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
-}
-
-static unsigned long pfn_max_align_up(unsigned long pfn)
-{
-	return ALIGN(pfn, MAX_ORDER_NR_PAGES);
-}
-
 #if defined(CONFIG_DYNAMIC_DEBUG) || \
 	(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
 /* Usage: See admin-guide/dynamic-debug-howto.rst */
@@ -9027,8 +9017,7 @@  int alloc_contig_range(unsigned long start, unsigned long end,
 	 * put back to page allocator so that buddy can use them.
 	 */
 
-	ret = start_isolate_page_range(pfn_max_align_down(start),
-				       pfn_max_align_up(end), migratetype, 0);
+	ret = start_isolate_page_range(start, end, migratetype, 0);
 	if (ret)
 		return ret;
 
@@ -9109,8 +9098,7 @@  int alloc_contig_range(unsigned long start, unsigned long end,
 		free_contig_range(end, outer_end - end);
 
 done:
-	undo_isolate_page_range(pfn_max_align_down(start),
-				pfn_max_align_up(end), migratetype);
+	undo_isolate_page_range(start, end, migratetype);
 	return ret;
 }
 EXPORT_SYMBOL(alloc_contig_range);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index df49f86a6ed1..b8f96ed49d8b 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -16,7 +16,9 @@ 
 #include <trace/events/page_isolation.h>
 
 /*
- * This function checks whether pageblock includes unmovable pages or not.
+ * This function checks whether the range [start_pfn, end_pfn) includes
+ * unmovable pages or not. The range must fall into a single pageblock and
+ * consequently belong to a single zone.
  *
  * PageLRU check without isolation or lru_lock could race so that
  * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
@@ -28,12 +30,14 @@ 
  * cannot get removed (e.g., via memory unplug) concurrently.
  *
  */
-static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
-				 int migratetype, int flags)
+static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
+				int migratetype, int flags)
 {
-	unsigned long iter = 0;
-	unsigned long pfn = page_to_pfn(page);
-	unsigned long offset = pfn % pageblock_nr_pages;
+	unsigned long pfn = start_pfn;
+	struct page *page = pfn_to_page(pfn);
+
+	VM_BUG_ON(ALIGN_DOWN(start_pfn, pageblock_nr_pages) !=
+		  ALIGN_DOWN(end_pfn - 1, pageblock_nr_pages));
 
 	if (is_migrate_cma_page(page)) {
 		/*
@@ -47,8 +51,11 @@  static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
 		return page;
 	}
 
-	for (; iter < pageblock_nr_pages - offset; iter++) {
-		page = pfn_to_page(pfn + iter);
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+		struct zone *zone;
+
+		page = pfn_to_page(pfn);
+		zone = page_zone(page);
 
 		/*
 		 * Both, bootmem allocations and memory holes are marked
@@ -85,7 +92,7 @@  static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
 			}
 
 			skip_pages = compound_nr(head) - (page - head);
-			iter += skip_pages - 1;
+			pfn += skip_pages - 1;
 			continue;
 		}
 
@@ -97,7 +104,7 @@  static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
 		 */
 		if (!page_ref_count(page)) {
 			if (PageBuddy(page))
-				iter += (1 << buddy_order(page)) - 1;
+				pfn += (1 << buddy_order(page)) - 1;
 			continue;
 		}
 
@@ -134,11 +141,18 @@  static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
 	return NULL;
 }
 
-static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
+/*
+ * This function sets pageblock migratetype to isolate if no unmovable page is
+ * present in [start_pfn, end_pfn). The pageblock must intersect with
+ * [start_pfn, end_pfn).
+ */
+static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags,
+			unsigned long start_pfn, unsigned long end_pfn)
 {
 	struct zone *zone = page_zone(page);
 	struct page *unmovable;
 	unsigned long flags;
+	unsigned long check_unmovable_start, check_unmovable_end;
 
 	spin_lock_irqsave(&zone->lock, flags);
 
@@ -155,8 +169,16 @@  static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 	/*
 	 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
 	 * We just check MOVABLE pages.
+	 *
+	 * Pass the intersection of [start_pfn, end_pfn) and the page's pageblock
+	 * to avoid redundant checks.
 	 */
-	unmovable = has_unmovable_pages(zone, page, migratetype, isol_flags);
+	check_unmovable_start = max(page_to_pfn(page), start_pfn);
+	check_unmovable_end = min(ALIGN(page_to_pfn(page) + 1, pageblock_nr_pages),
+				  end_pfn);
+
+	unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
+			migratetype, isol_flags);
 	if (!unmovable) {
 		unsigned long nr_pages;
 		int mt = get_pageblock_migratetype(page);
@@ -259,12 +281,21 @@  __first_valid_page(unsigned long pfn, unsigned long nr_pages)
 	return NULL;
 }
 
+static unsigned long pfn_max_align_down(unsigned long pfn)
+{
+	return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
+}
+
+static unsigned long pfn_max_align_up(unsigned long pfn)
+{
+	return ALIGN(pfn, MAX_ORDER_NR_PAGES);
+}
+
 /**
  * start_isolate_page_range() - make page-allocation-type of range of pages to
  * be MIGRATE_ISOLATE.
  * @start_pfn:		The lower PFN of the range to be isolated.
  * @end_pfn:		The upper PFN of the range to be isolated.
- *			start_pfn/end_pfn must be aligned to pageblock_order.
  * @migratetype:	Migrate type to set in error recovery.
  * @flags:		The following flags are allowed (they can be combined in
  *			a bit mask)
@@ -306,15 +337,16 @@  int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 	unsigned long pfn;
 	struct page *page;
 
-	BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
-	BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
+	unsigned long isolate_start = pfn_max_align_down(start_pfn);
+	unsigned long isolate_end = pfn_max_align_up(end_pfn);
 
-	for (pfn = start_pfn;
-	     pfn < end_pfn;
+	for (pfn = isolate_start;
+	     pfn < isolate_end;
 	     pfn += pageblock_nr_pages) {
 		page = __first_valid_page(pfn, pageblock_nr_pages);
-		if (page && set_migratetype_isolate(page, migratetype, flags)) {
-			undo_isolate_page_range(start_pfn, pfn, migratetype);
+		if (page && set_migratetype_isolate(page, migratetype, flags,
+					start_pfn, end_pfn)) {
+			undo_isolate_page_range(isolate_start, pfn, migratetype);
 			return -EBUSY;
 		}
 	}
@@ -329,12 +361,12 @@  void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 {
 	unsigned long pfn;
 	struct page *page;
+	unsigned long isolate_start = pfn_max_align_down(start_pfn);
+	unsigned long isolate_end = pfn_max_align_up(end_pfn);
 
-	BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
-	BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
 
-	for (pfn = start_pfn;
-	     pfn < end_pfn;
+	for (pfn = isolate_start;
+	     pfn < isolate_end;
 	     pfn += pageblock_nr_pages) {
 		page = __first_valid_page(pfn, pageblock_nr_pages);
 		if (!page || !is_migrate_isolate_page(page))