diff mbox series

[5/7] mm: move zone iterator outside of deferred_init_maxorder()

Message ID 20200430201125.532129-6-daniel.m.jordan@oracle.com (mailing list archive)
State RFC
Delegated to: Herbert Xu
Headers show
Series padata: parallelize deferred page init | expand

Commit Message

Daniel Jordan April 30, 2020, 8:11 p.m. UTC
padata will soon divide up pfn ranges between threads when parallelizing
deferred init, and deferred_init_maxorder() complicates that by using an
opaque index in addition to start and end pfns.  Move the index outside
the function to make splitting the job easier, and simplify the code
while at it.

deferred_init_maxorder() now always iterates within a single pfn range
instead of potentially multiple ranges, and advances start_pfn to the
end of that range instead of the max-order block so partial pfn ranges
in the block aren't skipped in a later iteration.  The section alignment
check in deferred_grow_zone() is removed as well since this alignment is
no longer guaranteed.  It's not clear what value the alignment provided
originally.

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
---
 mm/page_alloc.c | 88 +++++++++++++++----------------------------------
 1 file changed, 27 insertions(+), 61 deletions(-)

Comments

Alexander Duyck April 30, 2020, 9:43 p.m. UTC | #1
On 4/30/2020 1:11 PM, Daniel Jordan wrote:
> padata will soon divide up pfn ranges between threads when parallelizing
> deferred init, and deferred_init_maxorder() complicates that by using an
> opaque index in addition to start and end pfns.  Move the index outside
> the function to make splitting the job easier, and simplify the code
> while at it.
> 
> deferred_init_maxorder() now always iterates within a single pfn range
> instead of potentially multiple ranges, and advances start_pfn to the
> end of that range instead of the max-order block so partial pfn ranges
> in the block aren't skipped in a later iteration.  The section alignment
> check in deferred_grow_zone() is removed as well since this alignment is
> no longer guaranteed.  It's not clear what value the alignment provided
> originally.
> 
> Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>

So part of the reason for splitting it up along section aligned 
boundaries was because we already had an existing functionality in 
deferred_grow_zone that was going in and pulling out a section aligned 
chunk and processing it to prepare enough memory for other threads to 
keep running. I suspect that the section alignment was done because 
normally I believe that is also the alignment for memory onlining.

With this already breaking things up over multiple threads how does this 
work with deferred_grow_zone? Which thread is it trying to allocate from 
if it needs to allocate some memory for itself?

Also what is to prevent a worker from stopping, or deferred_grow_zone from
bailing out, in the middle of a max order page block if there is a hole
in the middle of the block?

> ---
>   mm/page_alloc.c | 88 +++++++++++++++----------------------------------
>   1 file changed, 27 insertions(+), 61 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 68669d3a5a665..990514d8f0d94 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1708,55 +1708,23 @@ deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
>   }
>   
>   /*
> - * Initialize and free pages. We do it in two loops: first we initialize
> - * struct page, then free to buddy allocator, because while we are
> - * freeing pages we can access pages that are ahead (computing buddy
> - * page in __free_one_page()).
> - *
> - * In order to try and keep some memory in the cache we have the loop
> - * broken along max page order boundaries. This way we will not cause
> - * any issues with the buddy page computation.
> + * Initialize the struct pages and then free them to the buddy allocator at
> + * most a max order block at a time because while we are freeing pages we can
> + * access pages that are ahead (computing buddy page in __free_one_page()).
> + * It's also cache friendly.
>    */
>   static unsigned long __init
> -deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
> -		       unsigned long *end_pfn)
> +deferred_init_maxorder(struct zone *zone, unsigned long *start_pfn,
> +		       unsigned long end_pfn)
>   {
> -	unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
> -	unsigned long spfn = *start_pfn, epfn = *end_pfn;
> -	unsigned long nr_pages = 0;
> -	u64 j = *i;
> -
> -	/* First we loop through and initialize the page values */
> -	for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
> -		unsigned long t;
> -
> -		if (mo_pfn <= *start_pfn)
> -			break;
> -
> -		t = min(mo_pfn, *end_pfn);
> -		nr_pages += deferred_init_pages(zone, *start_pfn, t);
> -
> -		if (mo_pfn < *end_pfn) {
> -			*start_pfn = mo_pfn;
> -			break;
> -		}
> -	}
> -
> -	/* Reset values and now loop through freeing pages as needed */
> -	swap(j, *i);
> -
> -	for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
> -		unsigned long t;
> -
> -		if (mo_pfn <= spfn)
> -			break;
> +	unsigned long nr_pages, pfn;
>   
> -		t = min(mo_pfn, epfn);
> -		deferred_free_pages(spfn, t);
> +	pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
> +	pfn = min(pfn, end_pfn);
>   
> -		if (mo_pfn <= epfn)
> -			break;
> -	}
> +	nr_pages = deferred_init_pages(zone, *start_pfn, pfn);
> +	deferred_free_pages(*start_pfn, pfn);
> +	*start_pfn = pfn;
>   
>   	return nr_pages;
>   }
> @@ -1814,9 +1782,11 @@ static int __init deferred_init_memmap(void *data)
>   	 * that we can avoid introducing any issues with the buddy
>   	 * allocator.
>   	 */
> -	while (spfn < epfn) {
> -		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
> -		cond_resched();
> +	for_each_free_mem_pfn_range_in_zone_from(i, zone, &spfn, &epfn) {
> +		while (spfn < epfn) {
> +			nr_pages += deferred_init_maxorder(zone, &spfn, epfn);
> +			cond_resched();
> +		}
>   	}
>   zone_empty:
>   	/* Sanity check that the next zone really is unpopulated */
> @@ -1883,22 +1853,18 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
>   	 * that we can avoid introducing any issues with the buddy
>   	 * allocator.
>   	 */
> -	while (spfn < epfn) {
> -		/* update our first deferred PFN for this section */
> -		first_deferred_pfn = spfn;
> -
> -		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
> -		touch_nmi_watchdog();
> -
> -		/* We should only stop along section boundaries */
> -		if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
> -			continue;
> -
> -		/* If our quota has been met we can stop here */
> -		if (nr_pages >= nr_pages_needed)
> -			break;
> +	for_each_free_mem_pfn_range_in_zone_from(i, zone, &spfn, &epfn) {
> +		while (spfn < epfn) {
> +			nr_pages += deferred_init_maxorder(zone, &spfn, epfn);
> +			touch_nmi_watchdog();
> +
> +			/* If our quota has been met we can stop here */
> +			if (nr_pages >= nr_pages_needed)
> +				goto out;
> +		}
>   	}
>   
> +out:
>   	pgdat->first_deferred_pfn = spfn;
>   	pgdat_resize_unlock(pgdat, &flags);
>   
>
Daniel Jordan May 1, 2020, 2:45 a.m. UTC | #2
Hi Alex,

On Thu, Apr 30, 2020 at 02:43:28PM -0700, Alexander Duyck wrote:
> On 4/30/2020 1:11 PM, Daniel Jordan wrote:
> > padata will soon divide up pfn ranges between threads when parallelizing
> > deferred init, and deferred_init_maxorder() complicates that by using an
> > opaque index in addition to start and end pfns.  Move the index outside
> > the function to make splitting the job easier, and simplify the code
> > while at it.
> > 
> > deferred_init_maxorder() now always iterates within a single pfn range
> > instead of potentially multiple ranges, and advances start_pfn to the
> > end of that range instead of the max-order block so partial pfn ranges
> > in the block aren't skipped in a later iteration.  The section alignment
> > check in deferred_grow_zone() is removed as well since this alignment is
> > no longer guaranteed.  It's not clear what value the alignment provided
> > originally.
> > 
> > Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
> 
> So part of the reason for splitting it up along section aligned boundaries
> was because we already had an existing functionality in deferred_grow_zone
> that was going in and pulling out a section aligned chunk and processing it
> to prepare enough memory for other threads to keep running. I suspect that
> the section alignment was done because normally I believe that is also the
> alignment for memory onlining.

I think Pavel added that functionality, maybe he could confirm.

My impression was that the reason deferred_grow_zone aligned the requested
order up to a section was to make enough memory available to avoid being called
on every allocation.

> With this already breaking things up over multiple threads how does this
> work with deferred_grow_zone? Which thread is it trying to allocate from if
> it needs to allocate some memory for itself?

I may not be following your question, but deferred_grow_zone doesn't allocate
memory during the multithreading in deferred_init_memmap because the latter
sets first_deferred_pfn so that deferred_grow_zone bails early.

> Also what is to prevent a worker from stop deferred_grow_zone from bailing
> out in the middle of a max order page block if there is a hole in the middle
> of the block?

deferred_grow_zone remains singlethreaded.  It could stop in the middle of a
max order block, but it can't run concurrently with deferred_init_memmap, as
per above, so if deferred_init_memmap were to init 'n free the remaining part
of the block, the previous portion would have already been initialized.
Alexander Duyck May 4, 2020, 10:10 p.m. UTC | #3
On Thu, Apr 30, 2020 at 7:45 PM Daniel Jordan
<daniel.m.jordan@oracle.com> wrote:
>
> Hi Alex,
>
> On Thu, Apr 30, 2020 at 02:43:28PM -0700, Alexander Duyck wrote:
> > On 4/30/2020 1:11 PM, Daniel Jordan wrote:
> > > padata will soon divide up pfn ranges between threads when parallelizing
> > > deferred init, and deferred_init_maxorder() complicates that by using an
> > > opaque index in addition to start and end pfns.  Move the index outside
> > > the function to make splitting the job easier, and simplify the code
> > > while at it.
> > >
> > > deferred_init_maxorder() now always iterates within a single pfn range
> > > instead of potentially multiple ranges, and advances start_pfn to the
> > > end of that range instead of the max-order block so partial pfn ranges
> > > in the block aren't skipped in a later iteration.  The section alignment
> > > check in deferred_grow_zone() is removed as well since this alignment is
> > > no longer guaranteed.  It's not clear what value the alignment provided
> > > originally.
> > >
> > > Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
> >
> > So part of the reason for splitting it up along section aligned boundaries
> > was because we already had an existing functionality in deferred_grow_zone
> > that was going in and pulling out a section aligned chunk and processing it
> > to prepare enough memory for other threads to keep running. I suspect that
> > the section alignment was done because normally I believe that is also the
> > alignment for memory onlining.
>
> I think Pavel added that functionality, maybe he could confirm.
>
> My impression was that the reason deferred_grow_zone aligned the requested
> order up to a section was to make enough memory available to avoid being called
> on every allocation.
>
> > With this already breaking things up over multiple threads how does this
> > work with deferred_grow_zone? Which thread is it trying to allocate from if
> > it needs to allocate some memory for itself?
>
> I may not be following your question, but deferred_grow_zone doesn't allocate
> memory during the multithreading in deferred_init_memmap because the latter
> sets first_deferred_pfn so that deferred_grow_zone bails early.

It has been a while since I looked at this code so I forgot that
deferred_grow_zone is essentially blocked out once we start the
per-node init.

> > Also what is to prevent a worker from stop deferred_grow_zone from bailing
> > out in the middle of a max order page block if there is a hole in the middle
> > of the block?
>
> deferred_grow_zone remains singlethreaded.  It could stop in the middle of a
> max order block, but it can't run concurrently with deferred_init_memmap, as
> per above, so if deferred_init_memmap were to init 'n free the remaining part
> of the block, the previous portion would have already been initialized.

So we cannot stop in the middle of a max order block. That shouldn't
be possible as part of the issue is that the buddy allocator will
attempt to access the buddy for the page which could cause issues if
it tries to merge the page with one that is not initialized. So if
your code supports that then it is definitely broken. That was one of
the reasons for all of the variable weirdness in
deferred_init_maxorder. I was going through and making certain that
while we were initializing the range we were freeing the pages in
MAX_ORDER aligned blocks and skipping over whatever reserved blocks
were there. Basically it was handling the case where a single
MAX_ORDER block could span multiple ranges.

On x86 this was all pretty straightforward and I don't believe we
needed the code, but I seem to recall there were some other
architectures that had more complex memory layouts at the time and
that was one of the reasons why I had to be careful to wait until I
had processed the full MAX_ORDER block before I could start freeing
the pages, otherwise it would start triggering memory corruptions.
Daniel Jordan May 5, 2020, 12:54 a.m. UTC | #4
On Mon, May 04, 2020 at 03:10:46PM -0700, Alexander Duyck wrote:
> So we cannot stop in the middle of a max order block. That shouldn't
> be possible as part of the issue is that the buddy allocator will
> attempt to access the buddy for the page which could cause issues if
> it tries to merge the page with one that is not initialized. So if
> your code supports that then it is definitely broken. That was one of
> the reasons for all of the variable weirdness in
> deferred_init_maxorder. I was going through and making certain that
> while we were initializing the range we were freeing the pages in
> MAX_ORDER aligned blocks and skipping over whatever reserved blocks
> were there. Basically it was handling the case where a single
> MAX_ORDER block could span multiple ranges.
> 
> On x86 this was all pretty straightforward and I don't believe we
> needed the code, but I seem to recall there were some other
> architectures that had more complex memory layouts at the time and
> that was one of the reasons why I had to be careful to wait until I
> had processed the full MAX_ORDER block before I could start freeing
> the pages, otherwise it would start triggering memory corruptions.

Yes, thanks, I missed the case where deferred_grow_zone could stop
mid-max-order-block.

Maybe it's better to leave deferred_init_maxorder alone and adapt the
multithreading to the existing implementation.  That'd mean dealing with the
pesky opaque index somehow, so deferred_init_mem_pfn_range_in_zone() could be
generalized to find it in the thread function based on the start/end range, or
it could be maintained as part of the range that padata passes to the thread
function.

Or, keep this patch but make sure deferred_grow_zone stops on a
max-order-aligned boundary.
Alexander Duyck May 5, 2020, 3:27 p.m. UTC | #5
On Mon, May 4, 2020 at 5:54 PM Daniel Jordan <daniel.m.jordan@oracle.com> wrote:
>
> On Mon, May 04, 2020 at 03:10:46PM -0700, Alexander Duyck wrote:
> > So we cannot stop in the middle of a max order block. That shouldn't
> > be possible as part of the issue is that the buddy allocator will
> > attempt to access the buddy for the page which could cause issues if
> > it tries to merge the page with one that is not initialized. So if
> > your code supports that then it is definitely broken. That was one of
> > the reasons for all of the variable weirdness in
> > deferred_init_maxorder. I was going through and making certain that
> > while we were initializing the range we were freeing the pages in
> > MAX_ORDER aligned blocks and skipping over whatever reserved blocks
> > were there. Basically it was handling the case where a single
> > MAX_ORDER block could span multiple ranges.
> >
> > On x86 this was all pretty straightforward and I don't believe we
> > needed the code, but I seem to recall there were some other
> > architectures that had more complex memory layouts at the time and
> > that was one of the reasons why I had to be careful to wait until I
> > had processed the full MAX_ORDER block before I could start freeing
> > the pages, otherwise it would start triggering memory corruptions.
>
> Yes, thanks, I missed the case where deferred_grow_zone could stop
> mid-max-order-block.

As it turns out, deferred_free_range will be setting the
migratetype for the page. In a sparse config the migratetype bits are
stored in the section bitmap. So to avoid cacheline bouncing it would
make sense to section align the tasks so that they only have one
thread touching one section rather than having the pageblock_flags
getting bounced between threads. It should also reduce the overhead
for having to parallelize the work in the first place since a section
is several times larger than a MAX_ORDER page and allows for more
batching of the work.

> Maybe it's better to leave deferred_init_maxorder alone and adapt the
> multithreading to the existing implementation.  That'd mean dealing with the
> pesky opaque index somehow, so deferred_init_mem_pfn_range_in_zone() could be
> generalized to find it in the thread function based on the start/end range, or
> it could be maintained as part of the range that padata passes to the thread
> function.

You may be better off just implementing your threads to operate like
deferred_grow_zone does. All your worker thread really needs then is
to know where to start performing the page initialization and then it
could go through and process an entire section worth of pages. The
other bit that would have to be changed is patch 6 so that you combine
any ranges that might span a single section instead of just splitting
the work up based on the ranges.

If you are referring to the mo_pfn you shouldn't even need to think
about it. All it is doing is guaranteeing you are processing at least
a full max order worth of pages. Without that the logic before was
either process a whole section, or just process all of memory
initializing it before it started freeing it. I found it made things
much more efficient to process only up to MAX_ORDER at a time as you
could squeeze that into the L2 cache for most x86 processors at least
and it reduced the memory bandwidth by quite a bit. If you update the
code to only provide section aligned/sized ranges of PFNs to
initialize then it can pretty much be ignored since all it is doing is
defining the break point for single MAX_ORDER chunks which would be
smaller than a section anyway.
Daniel Jordan May 6, 2020, 10:39 p.m. UTC | #6
On Tue, May 05, 2020 at 08:27:52AM -0700, Alexander Duyck wrote:
> As it turns out that deferred_free_range will be setting the
> migratetype for the page. In a sparse config the migratetype bits are
> stored in the section bitmap. So to avoid cacheline bouncing it would
> make sense to section align the tasks so that they only have one
> thread touching one section rather than having the pageblock_flags
> getting bounced between threads.

That's a good point, I'll change the alignment.

I kicked off some runs on the Skylake bare metal system to check how this did
and the performance stayed the same, but see below.

> It should also reduce the overhead
> for having to parallelize the work in the first place since a section
> is several times larger than a MAX_ORDER page and allows for more
> batching of the work.

I think you may be assuming that threads work in MAX_ORDER batches, maybe
because that's the job's min_chunk, but padata works differently.  The
min_chunk is a lower bound that establishes the smallest amount of work that
makes sense for a thread to do in one go, so in this case it's useful to
prevent starting large numbers of threads to initialize a tiny amount of pages.

Internally padata uses total job size and min chunk to arrive at the chunk
size, which on big machines will be much larger than min_chunk.  The idea is
the chunk size should be large enough to minimize multithreading overhead but
small enough to permit load balancing between threads.

This is probably why the results didn't change much when aligning by section,
but that doesn't mean other systems won't benefit.

> > Maybe it's better to leave deferred_init_maxorder alone and adapt the
> > multithreading to the existing implementation.  That'd mean dealing with the
> > pesky opaque index somehow, so deferred_init_mem_pfn_range_in_zone() could be

I should have been explicit, was thinking of @i from
for_each_free_mem_pfn_range_in_zone_from() when mentioning the opaque index.

> > generalized to find it in the thread function based on the start/end range, or
> > it could be maintained as part of the range that padata passes to the thread
> > function.
> 
> You may be better off just implementing your threads to operate like
> deferred_grow_zone does. All your worker thread really needs then is
> to know where to start performing the page initialization and then it
> could go through and process an entire section worth of pages. The
> other bit that would have to be changed is patch 6 so that you combine
> any ranges that might span a single section instead of just splitting
> the work up based on the ranges.

How are you thinking of combining them?  I don't see a way to do it without
storing an arbitrary number of ranges somewhere for each thread.

> If you are referring to the mo_pfn you shouldn't even need to think
> about it.

(clarified "opaque index" above)

> All it is doing is guaranteeing you are processing at least
> a full max order worth of pages. Without that the logic before was
> either process a whole section, or just process all of memory
> initializing it before it started freeing it. I found it made things
> much more efficient to process only up to MAX_ORDER at a time as you
> could squeeze that into the L2 cache for most x86 processors at least
> and it reduced the memory bandwidth by quite a bit.

Yes, that was clever, we should keep doing it that way.
Alexander Duyck May 7, 2020, 3:26 p.m. UTC | #7
On Wed, May 6, 2020 at 3:39 PM Daniel Jordan <daniel.m.jordan@oracle.com> wrote:
>
> On Tue, May 05, 2020 at 08:27:52AM -0700, Alexander Duyck wrote:
> > As it turns out that deferred_free_range will be setting the
> > migratetype for the page. In a sparse config the migratetype bits are
> > stored in the section bitmap. So to avoid cacheline bouncing it would
> > make sense to section align the tasks so that they only have one
> > thread touching one section rather than having the pageblock_flags
> > getting bounced between threads.
>
> That's a good point, I'll change the alignment.
>
> I kicked off some runs on the Skylake bare metal system to check how this did
> and the performance stayed the same, but see below.
>
> > It should also reduce the overhead
> > for having to parallelize the work in the first place since a section
> > is several times larger than a MAX_ORDER page and allows for more
> > batching of the work.
>
> I think you may be assuming that threads work in MAX_ORDER batches, maybe
> because that's the job's min_chunk, but padata works differently.  The
> min_chunk is a lower bound that establishes the smallest amount of work that
> makes sense for a thread to do in one go, so in this case it's useful to
> prevent starting large numbers of threads to initialize a tiny amount of pages.
>
> Internally padata uses total job size and min chunk to arrive at the chunk
> size, which on big machines will be much larger than min_chunk.  The idea is
> the chunk size should be large enough to minimize multithreading overhead but
> small enough to permit load balancing between threads.
>
> This is probably why the results didn't change much when aligning by section,
> but that doesn't mean other systems won't benefit.

Okay, that makes sense.

> > > Maybe it's better to leave deferred_init_maxorder alone and adapt the
> > > multithreading to the existing implementation.  That'd mean dealing with the
> > > pesky opaque index somehow, so deferred_init_mem_pfn_range_in_zone() could be
>
> I should have been explicit, was thinking of @i from
> for_each_free_mem_pfn_range_in_zone_from() when mentioning the opaque index.

Okay, that makes sense. However in reality you don't need to split
that piece out. All you really are doing is splitting up the
first_init_pfn value over multiple threads so you just need to make
use of deferred_init_mem_pfn_range_in_zone() to initialize it.

> > > generalized to find it in the thread function based on the start/end range, or
> > > it could be maintained as part of the range that padata passes to the thread
> > > function.
> >
> > You may be better off just implementing your threads to operate like
> > deferred_grow_zone does. All your worker thread really needs then is
> > to know where to start performing the page initialization and then it
> > could go through and process an entire section worth of pages. The
> > other bit that would have to be changed is patch 6 so that you combine
> > any ranges that might span a single section instead of just splitting
> > the work up based on the ranges.
>
> How are you thinking of combining them?  I don't see a way to do it without
> storing an arbitrary number of ranges somewhere for each thread.

So when you are putting together your data you are storing a starting
value and a length. All you end up having to do is make certain that
the size + start pfn is section aligned. Then if you jump to a new
section you have the option of either adding to the size of your
current section or submitting the range and starting with a new start
pfn in a new section. All you are really doing is breaking up the
first_deferred_pfn over multiple sections. What I would do is section
align end_pfn, and then check the next range from the zone. If the
start_pfn of the next range is less than end_pfn you merge the two
ranges by just increasing the size, otherwise you could start a new
range.

The idea is that you just want to define what the valid range of PFNs
are, and if there are sizable holes you skip over them. You would
leave most of the lifting for identifying exactly what PFNs to
initialize to the pfn_range_in_zone iterators since they would all be
read-only accesses anyway.

> > If you are referring to the mo_pfn you shouldn't even need to think
> > about it.
>
> (clarified "opaque index" above)

Thanks.

> > All it is doing is guaranteeing you are processing at least
> > a full max order worth of pages. Without that the logic before was
> > either process a whole section, or just process all of memory
> > initializing it before it started freeing it. I found it made things
> > much more efficient to process only up to MAX_ORDER at a time as you
> > could squeeze that into the L2 cache for most x86 processors at least
> > and it reduced the memory bandwidth by quite a bit.
>
> Yes, that was clever, we should keep doing it that way.

Thanks.
Daniel Jordan May 7, 2020, 8:20 p.m. UTC | #8
On Thu, May 07, 2020 at 08:26:26AM -0700, Alexander Duyck wrote:
> On Wed, May 6, 2020 at 3:39 PM Daniel Jordan <daniel.m.jordan@oracle.com> wrote:
> > On Tue, May 05, 2020 at 08:27:52AM -0700, Alexander Duyck wrote:
> > > > Maybe it's better to leave deferred_init_maxorder alone and adapt the
> > > > multithreading to the existing implementation.  That'd mean dealing with the
> > > > pesky opaque index somehow, so deferred_init_mem_pfn_range_in_zone() could be
> >
> > I should have been explicit, was thinking of @i from
> > for_each_free_mem_pfn_range_in_zone_from() when mentioning the opaque index.
> 
> Okay, that makes sense. However in reality you don't need to split
> that piece out. All you really are doing is splitting up the
> first_init_pfn value over multiple threads so you just need to make
> use of deferred_init_mem_pfn_range_in_zone() to initialize it.

Ok, I assume you mean that each thread should use
deferred_init_mem_pfn_range_in_zone.  Yes, that's what I meant when saying that
function could be generalized, though not sure we should opt for this.

> > > > generalized to find it in the thread function based on the start/end range, or
> > > > it could be maintained as part of the range that padata passes to the thread
> > > > function.
> > >
> > > You may be better off just implementing your threads to operate like
> > > deferred_grow_zone does. All your worker thread really needs then is
> > > to know where to start performing the page initialization and then it
> > > could go through and process an entire section worth of pages. The
> > > other bit that would have to be changed is patch 6 so that you combine
> > > any ranges that might span a single section instead of just splitting
> > > the work up based on the ranges.
> >
> > How are you thinking of combining them?  I don't see a way to do it without
> > storing an arbitrary number of ranges somewhere for each thread.
> 
> So when you are putting together your data you are storing a starting
> value and a length. All you end up having to do is make certain that
> the size + start pfn is section aligned. Then if you jump to a new
> section you have the option of either adding to the size of your
> current section or submitting the range and starting with a new start
> pfn in a new section. All you are really doing is breaking up the
> first_deferred_pfn over multiple sections. What I would do is section
> align end_pfn, and then check the next range from the zone. If the
> start_pfn of the next range is less than end_pfn you merge the two
> ranges by just increasing the size, otherwise you could start a new
> range.
> 
> The idea is that you just want to define what the valid range of PFNs
> are, and if there are sizable holes you skip over them. You would
> leave most of the lifting for identifying exactly what PFNs to
> initialize to the pfn_range_in_zone iterators since they would all be
> read-only accesses anyway.

Ok, I follow you.  My assumption is that there are generally few free pfn
ranges relative to the total number of pfns being initialized so that it's
efficient to parallelize over a single pfn range from the zone iterator.  On
the systems I tested, there were about 20 tiny ranges and one enormous range
per node so that firing off a job per range kept things simple without
affecting performance.  If that assumption holds, I'm not sure it's worth it to
merge ranges.

With the series as it stands plus leaving in the section alignment check in
deferred_grow_zone (which I think could be relaxed to a maxorder alignment
check) so it doesn't stop mid-max-order-block, threads simply deal with a
start/end range and deferred_init_maxorder becomes shorter and simpler too.
Alexander Duyck May 7, 2020, 9:18 p.m. UTC | #9
On Thu, May 7, 2020 at 1:20 PM Daniel Jordan <daniel.m.jordan@oracle.com> wrote:
>
> On Thu, May 07, 2020 at 08:26:26AM -0700, Alexander Duyck wrote:
> > On Wed, May 6, 2020 at 3:39 PM Daniel Jordan <daniel.m.jordan@oracle.com> wrote:
> > > On Tue, May 05, 2020 at 08:27:52AM -0700, Alexander Duyck wrote:
> > > > > Maybe it's better to leave deferred_init_maxorder alone and adapt the
> > > > > multithreading to the existing implementation.  That'd mean dealing with the
> > > > > pesky opaque index somehow, so deferred_init_mem_pfn_range_in_zone() could be
> > >
> > > I should have been explicit, was thinking of @i from
> > > deferred_init_maxorder() when mentioning the opaque index.
> >
> > Okay, that makes sense. However in reality you don't need to split
> > that piece out. All you really are doing is splitting up the
> > first_init_pfn value over multiple threads so you just need to make
> > use of deferred_init_mem_pfn_range_in_zone() to initialize it.
>
> Ok, I assume you mean that each thread should use
> deferred_init_mem_pfn_range_in_zone.  Yes, that's what I meant when saying that
> function could be generalized, though not sure we should opt for this.

Yes that is what I meant.

> > > > > generalized to find it in the thread function based on the start/end range, or
> > > > > it could be maintained as part of the range that padata passes to the thread
> > > > > function.
> > > >
> > > > You may be better off just implementing your threads to operate like
> > > > deferred_grow_zone does. All your worker thread really needs then is
> > > > to know where to start performing the page initialization and then it
> > > > could go through and process an entire section worth of pages. The
> > > > other bit that would have to be changed is patch 6 so that you combine
> > > > any ranges that might span a single section instead of just splitting
> > > > the work up based on the ranges.
> > >
> > > How are you thinking of combining them?  I don't see a way to do it without
> > > storing an arbitrary number of ranges somewhere for each thread.
> >
> > So when you are putting together your data you are storing a starting
> > value and a length. All you end up having to do is make certain that
> > the size + start pfn is section aligned. Then if you jump to a new
> > section you have the option of either adding to the size of your
> > current section or submitting the range and starting with a new start
> > pfn in a new section. All you are really doing is breaking up the
> > first_deferred_pfn over multiple sections. What I would do is section
> > align end_pfn, and then check the next range from the zone. If the
> > start_pfn of the next range is less than end_pfn you merge the two
> > ranges by just increasing the size, otherwise you could start a new
> > range.
> >
> > The idea is that you just want to define what the valid range of PFNs
> > are, and if there are sizable holes you skip over them. You would
> > leave most of the lifting for identifying exactly what PFNs to
> > initialize to the pfn_range_in_zone iterators since they would all be
> > read-only accesses anyway.
>
> Ok, I follow you.  My assumption is that there are generally few free pfn
> ranges relative to the total number of pfns being initialized so that it's
> efficient to parallelize over a single pfn range from the zone iterator.  On
> the systems I tested, there were about 20 tiny ranges and one enormous range
> per node so that firing off a job per range kept things simple without
> affecting performance.  If that assumption holds, I'm not sure it's worth it to
> merge ranges.

The idea behind merging ranges is to address possible cases where a
range is broken up such that there is a hole in a max order block as a
result. By combining the ranges if they both span the same section we
can guarantee that the entire section will be initialized as a block
and not potentially have partially initialized sections floating
around. Without that mo_pfn logic I had in there I was getting panics
every so often when booting up one of my systems as I recall.

Also the iterator itself is cheap. It is basically just walking a
read-only list so it scales efficiently as well. One of the reasons
why I arranged the code the way I did is that it also allowed me to
get rid of an extra check in the code as the previous code was having
to verify if the pfn belonged to the node. That is all handled
directly through the for_each_free_mem_pfn_range_in_zone[_from] call
now.

> With the series as it stands plus leaving in the section alignment check in
> deferred_grow_zone (which I think could be relaxed to a maxorder alignment
> check) so it doesn't stop mid-max-order-block, threads simply deal with a
> start/end range and deferred_init_maxorder becomes shorter and simpler too.

I still think we are better off initializing complete sections since
the pageblock_flags are fully initialized that way as well. What
guarantee do you have that all of the memory ranges will be max order
aligned? The problem is we have to guarantee all pages are initialized
before we start freeing the pages in a max order page. If we just
process each block as-is I believe we can end up with some
architectures trying to access uninitialized memory in the buddy
allocator as a result. That is why the deferred_init_maxorder function
will walk through the iterator, using the _from version to avoid
unnecessary iteration, the first time initializing the pages it needs
to cross that max order boundary, and then again to free the max order
block of pages that have been initialized. The iterator itself is
fairly cheap and only has to get you through the smaller ranges before
you end up at the one big range that it just kind of sits at while it
is working on getting it processed.
Daniel Jordan May 7, 2020, 10:15 p.m. UTC | #10
On Thu, May 07, 2020 at 02:18:42PM -0700, Alexander Duyck wrote:
> The idea behind merging ranges is to address possible cases where a
> range is broken up such that there is a hole in a max order block as a
> result.

Gah, yes, you're right, there could be multiple ranges in a max order block, so
the threads have to use the zone iterators to skip the holes.

> By combining the ranges if they both span the same section we
> can guarantee that the entire section will be initialized as a block
> and not potentially have partially initialized sections floating
> around. Without that mo_pfn logic I had in there I was getting panics
> every so often when booting up one of my systems as I recall.
> 
> Also the iterator itself is cheap. It is basically just walking a
> read-only list so it scales efficiently as well. One of the reasons

Agreed, it's not expensive, it's just gnarliness I was hoping to avoid, but
obviously it's not gonna work.

> why I arranged the code the way I did is that it also allowed me to
> get rid of an extra check in the code as the previous code was having
> to verify if the pfn belonged to the node. That is all handled
> directly through the for_each_free_mem_pfn_range_in_zone[_from] call
> now.
> 
> > With the series as it stands plus leaving in the section alignment check in
> > deferred_grow_zone (which I think could be relaxed to a maxorder alignment
> > check) so it doesn't stop mid-max-order-block, threads simply deal with a
> > start/end range and deferred_init_maxorder becomes shorter and simpler too.
> 
> I still think we are better off initializing complete sections since
> the pageblock_flags are fully initialized that way as well.

Fair enough.

> What
> guarantee do you have that all of the memory ranges will be max order
> aligned?

Sure, it's a problem with multiple ranges in a maxorder block, the rest
could've been handled.

> The problem is we have to guarantee all pages are initialized
> before we start freeing the pages in a max order page. If we just
> process each block as-is I believe we can end up with some
> architectures trying to access uninitialized memory in the buddy
> allocator as a result. That is why the deferred_init_maxorder function
> will walk through the iterator, using the _from version to avoid
> unnecessary iteration, the first time initializing the pages it needs
> to cross that max order boundary, and then again to free the max order
> block of pages that have been initialized. The iterator itself is
> fairly cheap and only has to get you through the smaller ranges before
> you end up at the one big range that it just kind of sits at while it
> is working on getting it processed.

Right.


Ok, I think we're on the same page for the next version.  Thanks for the
thorough review!
diff mbox series

Patch

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 68669d3a5a665..990514d8f0d94 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1708,55 +1708,23 @@  deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
 }
 
 /*
- * Initialize and free pages. We do it in two loops: first we initialize
- * struct page, then free to buddy allocator, because while we are
- * freeing pages we can access pages that are ahead (computing buddy
- * page in __free_one_page()).
- *
- * In order to try and keep some memory in the cache we have the loop
- * broken along max page order boundaries. This way we will not cause
- * any issues with the buddy page computation.
+ * Initialize the struct pages and then free them to the buddy allocator at
+ * most a max order block at a time because while we are freeing pages we can
+ * access pages that are ahead (computing buddy page in __free_one_page()).
+ * It's also cache friendly.
  */
 static unsigned long __init
-deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
-		       unsigned long *end_pfn)
+deferred_init_maxorder(struct zone *zone, unsigned long *start_pfn,
+		       unsigned long end_pfn)
 {
-	unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
-	unsigned long spfn = *start_pfn, epfn = *end_pfn;
-	unsigned long nr_pages = 0;
-	u64 j = *i;
-
-	/* First we loop through and initialize the page values */
-	for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
-		unsigned long t;
-
-		if (mo_pfn <= *start_pfn)
-			break;
-
-		t = min(mo_pfn, *end_pfn);
-		nr_pages += deferred_init_pages(zone, *start_pfn, t);
-
-		if (mo_pfn < *end_pfn) {
-			*start_pfn = mo_pfn;
-			break;
-		}
-	}
-
-	/* Reset values and now loop through freeing pages as needed */
-	swap(j, *i);
-
-	for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
-		unsigned long t;
-
-		if (mo_pfn <= spfn)
-			break;
+	unsigned long nr_pages, pfn;
 
-		t = min(mo_pfn, epfn);
-		deferred_free_pages(spfn, t);
+	pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
+	pfn = min(pfn, end_pfn);
 
-		if (mo_pfn <= epfn)
-			break;
-	}
+	nr_pages = deferred_init_pages(zone, *start_pfn, pfn);
+	deferred_free_pages(*start_pfn, pfn);
+	*start_pfn = pfn;
 
 	return nr_pages;
 }
@@ -1814,9 +1782,11 @@  static int __init deferred_init_memmap(void *data)
 	 * that we can avoid introducing any issues with the buddy
 	 * allocator.
 	 */
-	while (spfn < epfn) {
-		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
-		cond_resched();
+	for_each_free_mem_pfn_range_in_zone_from(i, zone, &spfn, &epfn) {
+		while (spfn < epfn) {
+			nr_pages += deferred_init_maxorder(zone, &spfn, epfn);
+			cond_resched();
+		}
 	}
 zone_empty:
 	/* Sanity check that the next zone really is unpopulated */
@@ -1883,22 +1853,18 @@  deferred_grow_zone(struct zone *zone, unsigned int order)
 	 * that we can avoid introducing any issues with the buddy
 	 * allocator.
 	 */
-	while (spfn < epfn) {
-		/* update our first deferred PFN for this section */
-		first_deferred_pfn = spfn;
-
-		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
-		touch_nmi_watchdog();
-
-		/* We should only stop along section boundaries */
-		if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
-			continue;
-
-		/* If our quota has been met we can stop here */
-		if (nr_pages >= nr_pages_needed)
-			break;
+	for_each_free_mem_pfn_range_in_zone_from(i, zone, &spfn, &epfn) {
+		while (spfn < epfn) {
+			nr_pages += deferred_init_maxorder(zone, &spfn, epfn);
+			touch_nmi_watchdog();
+
+			/* If our quota has been met we can stop here */
+			if (nr_pages >= nr_pages_needed)
+				goto out;
+		}
 	}
 
+out:
 	pgdat->first_deferred_pfn = spfn;
 	pgdat_resize_unlock(pgdat, &flags);