Message ID | 20190715081549.32577-3-osalvador@suse.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Fixes for sub-section hotplug | expand |
Oscar Salvador <osalvador@suse.de> writes: > Since [1], shrink_{zone,node}_span work on PAGES_PER_SUBSECTION granularity. > The problem is that deactivation of the section occurs later on in > sparse_remove_section, so pfn_valid()->pfn_section_valid() will always return > true before we deactivate the {sub}section. Can you explain this more? The patch doesn't update section_mem_map update sequence. So what changed? What is the problem in finding pfn_valid() return true there? > > I spotted this during hotplug hotremove tests, there I always saw that > spanned_pages was, at least, left with PAGES_PER_SECTION, even if we > removed all memory linked to that zone. > > Fix this by decoupling section_deactivate from sparse_remove_section, and > re-order the function calls. > > Now, __remove_section will: > > 1) deactivate section > 2) shrink {zone,node}'s pages > 3) remove section > > [1] https://patchwork.kernel.org/patch/11003467/ > > Fixes: mmotm ("mm/hotplug: prepare shrink_{zone, pgdat}_span for sub-section removal") > Signed-off-by: Oscar Salvador <osalvador@suse.de> > --- > include/linux/memory_hotplug.h | 7 ++-- > mm/memory_hotplug.c | 6 +++- > mm/sparse.c | 77 +++++++++++++++++++++++++++++------------- > 3 files changed, 62 insertions(+), 28 deletions(-) > > diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h > index f46ea71b4ffd..d2eb917aad5f 100644 > --- a/include/linux/memory_hotplug.h > +++ b/include/linux/memory_hotplug.h > @@ -348,9 +348,10 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, > extern bool is_memblock_offlined(struct memory_block *mem); > extern int sparse_add_section(int nid, unsigned long pfn, > unsigned long nr_pages, struct vmem_altmap *altmap); > -extern void sparse_remove_section(struct mem_section *ms, > - unsigned long pfn, unsigned long nr_pages, > - unsigned long map_offset, struct vmem_altmap *altmap); > +int sparse_deactivate_section(unsigned long pfn, unsigned long 
nr_pages); > +void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, > + unsigned long map_offset, struct vmem_altmap *altmap, > + int section_empty); > extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, > unsigned long pnum); > extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c > index b9ba5b85f9f7..03d535eee60d 100644 > --- a/mm/memory_hotplug.c > +++ b/mm/memory_hotplug.c > @@ -517,12 +517,16 @@ static void __remove_section(struct zone *zone, unsigned long pfn, > struct vmem_altmap *altmap) > { > struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn)); > + int ret; > > if (WARN_ON_ONCE(!valid_section(ms))) > return; > > + ret = sparse_deactivate_section(pfn, nr_pages); > __remove_zone(zone, pfn, nr_pages); > - sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap); > + if (ret >= 0) > + sparse_remove_section(pfn, nr_pages, map_offset, altmap, > + ret); > } > > /** > diff --git a/mm/sparse.c b/mm/sparse.c > index 1e224149aab6..d4953ee1d087 100644 > --- a/mm/sparse.c > +++ b/mm/sparse.c > @@ -732,16 +732,47 @@ static void free_map_bootmem(struct page *memmap) > } > #endif /* CONFIG_SPARSEMEM_VMEMMAP */ > > -static void section_deactivate(unsigned long pfn, unsigned long nr_pages, > - struct vmem_altmap *altmap) > +static void section_remove(unsigned long pfn, unsigned long nr_pages, > + struct vmem_altmap *altmap, int section_empty) > +{ > + struct mem_section *ms = __pfn_to_section(pfn); > + bool section_early = early_section(ms); > + struct page *memmap = NULL; > + > + if (section_empty) { > + unsigned long section_nr = pfn_to_section_nr(pfn); > + > + if (!section_early) { > + kfree(ms->usage); > + ms->usage = NULL; > + } > + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); > + ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr); > + } > + > + if (section_early && memmap) > + 
free_map_bootmem(memmap); > + else > + depopulate_section_memmap(pfn, nr_pages, altmap); > +} > + > +/** > + * section_deactivate: Deactivate a {sub}section. > + * > + * Return: > + * * -1 - {sub}section has already been deactivated. > + * * 0 - Section is not empty > + * * 1 - Section is empty > + */ > + > +static int section_deactivate(unsigned long pfn, unsigned long nr_pages) > { > DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; > DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; > struct mem_section *ms = __pfn_to_section(pfn); > - bool section_is_early = early_section(ms); > - struct page *memmap = NULL; > unsigned long *subsection_map = ms->usage > ? &ms->usage->subsection_map[0] : NULL; > + int section_empty = 0; > > subsection_mask_set(map, pfn, nr_pages); > if (subsection_map) > @@ -750,7 +781,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, > if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), > "section already deactivated (%#lx + %ld)\n", > pfn, nr_pages)) > - return; > + return -1; > > /* > * There are 3 cases to handle across two configurations > @@ -770,21 +801,10 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, > * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified > */ > bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); > - if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) { > - unsigned long section_nr = pfn_to_section_nr(pfn); > - > - if (!section_is_early) { > - kfree(ms->usage); > - ms->usage = NULL; > - } > - memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); > - ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr); > - } > + if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) > + section_empty = 1; > > - if (section_is_early && memmap) > - free_map_bootmem(memmap); > - else > - depopulate_section_memmap(pfn, nr_pages, altmap); > + return section_empty; > } > > static struct page 
* __meminit section_activate(int nid, unsigned long pfn, > @@ -834,7 +854,11 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn, > > memmap = populate_section_memmap(pfn, nr_pages, nid, altmap); > if (!memmap) { > - section_deactivate(pfn, nr_pages, altmap); > + int ret; > + > + ret = section_deactivate(pfn, nr_pages); > + if (ret >= 0) > + section_remove(pfn, nr_pages, altmap, ret); > return ERR_PTR(-ENOMEM); > } > > @@ -919,12 +943,17 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) > } > #endif > > -void sparse_remove_section(struct mem_section *ms, unsigned long pfn, > - unsigned long nr_pages, unsigned long map_offset, > - struct vmem_altmap *altmap) > +int sparse_deactivate_section(unsigned long pfn, unsigned long nr_pages) > +{ > + return section_deactivate(pfn, nr_pages); > +} > + > +void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, > + unsigned long map_offset, struct vmem_altmap *altmap, > + int section_empty) > { > clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset, > nr_pages - map_offset); > - section_deactivate(pfn, nr_pages, altmap); > + section_remove(pfn, nr_pages, altmap, section_empty); > } > #endif /* CONFIG_MEMORY_HOTPLUG */ > -- > 2.12.3
On Mon, 2019-07-15 at 21:41 +0530, Aneesh Kumar K.V wrote: > Oscar Salvador <osalvador@suse.de> writes: > > > Since [1], shrink_{zone,node}_span work on PAGES_PER_SUBSECTION > > granularity. > > The problem is that deactivation of the section occurs later on in > > sparse_remove_section, so pfn_valid()->pfn_section_valid() will > > always return > > true before we deactivate the {sub}section. > > Can you explain this more? The patch doesn't update section_mem_map > update sequence. So what changed? What is the problem in finding > pfn_valid() return true there? I realized that the changelog was quite modest, so a better explanation will follow. Let us analyze what shrink_{zone,node}_span does. We have to remember that shrink_zone_span gets called every time a section is to be removed. There can be three possibilities: 1) section to be removed is the first one of the zone 2) section to be removed is the last one of the zone 3) section to be removed falls in the middle For 1) and 2) cases, we will try to find the next section from bottom/top, and in the third case we will check whether the section contains only holes. Now, let us take the example where a ZONE contains only 1 section, and we remove it. The last loop of shrink_zone_span, will check for the [start_pfn, end_pfn] PAGES_PER_SECTION block the following: - section is valid - pfn relates to the current zone/nid - section is not the section to be removed Since we only got 1 section here, the check "start_pfn == pfn" will make us continue the loop and then we are done. Now, what happens after the patch? We increment pfn on a subsection basis, since "start_pfn == pfn", we jump to the next sub-section (pfn+512), and call pfn_valid()- >pfn_section_valid(). Since the section has not yet been deactivated, pfn_section_valid() will return true, and we will repeat this until the end of the loop. 
What should happen instead is: - we deactivate the {sub}-section before calling shrink_{zone,node}_span - calls to pfn_valid() will now return false for the sections that have been deactivated, and so we will get the pfn from the next activated sub-section, or nothing if the section is empty (the section does not contain active sub-sections). The example relates to the last loop in shrink_zone_span, but the same applies to find_{smallest,biggest}_section. Please, note that we could probably do some hack like replacing: start_pfn == pfn with pfn < end_pfn But the way to fix this is to 1) deactivate the {sub}-section and 2) let shrink_{node,zone}_span find the next active {sub-section}. I hope this makes it more clear.
On Mon, Jul 15, 2019 at 2:24 PM Oscar Salvador <osalvador@suse.de> wrote: > > On Mon, 2019-07-15 at 21:41 +0530, Aneesh Kumar K.V wrote: > > Oscar Salvador <osalvador@suse.de> writes: > > > > > Since [1], shrink_{zone,node}_span work on PAGES_PER_SUBSECTION > > > granularity. > > > The problem is that deactivation of the section occurs later on in > > > sparse_remove_section, so pfn_valid()->pfn_section_valid() will > > > always return > > > true before we deactivate the {sub}section. > > > > Can you explain this more? The patch doesn't update section_mem_map > > update sequence. So what changed? What is the problem in finding > > pfn_valid() return true there? > > I realized that the changelog was quite modest, so a better explanation > will follow. > > Let us analize what shrink_{zone,node}_span does. > We have to remember that shrink_zone_span gets called every time a > section is to be removed. > > There can be three possibilites: > > 1) section to be removed is the first one of the zone > 2) section to be removed is the last one of the zone > 3) section to be removed falls in the middle > > For 1) and 2) cases, we will try to find the next section from > bottom/top, and in the third case we will check whether the section > contains only holes. > > Now, let us take the example where a ZONE contains only 1 section, and > we remove it. > The last loop of shrink_zone_span, will check for {start_pfn,end_pfn] > PAGES_PER_SECTION block the following: > > - section is valid > - pfn relates to the current zone/nid > - section is not the section to be removed > > Since we only got 1 section here, the check "start_pfn == pfn" will make us to continue the loop and then we are done. > > Now, what happens after the patch? > > We increment pfn on subsection basis, since "start_pfn == pfn", we jump > to the next sub-section (pfn+512), and call pfn_valid()- > >pfn_section_valid(). 
> Since section has not been yet deactivded, pfn_section_valid() will > return true, and we will repeat this until the end of the loop. > > What should happen instead is: > > - we deactivate the {sub}-section before calling > shirnk_{zone,node}_span > - calls to pfn_valid() will now return false for the sections that have > been deactivated, and so we will get the pfn from the next activaded > sub-section, or nothing if the section is empty (section do not contain > active sub-sections). > > The example relates to the last loop in shrink_zone_span, but the same > applies to find_{smalles,biggest}_section. > > Please, note that we could probably do some hack like replacing: > > start_pfn == pfn > > with > > pfn < end_pfn > > But the way to fix this is to 1) deactivate {sub}-section and 2) let > shrink_{node,zone}_span find the next active {sub-section}. > > I hope this makes it more clear. This makes it more clear that the problem is with the "start_pfn == pfn" check relative to subsections, but it does not clarify why it needs to clear pfn_valid() before calling shrink_zone_span(). Sections were not invalidated prior to shrink_zone_span() in the pre-subsection implementation and it seems all we need is to keep the same semantic. I.e. skip the range that is currently being removed: diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 37d49579ac15..b69832db442b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -422,8 +422,8 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, if (page_zone(pfn_to_page(pfn)) != zone) continue; - /* If the section is current section, it continues the loop */ - if (start_pfn == pfn) + /* If the sub-section is current span being removed, skip */ + if (pfn >= start_pfn && pfn < end_pfn) continue; /* If we find valid section, we have nothing to do */ I otherwise don't follow why we would need to deactivate prior to __remove_zone().
Oscar Salvador <osalvador@suse.de> writes: > On Mon, 2019-07-15 at 21:41 +0530, Aneesh Kumar K.V wrote: >> Oscar Salvador <osalvador@suse.de> writes: >> >> > Since [1], shrink_{zone,node}_span work on PAGES_PER_SUBSECTION >> > granularity. >> > The problem is that deactivation of the section occurs later on in >> > sparse_remove_section, so pfn_valid()->pfn_section_valid() will >> > always return >> > true before we deactivate the {sub}section. >> >> Can you explain this more? The patch doesn't update section_mem_map >> update sequence. So what changed? What is the problem in finding >> pfn_valid() return true there? > > I realized that the changelog was quite modest, so a better explanation > will follow. > > Let us analize what shrink_{zone,node}_span does. > We have to remember that shrink_zone_span gets called every time a > section is to be removed. > > There can be three possibilites: > > 1) section to be removed is the first one of the zone > 2) section to be removed is the last one of the zone > 3) section to be removed falls in the middle > > For 1) and 2) cases, we will try to find the next section from > bottom/top, and in the third case we will check whether the section > contains only holes. > > Now, let us take the example where a ZONE contains only 1 section, and > we remove it. > The last loop of shrink_zone_span, will check for {start_pfn,end_pfn] > PAGES_PER_SECTION block the following: > > - section is valid > - pfn relates to the current zone/nid > - section is not the section to be removed > > Since we only got 1 section here, the check "start_pfn == pfn" will make us to continue the loop and then we are done. > > Now, what happens after the patch? > > We increment pfn on subsection basis, since "start_pfn == pfn", we jump > to the next sub-section (pfn+512), and call pfn_valid()- >>pfn_section_valid(). > Since section has not been yet deactivded, pfn_section_valid() will > return true, and we will repeat this until the end of the loop. 
> > What should happen instead is: > > - we deactivate the {sub}-section before calling > shirnk_{zone,node}_span > - calls to pfn_valid() will now return false for the sections that have > been deactivated, and so we will get the pfn from the next activaded > sub-section, or nothing if the section is empty (section do not contain > active sub-sections). > > The example relates to the last loop in shrink_zone_span, but the same > applies to find_{smalles,biggest}_section. > > Please, note that we could probably do some hack like replacing: > > start_pfn == pfn > > with > > pfn < end_pfn Why do you consider this a hack? /* If the section is current section, it continues the loop */ if (start_pfn == pfn) continue; The comment explains that check is there to handle the exact scenario that you are fixing in this patch. With subsection patch that check is not sufficient. Shouldn't we just fix the check to handle that? Not sure about your comment w.r.t find_{smalles,biggest}_section. We search with pfn range outside the subsection we are trying to remove. So this should not have an impact there? > > But the way to fix this is to 1) deactivate {sub}-section and 2) let > shrink_{node,zone}_span find the next active {sub-section}. > > I hope this makes it more clear. -aneesh
On Tue, Jul 16, 2019 at 07:28:54PM -0700, Dan Williams wrote: > This makes it more clear that the problem is with the "start_pfn == > pfn" check relative to subsections, but it does not clarify why it > needs to clear pfn_valid() before calling shrink_zone_span(). > Sections were not invalidated prior to shrink_zone_span() in the > pre-subsection implementation and it seems all we need is to keep the > same semantic. I.e. skip the range that is currently being removed: Yes, as I said in my reply to Aneesh, that is the other way I thought when fixing it. The reason I went this way is because it seemed more reasonable and natural to me that pfn_valid() would just return the next active sub-section. I just thought that we could leverage the fact that we can deactivate a sub-section before scanning for the next one. On second thought, the changes do not outweigh the case, being the first fix enough and less intrusive, so I will send a v2 with that instead.
On Wed, Jul 17, 2019 at 11:08:54AM +0530, Aneesh Kumar K.V wrote: > Oscar Salvador <osalvador@suse.de> writes: > > > On Mon, 2019-07-15 at 21:41 +0530, Aneesh Kumar K.V wrote: > >> Oscar Salvador <osalvador@suse.de> writes: > >> > >> > Since [1], shrink_{zone,node}_span work on PAGES_PER_SUBSECTION > >> > granularity. > >> > The problem is that deactivation of the section occurs later on in > >> > sparse_remove_section, so pfn_valid()->pfn_section_valid() will > >> > always return > >> > true before we deactivate the {sub}section. > >> > >> Can you explain this more? The patch doesn't update section_mem_map > >> update sequence. So what changed? What is the problem in finding > >> pfn_valid() return true there? > > > > I realized that the changelog was quite modest, so a better explanation > > will follow. > > > > Let us analize what shrink_{zone,node}_span does. > > We have to remember that shrink_zone_span gets called every time a > > section is to be removed. > > > > There can be three possibilites: > > > > 1) section to be removed is the first one of the zone > > 2) section to be removed is the last one of the zone > > 3) section to be removed falls in the middle > > > > For 1) and 2) cases, we will try to find the next section from > > bottom/top, and in the third case we will check whether the section > > contains only holes. > > > > Now, let us take the example where a ZONE contains only 1 section, and > > we remove it. > > The last loop of shrink_zone_span, will check for {start_pfn,end_pfn] > > PAGES_PER_SECTION block the following: > > > > - section is valid > > - pfn relates to the current zone/nid > > - section is not the section to be removed > > > > Since we only got 1 section here, the check "start_pfn == pfn" will make us to continue the loop and then we are done. > > > > Now, what happens after the patch? 
> > > > We increment pfn on subsection basis, since "start_pfn == pfn", we jump > > to the next sub-section (pfn+512), and call pfn_valid()- > >>pfn_section_valid(). > > Since section has not been yet deactivded, pfn_section_valid() will > > return true, and we will repeat this until the end of the loop. > > > > What should happen instead is: > > > > - we deactivate the {sub}-section before calling > > shirnk_{zone,node}_span > > - calls to pfn_valid() will now return false for the sections that have > > been deactivated, and so we will get the pfn from the next activaded > > sub-section, or nothing if the section is empty (section do not contain > > active sub-sections). > > > > The example relates to the last loop in shrink_zone_span, but the same > > applies to find_{smalles,biggest}_section. > > > > Please, note that we could probably do some hack like replacing: > > > > start_pfn == pfn > > > > with > > > > pfn < end_pfn > > Why do you consider this a hack? > > /* If the section is current section, it continues the loop */ > if (start_pfn == pfn) > continue; I did not consider this a hack, but I really did not like to adapt that to the sub-section case as it seemed more natural to 1) deactivate sub-section and 2) look for the next one. So we would not need these checks. I might have bored at that time and I went for the most complex way to fix it. I will send v2 with the less intrusive check. > > The comment explains that check is there to handle the exact scenario > that you are fixing in this patch. With subsection patch that check is > not sufficient. Shouldn't we just fix the check to handle that? > > Not sure about your comment w.r.t find_{smalles,biggest}_section. We > search with pfn range outside the subsection we are trying to remove. > So this should not have an impact there? Yeah, I overlooked the code.
On 17.07.19 09:38, Oscar Salvador wrote: > On Tue, Jul 16, 2019 at 07:28:54PM -0700, Dan Williams wrote: >> This makes it more clear that the problem is with the "start_pfn == >> pfn" check relative to subsections, but it does not clarify why it >> needs to clear pfn_valid() before calling shrink_zone_span(). >> Sections were not invalidated prior to shrink_zone_span() in the >> pre-subsection implementation and it seems all we need is to keep the >> same semantic. I.e. skip the range that is currently being removed: > > Yes, as I said in my reply to Aneesh, that is the other way I thought > when fixing it. > The reason I went this way is because it seemed more reasonable and > natural to me that pfn_valid() would just return the next active > sub-section. > > I just though that we could leverage the fact that we can deactivate > a sub-section before scanning for the next one. > > On a second thought, the changes do not outweight the case, being the first > fix enough and less intrusive, so I will send a v2 with that instead. > > I'd also like to note that we should strive for making all zone-related changes when offlining in the future, not when removing memory. So ideally, any core changes we perform from now, should make that step (IOW implementing that) easier, not harder. Of course, BUGs have to be fixed. The rough idea would be to also mark ZONE_DEVICE sections as ONLINE (or rather rename it to "ACTIVE" to generalize). For each section we would then have - pfn_valid(): We have a valid "struct page" / memmap - pfn_present(): We have actually added that memory via an official interface to mm (e.g., arch_add_memory() ) - pfn_online() / (or pfn_active()): Memory is active (online in "buddy"- speak, or memory that was moved to the ZONE_DEVICE zone) When resizing the zones (e.g., when offlining memory), we would then search for pfn_online(), not pfn_present(). 
In addition to move_pfn_range_to_zone(), we would have remove_pfn_range_from_zone(), called during offline_pages() / by devmem/hmm/pmem code before removing. (I started to look into this, but I don't have any patches yet)
On Wed, Jul 17, 2019 at 10:01:01AM +0200, David Hildenbrand wrote: > I'd also like to note that we should strive for making all zone-related > changes when offlining in the future, not when removing memory. So > ideally, any core changes we perform from now, should make that step > (IOW implementing that) easier, not harder. Of course, BUGs have to be > fixed. > > The rough idea would be to also mark ZONE_DEVICE sections as ONLINE (or > rather rename it to "ACTIVE" to generalize). > > For each section we would then have > > - pfn_valid(): We have a valid "struct page" / memmap > - pfn_present(): We have actually added that memory via an oficial > interface to mm (e.g., arch_add_memory() ) > - pfn_online() / (or pfn_active()): Memory is active (online in "buddy"- > speak, or memory that was moved to the ZONE_DEVICE zone) > > When resizing the zones (e.g., when offlining memory), we would then > search for pfn_online(), not pfn_present(). > > In addition to move_pfn_range_to_zone(), we would have > remove_pfn_range_from_zone(), called during offline_pages() / by > devmem/hmm/pmem code before removing. > > (I started to look into this, but I don't have any patches yet) Yes, it seems like a good approach, and something that makes sense to pursue. FWIF, I sent a patchset [1] for that a long time ago, but I could not follow-up due to time constraints. [1] https://patchwork.kernel.org/cover/10700783/
On Mon, Jul 15, 2019 at 10:15:49AM +0200, Oscar Salvador wrote: > Since [1], shrink_{zone,node}_span work on PAGES_PER_SUBSECTION granularity. > The problem is that deactivation of the section occurs later on in > sparse_remove_section, so pfn_valid()->pfn_section_valid() will always return > true before we deactivate the {sub}section. > > I spotted this during hotplug hotremove tests, there I always saw that > spanned_pages was, at least, left with PAGES_PER_SECTION, even if we > removed all memory linked to that zone. > > Fix this by decoupling section_deactivate from sparse_remove_section, and > re-order the function calls. > > Now, __remove_section will: > > 1) deactivate section > 2) shrink {zone,node}'s pages > 3) remove section > > [1] https://patchwork.kernel.org/patch/11003467/ Hi Andrew, Please, drop this patch as patch [1] is the easiest way to fix this. thanks a lot [1] https://patchwork.kernel.org/patch/11047499/ > > Fixes: mmotm ("mm/hotplug: prepare shrink_{zone, pgdat}_span for sub-section removal") > Signed-off-by: Oscar Salvador <osalvador@suse.de> > --- > include/linux/memory_hotplug.h | 7 ++-- > mm/memory_hotplug.c | 6 +++- > mm/sparse.c | 77 +++++++++++++++++++++++++++++------------- > 3 files changed, 62 insertions(+), 28 deletions(-) > > diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h > index f46ea71b4ffd..d2eb917aad5f 100644 > --- a/include/linux/memory_hotplug.h > +++ b/include/linux/memory_hotplug.h > @@ -348,9 +348,10 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, > extern bool is_memblock_offlined(struct memory_block *mem); > extern int sparse_add_section(int nid, unsigned long pfn, > unsigned long nr_pages, struct vmem_altmap *altmap); > -extern void sparse_remove_section(struct mem_section *ms, > - unsigned long pfn, unsigned long nr_pages, > - unsigned long map_offset, struct vmem_altmap *altmap); > +int sparse_deactivate_section(unsigned long pfn, unsigned long 
nr_pages); > +void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, > + unsigned long map_offset, struct vmem_altmap *altmap, > + int section_empty); > extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, > unsigned long pnum); > extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c > index b9ba5b85f9f7..03d535eee60d 100644 > --- a/mm/memory_hotplug.c > +++ b/mm/memory_hotplug.c > @@ -517,12 +517,16 @@ static void __remove_section(struct zone *zone, unsigned long pfn, > struct vmem_altmap *altmap) > { > struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn)); > + int ret; > > if (WARN_ON_ONCE(!valid_section(ms))) > return; > > + ret = sparse_deactivate_section(pfn, nr_pages); > __remove_zone(zone, pfn, nr_pages); > - sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap); > + if (ret >= 0) > + sparse_remove_section(pfn, nr_pages, map_offset, altmap, > + ret); > } > > /** > diff --git a/mm/sparse.c b/mm/sparse.c > index 1e224149aab6..d4953ee1d087 100644 > --- a/mm/sparse.c > +++ b/mm/sparse.c > @@ -732,16 +732,47 @@ static void free_map_bootmem(struct page *memmap) > } > #endif /* CONFIG_SPARSEMEM_VMEMMAP */ > > -static void section_deactivate(unsigned long pfn, unsigned long nr_pages, > - struct vmem_altmap *altmap) > +static void section_remove(unsigned long pfn, unsigned long nr_pages, > + struct vmem_altmap *altmap, int section_empty) > +{ > + struct mem_section *ms = __pfn_to_section(pfn); > + bool section_early = early_section(ms); > + struct page *memmap = NULL; > + > + if (section_empty) { > + unsigned long section_nr = pfn_to_section_nr(pfn); > + > + if (!section_early) { > + kfree(ms->usage); > + ms->usage = NULL; > + } > + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); > + ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr); > + } > + > + if (section_early && memmap) > + 
free_map_bootmem(memmap); > + else > + depopulate_section_memmap(pfn, nr_pages, altmap); > +} > + > +/** > + * section_deactivate: Deactivate a {sub}section. > + * > + * Return: > + * * -1 - {sub}section has already been deactivated. > + * * 0 - Section is not empty > + * * 1 - Section is empty > + */ > + > +static int section_deactivate(unsigned long pfn, unsigned long nr_pages) > { > DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; > DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; > struct mem_section *ms = __pfn_to_section(pfn); > - bool section_is_early = early_section(ms); > - struct page *memmap = NULL; > unsigned long *subsection_map = ms->usage > ? &ms->usage->subsection_map[0] : NULL; > + int section_empty = 0; > > subsection_mask_set(map, pfn, nr_pages); > if (subsection_map) > @@ -750,7 +781,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, > if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), > "section already deactivated (%#lx + %ld)\n", > pfn, nr_pages)) > - return; > + return -1; > > /* > * There are 3 cases to handle across two configurations > @@ -770,21 +801,10 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, > * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified > */ > bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); > - if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) { > - unsigned long section_nr = pfn_to_section_nr(pfn); > - > - if (!section_is_early) { > - kfree(ms->usage); > - ms->usage = NULL; > - } > - memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); > - ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr); > - } > + if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) > + section_empty = 1; > > - if (section_is_early && memmap) > - free_map_bootmem(memmap); > - else > - depopulate_section_memmap(pfn, nr_pages, altmap); > + return section_empty; > } > > static struct page 
* __meminit section_activate(int nid, unsigned long pfn, > @@ -834,7 +854,11 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn, > > memmap = populate_section_memmap(pfn, nr_pages, nid, altmap); > if (!memmap) { > - section_deactivate(pfn, nr_pages, altmap); > + int ret; > + > + ret = section_deactivate(pfn, nr_pages); > + if (ret >= 0) > + section_remove(pfn, nr_pages, altmap, ret); > return ERR_PTR(-ENOMEM); > } > > @@ -919,12 +943,17 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) > } > #endif > > -void sparse_remove_section(struct mem_section *ms, unsigned long pfn, > - unsigned long nr_pages, unsigned long map_offset, > - struct vmem_altmap *altmap) > +int sparse_deactivate_section(unsigned long pfn, unsigned long nr_pages) > +{ > + return section_deactivate(pfn, nr_pages); > +} > + > +void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, > + unsigned long map_offset, struct vmem_altmap *altmap, > + int section_empty) > { > clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset, > nr_pages - map_offset); > - section_deactivate(pfn, nr_pages, altmap); > + section_remove(pfn, nr_pages, altmap, section_empty); > } > #endif /* CONFIG_MEMORY_HOTPLUG */ > -- > 2.12.3 >
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index f46ea71b4ffd..d2eb917aad5f 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -348,9 +348,10 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_section(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap); -extern void sparse_remove_section(struct mem_section *ms, - unsigned long pfn, unsigned long nr_pages, - unsigned long map_offset, struct vmem_altmap *altmap); +int sparse_deactivate_section(unsigned long pfn, unsigned long nr_pages); +void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, + unsigned long map_offset, struct vmem_altmap *altmap, + int section_empty); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b9ba5b85f9f7..03d535eee60d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -517,12 +517,16 @@ static void __remove_section(struct zone *zone, unsigned long pfn, struct vmem_altmap *altmap) { struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn)); + int ret; if (WARN_ON_ONCE(!valid_section(ms))) return; + ret = sparse_deactivate_section(pfn, nr_pages); __remove_zone(zone, pfn, nr_pages); - sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap); + if (ret >= 0) + sparse_remove_section(pfn, nr_pages, map_offset, altmap, + ret); } /** diff --git a/mm/sparse.c b/mm/sparse.c index 1e224149aab6..d4953ee1d087 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -732,16 +732,47 @@ static void free_map_bootmem(struct page *memmap) } #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -static void section_deactivate(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) +static void 
section_remove(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap, int section_empty) +{ + struct mem_section *ms = __pfn_to_section(pfn); + bool section_early = early_section(ms); + struct page *memmap = NULL; + + if (section_empty) { + unsigned long section_nr = pfn_to_section_nr(pfn); + + if (!section_early) { + kfree(ms->usage); + ms->usage = NULL; + } + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr); + } + + if (section_early && memmap) + free_map_bootmem(memmap); + else + depopulate_section_memmap(pfn, nr_pages, altmap); +} + +/** + * section_deactivate: Deactivate a {sub}section. + * + * Return: + * * -1 - {sub}section has already been deactivated. + * * 0 - Section is not empty + * * 1 - Section is empty + */ + +static int section_deactivate(unsigned long pfn, unsigned long nr_pages) { DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; struct mem_section *ms = __pfn_to_section(pfn); - bool section_is_early = early_section(ms); - struct page *memmap = NULL; unsigned long *subsection_map = ms->usage ? 
&ms->usage->subsection_map[0] : NULL; + int section_empty = 0; subsection_mask_set(map, pfn, nr_pages); if (subsection_map) @@ -750,7 +781,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), "section already deactivated (%#lx + %ld)\n", pfn, nr_pages)) - return; + return -1; /* * There are 3 cases to handle across two configurations @@ -770,21 +801,10 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified */ bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); - if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) { - unsigned long section_nr = pfn_to_section_nr(pfn); - - if (!section_is_early) { - kfree(ms->usage); - ms->usage = NULL; - } - memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); - ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr); - } + if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) + section_empty = 1; - if (section_is_early && memmap) - free_map_bootmem(memmap); - else - depopulate_section_memmap(pfn, nr_pages, altmap); + return section_empty; } static struct page * __meminit section_activate(int nid, unsigned long pfn, @@ -834,7 +854,11 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn, memmap = populate_section_memmap(pfn, nr_pages, nid, altmap); if (!memmap) { - section_deactivate(pfn, nr_pages, altmap); + int ret; + + ret = section_deactivate(pfn, nr_pages); + if (ret >= 0) + section_remove(pfn, nr_pages, altmap, ret); return ERR_PTR(-ENOMEM); } @@ -919,12 +943,17 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) } #endif -void sparse_remove_section(struct mem_section *ms, unsigned long pfn, - unsigned long nr_pages, unsigned long map_offset, - struct vmem_altmap *altmap) +int sparse_deactivate_section(unsigned long pfn, unsigned long 
nr_pages) +{ + return section_deactivate(pfn, nr_pages); +} + +void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, + unsigned long map_offset, struct vmem_altmap *altmap, + int section_empty) { clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset, nr_pages - map_offset); - section_deactivate(pfn, nr_pages, altmap); + section_remove(pfn, nr_pages, altmap, section_empty); } #endif /* CONFIG_MEMORY_HOTPLUG */
Since [1], shrink_{zone,node}_span work on PAGES_PER_SUBSECTION granularity. The problem is that deactivation of the section occurs later on in sparse_remove_section, so pfn_valid()->pfn_section_valid() will always return true before we deactivate the {sub}section. I spotted this during hotplug/hotremove tests, where I always saw that spanned_pages was, at least, left with PAGES_PER_SECTION, even if we removed all memory linked to that zone. Fix this by decoupling section_deactivate from sparse_remove_section, and re-ordering the function calls. Now, __remove_section will: 1) deactivate section 2) shrink {zone,node}'s pages 3) remove section [1] https://patchwork.kernel.org/patch/11003467/ Fixes: mmotm ("mm/hotplug: prepare shrink_{zone, pgdat}_span for sub-section removal") Signed-off-by: Oscar Salvador <osalvador@suse.de> --- include/linux/memory_hotplug.h | 7 ++-- mm/memory_hotplug.c | 6 +++- mm/sparse.c | 77 +++++++++++++++++++++++++++++------------- 3 files changed, 62 insertions(+), 28 deletions(-)