Message ID | 20240521-mm-hotplug-sync-v1-2-6d53706c1ba8@google.com (mailing list archive) |
---|---|
State | New |
Series | Clean up hotplug zone data synchronization |
Hi Brendan,

On Tue, May 21, 2024 at 8:57 PM Brendan Jackman <jackmanb@google.com> wrote:
>
> These fields are written by memory hotplug under mem_hotplug_lock but
> read without any lock. It seems like reader code is robust against the
> value being stale or "from the future", but we also need to account
> for:
>
> 1. Load/store tearing (according to Linus[1], this really happens,
>    even when everything is aligned as you would hope).
>
> 2. Invented loads[2] - the compiler can spill and re-read these fields
>    ([2] calls this "invented loads") and assume that they have not
>    changed.
>
> Note we don't need READ_ONCE in paths that have the mem_hotplug_lock
> for write, but we still need WRITE_ONCE to prevent store-tearing.
>
[...]
> @@ -1077,7 +1081,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
>  	 */
>  	if (early_section(__pfn_to_section(page_to_pfn(page))))
>  		zone->present_early_pages += nr_pages;
> -	zone->present_pages += nr_pages;
> +	WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);

I'm not sure that using the WRITE_ONCE() wrapper would prevent load tearing
on 'zone->present_pages', but it's probably just me overthinking it :)

Thanks,
Lance

[...]
Hi Lance, thanks for taking a look.

On Wed, May 22, 2024 at 12:25:30PM +0800, Lance Yang wrote:
> Hi Brendan,
>
> On Tue, May 21, 2024 at 8:57 PM Brendan Jackman <jackmanb@google.com> wrote:
> > @@ -1077,7 +1081,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
> >  	 */
> >  	if (early_section(__pfn_to_section(page_to_pfn(page))))
> >  		zone->present_early_pages += nr_pages;
> > -	zone->present_pages += nr_pages;
> > +	WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);
>
> I'm not sure that using the WRITE_ONCE() wrapper would prevent load tearing
> on 'zone->present_pages', but it's probably just me overthinking it :)

Hmm.. this isn't for load-tearing, it's for store-tearing. I have a
feeling I might be missing your point here though, can you elaborate?

I have just noticed that the original "big bad optimizing compiler"
article[1] only says store-tearing has been observed in the wild when
the value being stored can be split into immediates (i.e. is
constant). But it doesn't really seem wise to rely on that. From what
I can tell from tools/memory-model/Documentation you are really out in
the wild with unmarked accesses.

[1] https://lwn.net/Articles/793253
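To illustrate the store-tearing case being discussed, here is a minimal kernel-style sketch (assuming a 64-bit build; the variable name and the constant are invented for illustration and are not taken from the patch):

#include <linux/compiler.h>     /* WRITE_ONCE() */

/* Stands in for a field like zone->spanned_pages. */
static unsigned long spanned;

static void writer_plain(void)
{
        /*
         * A plain store of a wide constant may legally be emitted as two
         * narrower immediate stores, so a lockless reader can observe a
         * half-updated value.
         */
        spanned = 0x123456789abcdef0UL;
}

static void writer_marked(void)
{
        /* Volatile access: the compiler must emit a single store. */
        WRITE_ONCE(spanned, 0x123456789abcdef0UL);
}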
On Tue, May 21, 2024 at 12:57:19PM +0000, Brendan Jackman wrote:
> These fields are written by memory hotplug under mem_hotplug_lock but
> read without any lock. It seems like reader code is robust against the
> value being stale or "from the future", but we also need to account
> for:
>
[...]
> Signed-off-by: Brendan Jackman <jackmanb@google.com>

Oh, from a quick look it seems cma_pages would need this too.
present_early_pages seems fine.

I'll wait a few days in case anyone points out this whole thing is
garbage, then check more carefully and send a v2.
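A rough sketch of what the cma_pages follow-up hinted at above might look like, based on the existing zone_cma_pages() helper (illustrative only, not the posted patch or the eventual v2):

/* Hypothetical follow-up, for illustration only. */
static inline unsigned long zone_cma_pages(struct zone *zone)
{
#ifdef CONFIG_CMA
        return READ_ONCE(zone->cma_pages);
#else
        return 0;
#endif
}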
On Wed, May 22, 2024 at 4:38 PM Brendan Jackman <jackmanb@google.com> wrote:
>
> Hi Lance, thanks for taking a look.
>
> On Wed, May 22, 2024 at 12:25:30PM +0800, Lance Yang wrote:
> > Hi Brendan,
> >
> > On Tue, May 21, 2024 at 8:57 PM Brendan Jackman <jackmanb@google.com> wrote:
> > > @@ -1077,7 +1081,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
> > >  	 */
> > >  	if (early_section(__pfn_to_section(page_to_pfn(page))))
> > >  		zone->present_early_pages += nr_pages;
> > > -	zone->present_pages += nr_pages;
> > > +	WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);
> >
> > I'm not sure that using the WRITE_ONCE() wrapper would prevent load tearing
> > on 'zone->present_pages', but it's probably just me overthinking it :)
>
> Hmm.. this isn't for load-tearing, it's for store-tearing. I have a
> feeling I might be missing your point here though, can you elaborate?

Sorry, my explanation wasn't clear :(

I'm a bit confused about whether
'WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);'
is equivalent to the following:

1 a = zone->present_pages + nr_pages;
2 WRITE_ONCE(zone->present_pages, a);

If so, is there any possibility of load tearing on
'zone->present_pages' in line 1?

> I have just noticed that the original "big bad optimizing compiler"
> article[1] only says store-tearing has been observed in the wild when
> the value being stored can be split into immediates (i.e. is
> constant). But it doesn't really seem wise to rely on that. From what
> I can tell from tools/memory-model/Documentation you are really out in
> the wild with unmarked accesses.
>
> [1] https://lwn.net/Articles/793253

Thanks for clarifying!
Lance
On Wed, May 22, 2024 at 05:20:08PM +0800, Lance Yang wrote:
> On Wed, May 22, 2024 at 4:38 PM Brendan Jackman <jackmanb@google.com> wrote:
> >
> > Hmm.. this isn't for load-tearing, it's for store-tearing. I have a
> > feeling I might be missing your point here though, can you elaborate?
>
> Sorry, my explanation wasn't clear :(
>
> I'm a bit confused about whether
> 'WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);'
> is equivalent to the following:
>
> 1 a = zone->present_pages + nr_pages;
> 2 WRITE_ONCE(zone->present_pages, a);
>
> If so, is there any possibility of load tearing on
> 'zone->present_pages' in line 1?

Ah gotcha, thanks for clarifying. Loads are protected by
mem_hotplug_lock here, so it's fine for them to get split up (because
the value can't change between loads). This is what I was referring to
in the bit of the commit message about not needing READ_ONCE.
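Spelling the answer out as code (a sketch with an invented function name, not the kernel source): the single WRITE_ONCE() statement in the patch behaves like the two-step version above, and the plain load in the first step is tolerable only because adjust_present_page_count() runs with mem_hotplug_lock held for write:

static void present_pages_update_sketch(struct zone *zone, long nr_pages)
{
        unsigned long tmp;

        /*
         * Plain load: the compiler may tear or re-issue it, but writers
         * hold mem_hotplug_lock, so the value cannot change between
         * loads.
         */
        tmp = zone->present_pages + nr_pages;

        /* Marked store: lockless readers never see a torn value. */
        WRITE_ONCE(zone->present_pages, tmp);
}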
On Wed, May 22, 2024 at 6:10 PM Brendan Jackman <jackmanb@google.com> wrote:
>
[...]
> Ah gotcha, thanks for clarifying. Loads are protected by
> mem_hotplug_lock here, so it's fine for them to get split up (because
> the value can't change between loads). This is what I was referring to
> in the bit of the commit message about not needing READ_ONCE.

I see, thanks again for clarifying!
Lance
On 21.05.24 14:57, Brendan Jackman wrote:
> These fields are written by memory hotplug under mem_hotplug_lock but
> read without any lock. It seems like reader code is robust against the
> value being stale or "from the future", but we also need to account
> for:
>
[...]
> +/* This is unstable unless you hold mem_hotplug_lock. */
>   static inline unsigned long zone_end_pfn(const struct zone *zone)
>   {
> -	return zone->zone_start_pfn + zone->spanned_pages;
> +	return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);

It's weird to apply that logic only to spanned_pages, whereby zone_start_pfn
can (and will) similarly change when onlining/offlining memory.
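To make the review comment concrete, a version of zone_end_pfn() that treats both fields the same might look like the sketch below (illustrative only, not the posted patch; note that marking both fields still does not make the pair atomic, it only prevents tearing and repeated loads of each field, so readers still have to tolerate a stale or inconsistent pair):

/* Hypothetical variant, illustrating the review comment. */
static inline unsigned long zone_end_pfn_unstable(const struct zone *zone)
{
        return READ_ONCE(zone->zone_start_pfn) +
               READ_ONCE(zone->spanned_pages);
}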
On Wed, May 22, 2024 at 04:05:12PM +0200, David Hildenbrand wrote:
> On 21.05.24 14:57, Brendan Jackman wrote:
> > +	return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);
>
> It's weird to apply that logic only to spanned_pages, whereby zone_start_pfn
> can (and will) similarly change when onlining/offlining memory.

Oh, yep. For some reason I had decided that zone_start_pfn was fixed
but that is (actually very obviously) not true!

Will take a closer look and extend v2 to cover that too, unless
someone finds a reason this whole patch is nonsense.

Thanks for the review.
On Wed, May 22, 2024 at 02:11:16PM +0000, Brendan Jackman wrote:
> On Wed, May 22, 2024 at 04:05:12PM +0200, David Hildenbrand wrote:
> > On 21.05.24 14:57, Brendan Jackman wrote:
> > > +	return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);
> >
> > It's weird to apply that logic only to spanned_pages, whereby zone_start_pfn
> > can (and will) similarly change when onlining/offlining memory.
>
> Oh, yep. For some reason I had decided that zone_start_pfn was fixed
> but that is (actually very obviously) not true!
>
> Will take a closer look and extend v2 to cover that too, unless
> someone finds a reason this whole patch is nonsense.
>
> Thanks for the review.

Hmm, so while poking around during spare moments this week I learned
that compaction.c also stores a bunch of data in struct zone that is
unsynchronized. It seems pretty unlikely that you can corrupt any
memory there (unless there's some race possible with
pfn_to_online_page, which is an orthogonal question), but it does seem
like if the compiler gets smart with us we could maybe have a
compaction run that takes quasi-forever or something weird like that.

It seems easy enough to just spam READ_ONCE/WRITE_ONCE everywhere
there too; this would remove that risk, make KCSAN happy and serve as
a kinda "this is unsynchronized, take care" comment. (There's also at
least one place where we could put data_race()).

On the other hand it's a bit verbose & visually ugly. Personally I
think it's a pretty minor downside, but does anyone feel differently?
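As a sketch of the difference between the two annotations being weighed here (illustrative usage only, not a proposed change to compaction.c; the function name is invented, though compact_cached_free_pfn is a real struct zone field):

static unsigned long compaction_marking_sketch(struct zone *zone)
{
        /*
         * A purely diagnostic read: data_race() tells KCSAN the race is
         * intentional and tolerated, without otherwise constraining the
         * compiler.
         */
        pr_debug("cached free pfn: %lu\n",
                 data_race(zone->compact_cached_free_pfn));

        /*
         * A read whose value the caller acts on: READ_ONCE() prevents
         * tearing and invented/repeated loads, and also documents the
         * intentional race.
         */
        return READ_ONCE(zone->compact_cached_free_pfn);
}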
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 194ef7fed9d6..bdb3be76d10c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1018,11 +1018,13 @@ static inline unsigned long zone_cma_pages(struct zone *zone)
 #endif
 }

+/* This is unstable unless you hold mem_hotplug_lock. */
 static inline unsigned long zone_end_pfn(const struct zone *zone)
 {
-	return zone->zone_start_pfn + zone->spanned_pages;
+	return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);
 }

+/* This is unstable unless you hold mem_hotplug_lock. */
 static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
 {
 	return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
@@ -1033,9 +1035,10 @@ static inline bool zone_is_initialized(struct zone *zone)
 	return zone->initialized;
 }

+/* This is unstable unless you hold mem_hotplug_lock. */
 static inline bool zone_is_empty(struct zone *zone)
 {
-	return zone->spanned_pages == 0;
+	return READ_ONCE(zone->spanned_pages) == 0;
 }

 #ifndef BUILD_VDSO32_64
@@ -1485,10 +1488,13 @@ static inline bool managed_zone(struct zone *zone)
 	return zone_managed_pages(zone);
 }

-/* Returns true if a zone has memory */
+/*
+ * Returns true if a zone has memory.
+ * This is unstable unless you hold mem_hotplug_lock.
+ */
 static inline bool populated_zone(struct zone *zone)
 {
-	return zone->present_pages;
+	return READ_ONCE(zone->present_pages);
 }

 #ifdef CONFIG_NUMA
diff --git a/mm/compaction.c b/mm/compaction.c
index e731d45befc7..b8066d1fdcf5 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2239,7 +2239,7 @@ static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
 {
 	unsigned long score;

-	score = zone->present_pages * fragmentation_score_zone(zone);
+	score = READ_ONCE(zone->present_pages) * fragmentation_score_zone(zone);
 	return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
 }

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 431b1f6753c0..71b5e3d314a2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -463,6 +463,8 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 	int nid = zone_to_nid(zone);

 	if (zone->zone_start_pfn == start_pfn) {
+		unsigned long old_end_pfn = zone_end_pfn(zone);
+
 		/*
 		 * If the section is smallest section in the zone, it need
 		 * shrink zone->zone_start_pfn and zone->zone_spanned_pages.
@@ -470,13 +472,13 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 		 * for shrinking zone.
 		 */
 		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
-						zone_end_pfn(zone));
+						old_end_pfn);
 		if (pfn) {
-			zone->spanned_pages = zone_end_pfn(zone) - pfn;
+			WRITE_ONCE(zone->spanned_pages, old_end_pfn - pfn);
 			zone->zone_start_pfn = pfn;
 		} else {
 			zone->zone_start_pfn = 0;
-			zone->spanned_pages = 0;
+			WRITE_ONCE(zone->spanned_pages, 0);
 		}
 	} else if (zone_end_pfn(zone) == end_pfn) {
 		/*
@@ -488,10 +490,11 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 		pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
 					       start_pfn);
 		if (pfn)
-			zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
+			WRITE_ONCE(zone->spanned_pages,
+				   pfn - zone->zone_start_pfn + 1);
 		else {
 			zone->zone_start_pfn = 0;
-			zone->spanned_pages = 0;
+			WRITE_ONCE(zone->spanned_pages, 0);
 		}
 	}
 }
@@ -710,7 +713,8 @@ static void __meminit resize_zone_range(struct zone *zone, unsigned long start_p
 	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
 		zone->zone_start_pfn = start_pfn;

-	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
+	WRITE_ONCE(zone->spanned_pages,
+		   max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn);
 }

 static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
@@ -795,7 +799,7 @@ static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
 					    struct zone *zone)
 {
 	if (zone_idx(zone) == ZONE_MOVABLE) {
-		stats->movable_pages += zone->present_pages;
+		stats->movable_pages += READ_ONCE(zone->present_pages);
 	} else {
 		stats->kernel_early_pages += zone->present_early_pages;
 #ifdef CONFIG_CMA
@@ -1077,7 +1081,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
 	 */
 	if (early_section(__pfn_to_section(page_to_pfn(page))))
 		zone->present_early_pages += nr_pages;
-	zone->present_pages += nr_pages;
+	WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);
 	zone->zone_pgdat->node_present_pages += nr_pages;

 	if (group && movable)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c725618aeb58..ec66f2eadb95 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1540,7 +1540,7 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
 	for (z = 0; z < MAX_NR_ZONES; z++) {
 		struct zone *zone = pgdat->node_zones + z;

-		zone->present_pages = 0;
+		WRITE_ONCE(zone->present_pages, 0);
 		zone_init_internals(zone, z, nid, 0);
 	}
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5116a2b9ea6e..1eb9000ec7d7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5728,7 +5728,7 @@ __meminit void zone_pcp_init(struct zone *zone)

 	if (populated_zone(zone))
 		pr_debug("  %s zone: %lu pages, LIFO batch:%u\n", zone->name,
-			 zone->present_pages, zone_batchsize(zone));
+			 READ_ONCE(zone->present_pages), zone_batchsize(zone));
 }

 void adjust_managed_page_count(struct page *page, long count)
diff --git a/mm/show_mem.c b/mm/show_mem.c
index bdb439551eef..667680a6107b 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -337,7 +337,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
 			K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
 			K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
 			K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
-			K(zone->present_pages),
+			K(READ_ONCE(zone->present_pages)),
 			K(zone_managed_pages(zone)),
 			K(zone_page_state(zone, NR_MLOCK)),
 			K(zone_page_state(zone, NR_BOUNCE)),
@@ -407,11 +407,11 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)

 	for_each_populated_zone(zone) {

-		total += zone->present_pages;
-		reserved += zone->present_pages - zone_managed_pages(zone);
+		total += READ_ONCE(zone->present_pages);
+		reserved += READ_ONCE(zone->present_pages) - zone_managed_pages(zone);

 		if (is_highmem(zone))
-			highmem += zone->present_pages;
+			highmem += READ_ONCE(zone->present_pages);
 	}

 	printk("%lu pages RAM\n", total);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8507c497218b..5a9c4b5768e5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1708,8 +1708,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),
-		   zone->spanned_pages,
-		   zone->present_pages,
+		   READ_ONCE(zone->spanned_pages),
+		   READ_ONCE(zone->present_pages),
 		   zone_managed_pages(zone),
 		   zone_cma_pages(zone));
These fields are written by memory hotplug under mem_hotplug_lock but
read without any lock. It seems like reader code is robust against the
value being stale or "from the future", but we also need to account
for:

1. Load/store tearing (according to Linus[1], this really happens,
   even when everything is aligned as you would hope).

2. Invented loads[2] - the compiler can spill and re-read these fields
   ([2] calls this "invented loads") and assume that they have not
   changed.

Note we don't need READ_ONCE in paths that have the mem_hotplug_lock
for write, but we still need WRITE_ONCE to prevent store-tearing.

[1] https://lore.kernel.org/all/CAHk-=wj2t+GK+DGQ7Xy6U7zMf72e7Jkxn4_-kGyfH3WFEoH+YQ@mail.gmail.com/T/#u
    As discovered via the original big-bad article[2]
[2] https://lwn.net/Articles/793253/

Signed-off-by: Brendan Jackman <jackmanb@google.com>
---
 include/linux/mmzone.h | 14 ++++++++++----
 mm/compaction.c        |  2 +-
 mm/memory_hotplug.c    | 20 ++++++++++++--------
 mm/mm_init.c           |  2 +-
 mm/page_alloc.c        |  2 +-
 mm/show_mem.c          |  8 ++++----
 mm/vmstat.c            |  4 ++--
 7 files changed, 31 insertions(+), 21 deletions(-)
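As an illustration of point 2 above, a sketch of the "invented load" hazard on an unmarked field (kernel-style pseudo-code with an invented function name, not part of the patch): the compiler may drop a local snapshot and re-read the field, so two uses of what looks like one value can disagree; READ_ONCE() pins a single load.

static void reader_sketch(struct zone *zone)
{
        unsigned long plain, marked;

        /*
         * Unmarked read: the compiler may spill and re-read
         * zone->present_pages instead of keeping this snapshot, so the
         * two uses of "plain" below can observe different values if
         * hotplug changes the field in between.
         */
        plain = zone->present_pages;
        if (plain)
                pr_info("%lu present pages\n", plain);

        /* Marked read: exactly one load, a stable local snapshot. */
        marked = READ_ONCE(zone->present_pages);
        if (marked)
                pr_info("%lu present pages\n", marked);
}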