Message ID | 20220304015941.1704249-1-eric.dumazet@gmail.com (mailing list archive) |
---|---|
State | New |
Series | mm/page_alloc: call check_new_pages() while zone spinlock is not held |
On Thu, Mar 3, 2022 at 5:59 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> From: Eric Dumazet <edumazet@google.com>
>
> For high order pages not using pcp, rmqueue() is currently calling
> the costly check_new_pages() while zone spinlock is held.
>
> This is not needed, we can release the spinlock sooner to reduce
> zone spinlock contention.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Mel Gorman <mgorman@techsingularity.net>
> Cc: Vlastimil Babka <vbabka@suse.cz>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Shakeel Butt <shakeelb@google.com>
> Cc: Wei Xu <weixugc@google.com>
> Cc: Greg Thelen <gthelen@google.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: David Rientjes <rientjes@google.com>
> ---
>  mm/page_alloc.c | 17 +++++++----------
>  1 file changed, 7 insertions(+), 10 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3589febc6d31928f850ebe5a4015ddc40e0469f3..0890a65f8cc2259e82bc1f5ba95a592fb30f9fb8 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3685,7 +3685,6 @@ struct page *rmqueue(struct zone *preferred_zone,
>                         gfp_t gfp_flags, unsigned int alloc_flags,
>                         int migratetype)
>  {
> -       unsigned long flags;
>         struct page *page;
>
>         if (likely(pcp_allowed_order(order))) {
> @@ -3706,10 +3705,12 @@ struct page *rmqueue(struct zone *preferred_zone,
>          * allocate greater than order-1 page units with __GFP_NOFAIL.
>          */
>         WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
> -       spin_lock_irqsave(&zone->lock, flags);
>
>         do {
> +               unsigned long flags;
> +
>                 page = NULL;
> +               spin_lock_irqsave(&zone->lock, flags);
>                 /*
>                  * order-0 request can reach here when the pcplist is skipped
>                  * due to non-CMA allocation context. HIGHATOMIC area is
> @@ -3723,13 +3724,13 @@ struct page *rmqueue(struct zone *preferred_zone,
>                 }
>                 if (!page)
>                         page = __rmqueue(zone, order, migratetype, alloc_flags);
> -       } while (page && check_new_pages(page, order));
> -       if (!page)
> -               goto failed;
> +               spin_unlock_irqrestore(&zone->lock, flags);
> +               if (!page)
> +                       return NULL;
> +       } while (check_new_pages(page, order));
>

Oh well, it seems hard irqs have to be disabled when calling the
following function.

I will send a V2.

>         __mod_zone_freepage_state(zone, -(1 << order),
>                                   get_pcppage_migratetype(page));
> -       spin_unlock_irqrestore(&zone->lock, flags);
>
>         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
>         zone_statistics(preferred_zone, zone, 1);
> @@ -3743,10 +3744,6 @@ struct page *rmqueue(struct zone *preferred_zone,
>
>         VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
>         return page;
> -
> -failed:
> -       spin_unlock_irqrestore(&zone->lock, flags);
> -       return NULL;
>  }
>
>  #ifdef CONFIG_FAIL_PAGE_ALLOC
> --
> 2.35.1.616.g0bdcbb4464-goog
>
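For context on the constraint Eric hit: __mod_zone_freepage_state() boils down to __mod_zone_page_state(), the non-atomic flavour of the vmstat accounting, which does a plain read-modify-write on a per-CPU counter. If a hard interrupt fires between the read and the write and touches the same counter, an update is lost. A simplified sketch of the helpers involved, paraphrased from include/linux/vmstat.h and mm/vmstat.c of that era rather than quoted verbatim:

/*
 * Sketch: why hard IRQs must be off around __mod_zone_freepage_state().
 * It funnels into __mod_zone_page_state(), whose per-CPU delta update
 * is a plain, non-atomic read-modify-write.
 */
static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
                                             int migratetype)
{
        __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
        if (is_migrate_cma(migratetype))
                __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
}

void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
                           long delta)
{
        struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
        s8 __percpu *p = pcp->vm_stat_diff + item;
        long x, t;

        x = delta + __this_cpu_read(*p);        /* read... */
        t = __this_cpu_read(pcp->stat_threshold);
        if (unlikely(abs(x) > t)) {
                /* fold the per-CPU delta into the global zone counter */
                zone_page_state_add(x, zone, item);
                x = 0;
        }
        __this_cpu_write(*p, x);                /* ...modify, write */
}

An interrupt handler freeing pages between the __this_cpu_read() and __this_cpu_write() would have its counter update silently overwritten, which is why the non-underscore-prefixed variants, or IRQs-off sections, are required.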
On 3/4/22 02:59, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> For high order pages not using pcp, rmqueue() is currently calling
> the costly check_new_pages() while zone spinlock is held.
>
> This is not needed, we can release the spinlock sooner to reduce
> zone spinlock contention.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Mel Gorman <mgorman@techsingularity.net>
> Cc: Vlastimil Babka <vbabka@suse.cz>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Shakeel Butt <shakeelb@google.com>
> Cc: Wei Xu <weixugc@google.com>
> Cc: Greg Thelen <gthelen@google.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: David Rientjes <rientjes@google.com>
> ---
>  mm/page_alloc.c | 17 +++++++----------
>  1 file changed, 7 insertions(+), 10 deletions(-)

Just a quick reply on what I immediately see as an issue.

>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3589febc6d31928f850ebe5a4015ddc40e0469f3..0890a65f8cc2259e82bc1f5ba95a592fb30f9fb8 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3685,7 +3685,6 @@ struct page *rmqueue(struct zone *preferred_zone,
>                         gfp_t gfp_flags, unsigned int alloc_flags,
>                         int migratetype)
>  {
> -       unsigned long flags;
>         struct page *page;
>
>         if (likely(pcp_allowed_order(order))) {
> @@ -3706,10 +3705,12 @@ struct page *rmqueue(struct zone *preferred_zone,
>          * allocate greater than order-1 page units with __GFP_NOFAIL.
>          */
>         WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
> -       spin_lock_irqsave(&zone->lock, flags);
>
>         do {
> +               unsigned long flags;
> +
>                 page = NULL;
> +               spin_lock_irqsave(&zone->lock, flags);
>                 /*
>                  * order-0 request can reach here when the pcplist is skipped
>                  * due to non-CMA allocation context. HIGHATOMIC area is
> @@ -3723,13 +3724,13 @@ struct page *rmqueue(struct zone *preferred_zone,
>                 }
>                 if (!page)
>                         page = __rmqueue(zone, order, migratetype, alloc_flags);
> -       } while (page && check_new_pages(page, order));
> -       if (!page)
> -               goto failed;
> +               spin_unlock_irqrestore(&zone->lock, flags);
> +               if (!page)
> +                       return NULL;
> +       } while (check_new_pages(page, order));
>
>         __mod_zone_freepage_state(zone, -(1 << order),
>                                   get_pcppage_migratetype(page));
> -       spin_unlock_irqrestore(&zone->lock, flags);

__mod_zone_freepage_state() outside the lock is AFAIK unsafe.

>
>         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
>         zone_statistics(preferred_zone, zone, 1);
> @@ -3743,10 +3744,6 @@ struct page *rmqueue(struct zone *preferred_zone,
>
>         VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
>         return page;
> -
> -failed:
> -       spin_unlock_irqrestore(&zone->lock, flags);
> -       return NULL;
>  }
>
>  #ifdef CONFIG_FAIL_PAGE_ALLOC
On 3/4/22 03:09, Eric Dumazet wrote:
> On Thu, Mar 3, 2022 at 5:59 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>
>> From: Eric Dumazet <edumazet@google.com>
>>
>> For high order pages not using pcp, rmqueue() is currently calling
>> the costly check_new_pages() while zone spinlock is held.
>>
>> This is not needed, we can release the spinlock sooner to reduce
>> zone spinlock contention.
>>
>> Signed-off-by: Eric Dumazet <edumazet@google.com>
>> Cc: Mel Gorman <mgorman@techsingularity.net>
>> Cc: Vlastimil Babka <vbabka@suse.cz>
>> Cc: Michal Hocko <mhocko@kernel.org>
>> Cc: Shakeel Butt <shakeelb@google.com>
>> Cc: Wei Xu <weixugc@google.com>
>> Cc: Greg Thelen <gthelen@google.com>
>> Cc: Hugh Dickins <hughd@google.com>
>> Cc: David Rientjes <rientjes@google.com>
>> ---
>>  mm/page_alloc.c | 17 +++++++----------
>>  1 file changed, 7 insertions(+), 10 deletions(-)
>>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 3589febc6d31928f850ebe5a4015ddc40e0469f3..0890a65f8cc2259e82bc1f5ba95a592fb30f9fb8 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -3685,7 +3685,6 @@ struct page *rmqueue(struct zone *preferred_zone,
>>                         gfp_t gfp_flags, unsigned int alloc_flags,
>>                         int migratetype)
>>  {
>> -       unsigned long flags;
>>         struct page *page;
>>
>>         if (likely(pcp_allowed_order(order))) {
>> @@ -3706,10 +3705,12 @@ struct page *rmqueue(struct zone *preferred_zone,
>>          * allocate greater than order-1 page units with __GFP_NOFAIL.
>>          */
>>         WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
>> -       spin_lock_irqsave(&zone->lock, flags);
>>
>>         do {
>> +               unsigned long flags;
>> +
>>                 page = NULL;
>> +               spin_lock_irqsave(&zone->lock, flags);
>>                 /*
>>                  * order-0 request can reach here when the pcplist is skipped
>>                  * due to non-CMA allocation context. HIGHATOMIC area is
>> @@ -3723,13 +3724,13 @@ struct page *rmqueue(struct zone *preferred_zone,
>>                 }
>>                 if (!page)
>>                         page = __rmqueue(zone, order, migratetype, alloc_flags);
>> -       } while (page && check_new_pages(page, order));
>> -       if (!page)
>> -               goto failed;
>> +               spin_unlock_irqrestore(&zone->lock, flags);
>> +               if (!page)
>> +                       return NULL;
>> +       } while (check_new_pages(page, order));
>>
>
> Oh well, it seems hard irqs have to be disabled when calling the
> following function.

Ah, you found out already. Well maybe it could simply be moved inside the
loop under the locked section and always done when page != NULL? I mean if
check_new_pages() fails we just leak the problematic pages anyway so they
are no longer free to allocate anymore and we should not count them as such.

> I will send a V2.
>
>>         __mod_zone_freepage_state(zone, -(1 << order),
>>                                   get_pcppage_migratetype(page));
>> -       spin_unlock_irqrestore(&zone->lock, flags);
>>
>>         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
>>         zone_statistics(preferred_zone, zone, 1);
>> @@ -3743,10 +3744,6 @@ struct page *rmqueue(struct zone *preferred_zone,
>>
>>         VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
>>         return page;
>> -
>> -failed:
>> -       spin_unlock_irqrestore(&zone->lock, flags);
>> -       return NULL;
>>  }
>>
>>  #ifdef CONFIG_FAIL_PAGE_ALLOC
>> --
>> 2.35.1.616.g0bdcbb4464-goog
>>
On Fri, Mar 4, 2022 at 12:32 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
>
> Ah, you found out already. Well maybe it could simply be moved inside the
> loop under the locked section and always done when page != NULL? I mean if
> check_new_pages() fails we just leak the problematic pages anyway so they
> are no longer free to allocate anymore and we should not count them as such.
>

Yes, and I guess that calling __mod_zone_freepage_state() for the page
about to be leaked is just fine.

This means we can call check_new_pages() without holding zone spinlock
or hard irqs being masked.

I will test something like:

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3589febc6d31928f850ebe5a4015ddc40e0469f3..1804287c1b792b8aa0e964b17eb002b6b1115258 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3706,10 +3706,10 @@ struct page *rmqueue(struct zone *preferred_zone,
         * allocate greater than order-1 page units with __GFP_NOFAIL.
         */
        WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-       spin_lock_irqsave(&zone->lock, flags);

        do {
                page = NULL;
+               spin_lock_irqsave(&zone->lock, flags);
                /*
                 * order-0 request can reach here when the pcplist is skipped
                 * due to non-CMA allocation context. HIGHATOMIC area is
@@ -3721,15 +3721,15 @@ struct page *rmqueue(struct zone *preferred_zone,
                        if (page)
                                trace_mm_page_alloc_zone_locked(page, order, migratetype);
                }
-               if (!page)
+               if (!page) {
                        page = __rmqueue(zone, order, migratetype, alloc_flags);
-       } while (page && check_new_pages(page, order));
-       if (!page)
-               goto failed;
-
-       __mod_zone_freepage_state(zone, -(1 << order),
-                                 get_pcppage_migratetype(page));
-       spin_unlock_irqrestore(&zone->lock, flags);
+                       if (!page)
+                               goto failed;
+               }
+               __mod_zone_freepage_state(zone, -(1 << order),
+                                         get_pcppage_migratetype(page));
+               spin_unlock_irqrestore(&zone->lock, flags);
+       } while (check_new_pages(page, order));

        __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
        zone_statistics(preferred_zone, zone, 1);
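To make the proposed v2 easier to follow, here is roughly what the rmqueue() slow path looks like with that diff applied. The context lines (the HIGHATOMIC fallback and the failed: label semantics) are reconstructed from the surrounding 5.17-era mm/page_alloc.c, so read this as a sketch of the resulting control flow rather than the exact file contents:

        /* rmqueue() slow path with the v2 change applied (sketch) */
        WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));

        do {
                page = NULL;
                /* The lock is now taken per loop iteration... */
                spin_lock_irqsave(&zone->lock, flags);
                /*
                 * order-0 request can reach here when the pcplist is skipped
                 * due to non-CMA allocation context. HIGHATOMIC area is
                 * reserved for high-order atomic allocation, so order-0
                 * request should skip it.
                 */
                if (order > 0 && alloc_flags & ALLOC_HARDER) {
                        page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
                        if (page)
                                trace_mm_page_alloc_zone_locked(page, order, migratetype);
                }
                if (!page) {
                        page = __rmqueue(zone, order, migratetype, alloc_flags);
                        if (!page)
                                goto failed;    /* unlocks and returns NULL */
                }
                /*
                 * Freepage accounting stays under the lock with IRQs off.
                 * If check_new_pages() rejects the pages below they are
                 * leaked, so subtracting them from NR_FREE_PAGES here is
                 * still correct.
                 */
                __mod_zone_freepage_state(zone, -(1 << order),
                                          get_pcppage_migratetype(page));
                /* ...and dropped before the costly per-page checks. */
                spin_unlock_irqrestore(&zone->lock, flags);
        } while (check_new_pages(page, order));

Note the key difference from v1: the accounting update runs inside the locked, IRQs-off section, so only check_new_pages() moves out from under the lock.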
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3589febc6d31928f850ebe5a4015ddc40e0469f3..0890a65f8cc2259e82bc1f5ba95a592fb30f9fb8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3685,7 +3685,6 @@ struct page *rmqueue(struct zone *preferred_zone,
                        gfp_t gfp_flags, unsigned int alloc_flags,
                        int migratetype)
 {
-       unsigned long flags;
        struct page *page;

        if (likely(pcp_allowed_order(order))) {
@@ -3706,10 +3705,12 @@ struct page *rmqueue(struct zone *preferred_zone,
         * allocate greater than order-1 page units with __GFP_NOFAIL.
         */
        WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-       spin_lock_irqsave(&zone->lock, flags);

        do {
+               unsigned long flags;
+
                page = NULL;
+               spin_lock_irqsave(&zone->lock, flags);
                /*
                 * order-0 request can reach here when the pcplist is skipped
                 * due to non-CMA allocation context. HIGHATOMIC area is
@@ -3723,13 +3724,13 @@ struct page *rmqueue(struct zone *preferred_zone,
                }
                if (!page)
                        page = __rmqueue(zone, order, migratetype, alloc_flags);
-       } while (page && check_new_pages(page, order));
-       if (!page)
-               goto failed;
+               spin_unlock_irqrestore(&zone->lock, flags);
+               if (!page)
+                       return NULL;
+       } while (check_new_pages(page, order));

        __mod_zone_freepage_state(zone, -(1 << order),
                                  get_pcppage_migratetype(page));
-       spin_unlock_irqrestore(&zone->lock, flags);

        __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
        zone_statistics(preferred_zone, zone, 1);
@@ -3743,10 +3744,6 @@ struct page *rmqueue(struct zone *preferred_zone,

        VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
        return page;
-
-failed:
-       spin_unlock_irqrestore(&zone->lock, flags);
-       return NULL;
 }

 #ifdef CONFIG_FAIL_PAGE_ALLOC
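As background on why the commit message calls check_new_pages() costly: it sanity-checks each of the 2^order constituent pages, so an order-10 allocation means 1024 per-page checks, which before this patch all ran under zone->lock with hard IRQs disabled. A simplified sketch of the helper as it looked around 5.17 (not guaranteed verbatim):

/*
 * Walk each of the 2^order pages just taken off the free list and
 * verify it looks sane (flags, refcount, mapping, ...). Returns true
 * if a bad page is found, in which case the caller retries the
 * allocation and the bad pages are intentionally leaked.
 */
static bool check_new_pages(struct page *page, unsigned int order)
{
        int i;

        for (i = 0; i < (1 << order); i++) {
                struct page *p = page + i;

                if (unlikely(check_new_page(p)))
                        return true;
        }

        return false;
}

Moving this walk outside the zone->lock critical section is what shrinks the lock hold time for high-order, non-pcp allocations.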