
[5/6] mm/page_alloc: Protect PCP lists with a spinlock

Message ID: 20220420095906.27349-6-mgorman@techsingularity.net (mailing list archive)
State: New
Series: Drain remote per-cpu directly

Commit Message

Mel Gorman April 20, 2022, 9:59 a.m. UTC
Currently the PCP lists are protected by using local_lock_irqsave to
prevent migration and IRQ reentrancy, but this is inconvenient. Remote
draining of the lists is impossible, so a workqueue is required, and
every task allocation/free must disable and then re-enable interrupts,
which is expensive.

As preparation for dealing with both of those problems, protect the
lists with a spinlock. The IRQ-unsafe version of the lock is used
because IRQs are already disabled by local_lock_irqsave. spin_trylock
is used in preparation for a time when local_lock could be used instead
of local_lock_irqsave.

The per_cpu_pages still fits within the same number of cache lines after
this patch relative to before the series.

struct per_cpu_pages {
        spinlock_t                 lock;                 /*     0     4 */
        int                        count;                /*     4     4 */
        int                        high;                 /*     8     4 */
        int                        batch;                /*    12     4 */
        short int                  free_factor;          /*    16     2 */
        short int                  expire;               /*    18     2 */

        /* XXX 4 bytes hole, try to pack */

        struct list_head           lists[13];            /*    24   208 */

        /* size: 256, cachelines: 4, members: 7 */
        /* sum members: 228, holes: 1, sum holes: 4 */
        /* padding: 24 */
} __attribute__((__aligned__(64)));

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 include/linux/mmzone.h |   1 +
 mm/page_alloc.c        | 155 +++++++++++++++++++++++++++++++++++------
 2 files changed, 136 insertions(+), 20 deletions(-)
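
As an illustration only (not part of the patch), a minimal sketch of the
pattern the changelog describes: the per-CPU lookup is still stabilised by
local_lock_irqsave while pcp->lock protects the lists, and a failed
spin_trylock falls back to the buddy allocator. Declarations are omitted and
fallback_to_buddy() is a hypothetical helper; the real fallbacks in the patch
are rmqueue_buddy and free_one_page in the callers.

local_lock_irqsave(&pagesets.lock, flags);
pcp = this_cpu_ptr(zone->per_cpu_pageset);
if (spin_trylock(&pcp->lock)) {
	/* Safe to manipulate pcp->lists and pcp->count here. */
	spin_unlock(&pcp->lock);
} else {
	/* Lock held elsewhere (e.g. a remote drain): bypass the PCP lists. */
	fallback_to_buddy(zone);	/* hypothetical helper */
}
local_unlock_irqrestore(&pagesets.lock, flags);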

Comments

Hillf Danton April 20, 2022, 2:02 p.m. UTC | #1
On Wed, 20 Apr 2022 10:59:05 +0100 Mel Gorman wrote:
>  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
>  {
> -	unsigned long flags;
>  	int to_drain, batch;
>  
> -	local_lock_irqsave(&pagesets.lock, flags);
>  	batch = READ_ONCE(pcp->batch);
>  	to_drain = min(pcp->count, batch);
> -	if (to_drain > 0)
> +	if (to_drain > 0) {
> +		unsigned long flags;
> +
> +		/* free_pcppages_bulk expects IRQs disabled for zone->lock */
> +		local_irq_save(flags);
> +
> +		spin_lock(&pcp->lock);

Nit, spin_lock_irqsave() instead.

>  		free_pcppages_bulk(zone, to_drain, pcp, 0);
> -	local_unlock_irqrestore(&pagesets.lock, flags);
> +		spin_unlock(&pcp->lock);
> +
> +		local_irq_restore(flags);
> +	}
>  }
>  #endif

Hillf
Nicolas Saenz Julienne April 20, 2022, 2:35 p.m. UTC | #2
Hi Hillf,

On Wed, 2022-04-20 at 22:02 +0800, Hillf Danton wrote:
> On Wed, 20 Apr 2022 10:59:05 +0100 Mel Gorman wrote:
> >  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
> >  {
> > -	unsigned long flags;
> >  	int to_drain, batch;
> >  
> > -	local_lock_irqsave(&pagesets.lock, flags);
> >  	batch = READ_ONCE(pcp->batch);
> >  	to_drain = min(pcp->count, batch);
> > -	if (to_drain > 0)
> > +	if (to_drain > 0) {
> > +		unsigned long flags;
> > +
> > +		/* free_pcppages_bulk expects IRQs disabled for zone->lock */
> > +		local_irq_save(flags);
> > +
> > +		spin_lock(&pcp->lock);
> 
> Nit, spin_lock_irqsave() instead.

See cover letter's:

"This series is a partial series. Follow-on work would allow the local_irq_save
to be converted to a local_irq to avoid IRQs being disabled/enabled in most
cases. However, there are enough corner cases that it deserves a series on its
own separated by one kernel release and the priority right now is to avoid
interference of high priority tasks."

Regards,
Nicolas Saenz Julienne April 26, 2022, 4:42 p.m. UTC | #3
On Wed, 2022-04-20 at 10:59 +0100, Mel Gorman wrote:
> Currently the PCP lists are protected by using local_lock_irqsave to
> prevent migration and IRQ reentrancy, but this is inconvenient. Remote
> draining of the lists is impossible, so a workqueue is required, and
> every task allocation/free must disable and then re-enable interrupts,
> which is expensive.
> 
> As preparation for dealing with both of those problems, protect the
> lists with a spinlock. The IRQ-unsafe version of the lock is used
> because IRQs are already disabled by local_lock_irqsave. spin_trylock
> is used in preparation for a time when local_lock could be used instead
> of local_lock_irqsave.
> 
> The per_cpu_pages still fits within the same number of cache lines after
> this patch relative to before the series.
> 
> struct per_cpu_pages {
>         spinlock_t                 lock;                 /*     0     4 */
>         int                        count;                /*     4     4 */
>         int                        high;                 /*     8     4 */
>         int                        batch;                /*    12     4 */
>         short int                  free_factor;          /*    16     2 */
>         short int                  expire;               /*    18     2 */
> 
>         /* XXX 4 bytes hole, try to pack */
> 
>         struct list_head           lists[13];            /*    24   208 */
> 
>         /* size: 256, cachelines: 4, members: 7 */
>         /* sum members: 228, holes: 1, sum holes: 4 */
>         /* padding: 24 */
> } __attribute__((__aligned__(64)));
> 
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> ---
>  include/linux/mmzone.h |   1 +
>  mm/page_alloc.c        | 155 +++++++++++++++++++++++++++++++++++------
>  2 files changed, 136 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index abe530748de6..8b5757735428 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -385,6 +385,7 @@ enum zone_watermarks {
>  
>  /* Fields and list protected by pagesets local_lock in page_alloc.c */
>  struct per_cpu_pages {
> +	spinlock_t lock;	/* Protects lists field */
>  	int count;		/* number of pages in the list */
>  	int high;		/* high watermark, emptying needed */
>  	int batch;		/* chunk size for buddy add/remove */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index dc0fdeb3795c..813c84b67c65 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -132,6 +132,17 @@ static DEFINE_PER_CPU(struct pagesets, pagesets) __maybe_unused = {
>  	.lock = INIT_LOCAL_LOCK(lock),
>  };
>  
> +#ifdef CONFIG_SMP
> +/* On SMP, spin_trylock is sufficient protection */
> +#define pcp_trylock_prepare(flags)	do { } while (0)
> +#define pcp_trylock_finish(flag)	do { } while (0)
> +#else
> +
> +/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */

This is only needed on non-RT kernels. RT UP builds go through
rt_spin_trylock(), which behaves like its SMP counterpart.

> +#define pcp_trylock_prepare(flags)	local_irq_save(flags)
> +#define pcp_trylock_finish(flags)	local_irq_restore(flags)
> +#endif
> +
>  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
>  DEFINE_PER_CPU(int, numa_node);
>  EXPORT_PER_CPU_SYMBOL(numa_node);
> @@ -3082,15 +3093,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>   */
>  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
>  {
> -	unsigned long flags;
>  	int to_drain, batch;
>  
> -	local_lock_irqsave(&pagesets.lock, flags);
>  	batch = READ_ONCE(pcp->batch);
>  	to_drain = min(pcp->count, batch);
> -	if (to_drain > 0)
> +	if (to_drain > 0) {
> +		unsigned long flags;
> +
> +		/* free_pcppages_bulk expects IRQs disabled for zone->lock */
> +		local_irq_save(flags);

Why drop the local_lock? That approach is nicer for RT builds, and I don't
think it makes a difference from a non-RT perspective.

That said, IIUC, this will eventually disappear with subsequent patches, right?

> +
> +		spin_lock(&pcp->lock);
>  		free_pcppages_bulk(zone, to_drain, pcp, 0);
> -	local_unlock_irqrestore(&pagesets.lock, flags);
> +		spin_unlock(&pcp->lock);
> +
> +		local_irq_restore(flags);
> +	}
>  }
>  #endif
>  

[...]

> @@ -3668,9 +3748,30 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
>  			int migratetype,
>  			unsigned int alloc_flags,
>  			struct per_cpu_pages *pcp,
> -			struct list_head *list)
> +			struct list_head *list,
> +			bool locked)
>  {
>  	struct page *page;
> +	unsigned long __maybe_unused UP_flags;
> +
> +	/*
> +	 * spin_trylock is not necessary right now due to
> +	 * local_lock_irqsave and is a preparation step for
> +	 * a conversion to local_lock using the trylock to prevent
> +	 * IRQ re-entrancy. If pcp->lock cannot be acquired, the caller
> +	 * uses rmqueue_buddy.
> +	 *
> +	 * TODO: Convert local_lock_irqsave to local_lock. Care
> +	 * 	 is needed as the type of local_lock would need a
> +	 * 	 PREEMPT_RT version due to threaded IRQs.
> +	 */

I missed this in our previous conversation, but as far as RT is concerned
local_lock_irqsave() is the same as local_lock(), see in local_lock_internal.h:

#define __local_lock_irqsave(lock, flags)			\
	do {							\
		typecheck(unsigned long, flags);		\
		flags = 0;					\
		__local_lock(lock);				\
	} while (0)

Something similar happens with spin_lock_irqsave(), see in spinlock_rt.h:

#define spin_lock_irqsave(lock, flags)			 \
	do {						 \
		typecheck(unsigned long, flags);	 \
		flags = 0;				 \
		spin_lock(lock);			 \
	} while (0)

IIUC, RT will run these code paths in threaded IRQ context, where we can think
of RT spinlocks as mutexes (plus some extra priority inheritance magic).

Regards,
Vlastimil Babka April 26, 2022, 4:48 p.m. UTC | #4
On 4/26/22 18:42, Nicolas Saenz Julienne wrote:
> On Wed, 2022-04-20 at 10:59 +0100, Mel Gorman wrote:
>> @@ -3082,15 +3093,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>>   */
>>  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
>>  {
>> -	unsigned long flags;
>>  	int to_drain, batch;
>>  
>> -	local_lock_irqsave(&pagesets.lock, flags);
>>  	batch = READ_ONCE(pcp->batch);
>>  	to_drain = min(pcp->count, batch);
>> -	if (to_drain > 0)
>> +	if (to_drain > 0) {
>> +		unsigned long flags;
>> +
>> +		/* free_pcppages_bulk expects IRQs disabled for zone->lock */
>> +		local_irq_save(flags);
> 
> Why drop the local_lock? That approach is nicer for RT builds, and I don't
> think it makes a difference from a non-RT perspective.

I think the separate irq_disable+spin_lock here is actually broken on RT
config, as explained in Documentation/locking/locktypes.rst. pcp->lock would
have to be a raw_spin_lock.
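
For illustration, a minimal sketch of the two forms being compared here,
assuming the PREEMPT_RT spinlock_t semantics quoted later in the thread; the
spin_lock_irqsave() form is the one Mel's follow-up delta adopts. All calls
are existing kernel APIs, but this is a sketch of the constraint, not the
exact code.

/* As in the patch: invalid on PREEMPT_RT, where spinlock_t is a
 * sleeping lock and must not be taken with IRQs hard-disabled. */
local_irq_save(flags);
spin_lock(&pcp->lock);
free_pcppages_bulk(zone, to_drain, pcp, 0);
spin_unlock(&pcp->lock);
local_irq_restore(flags);

/* Works for both: on !RT it disables IRQs as free_pcppages_bulk expects
 * for zone->lock; on RT it maps to spin_lock() and flags are unused. */
spin_lock_irqsave(&pcp->lock, flags);
free_pcppages_bulk(zone, to_drain, pcp, 0);
spin_unlock_irqrestore(&pcp->lock, flags);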

> That said, IIUC, this will eventually disappear with subsequent patches, right?

So it wouldn't be mergeable even as a temporary step.

> 
>> +
>> +		spin_lock(&pcp->lock);
>>  		free_pcppages_bulk(zone, to_drain, pcp, 0);
>> -	local_unlock_irqrestore(&pagesets.lock, flags);
>> +		spin_unlock(&pcp->lock);
>> +
>> +		local_irq_restore(flags);
>> +	}
>>  }
>>  #endif
>>
Minchan Kim April 26, 2022, 7:24 p.m. UTC | #5
On Wed, Apr 20, 2022 at 10:59:05AM +0100, Mel Gorman wrote:
> Currently the PCP lists are protected by using local_lock_irqsave to
> prevent migration and IRQ reentrancy, but this is inconvenient. Remote
> draining of the lists is impossible, so a workqueue is required, and
> every task allocation/free must disable and then re-enable interrupts,
> which is expensive.
> 
> As preparation for dealing with both of those problems, protect the
> lists with a spinlock. The IRQ-unsafe version of the lock is used
> because IRQs are already disabled by local_lock_irqsave. spin_trylock
> is used in preparation for a time when local_lock could be used instead
> of local_lock_irqsave.
> 
> The per_cpu_pages still fits within the same number of cache lines after
> this patch relative to before the series.
> 
> struct per_cpu_pages {
>         spinlock_t                 lock;                 /*     0     4 */
>         int                        count;                /*     4     4 */
>         int                        high;                 /*     8     4 */
>         int                        batch;                /*    12     4 */
>         short int                  free_factor;          /*    16     2 */
>         short int                  expire;               /*    18     2 */
> 
>         /* XXX 4 bytes hole, try to pack */
> 
>         struct list_head           lists[13];            /*    24   208 */
> 
>         /* size: 256, cachelines: 4, members: 7 */
>         /* sum members: 228, holes: 1, sum holes: 4 */
>         /* padding: 24 */
> } __attribute__((__aligned__(64)));
> 
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> ---
>  include/linux/mmzone.h |   1 +
>  mm/page_alloc.c        | 155 +++++++++++++++++++++++++++++++++++------
>  2 files changed, 136 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index abe530748de6..8b5757735428 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -385,6 +385,7 @@ enum zone_watermarks {
>  
>  /* Fields and list protected by pagesets local_lock in page_alloc.c */
>  struct per_cpu_pages {
> +	spinlock_t lock;	/* Protects lists field */
>  	int count;		/* number of pages in the list */
>  	int high;		/* high watermark, emptying needed */
>  	int batch;		/* chunk size for buddy add/remove */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index dc0fdeb3795c..813c84b67c65 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -132,6 +132,17 @@ static DEFINE_PER_CPU(struct pagesets, pagesets) __maybe_unused = {
>  	.lock = INIT_LOCAL_LOCK(lock),
>  };
>  
> +#ifdef CONFIG_SMP
> +/* On SMP, spin_trylock is sufficient protection */
> +#define pcp_trylock_prepare(flags)	do { } while (0)
> +#define pcp_trylock_finish(flag)	do { } while (0)
> +#else
> +
> +/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
> +#define pcp_trylock_prepare(flags)	local_irq_save(flags)
> +#define pcp_trylock_finish(flags)	local_irq_restore(flags)
> +#endif
> +
>  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
>  DEFINE_PER_CPU(int, numa_node);
>  EXPORT_PER_CPU_SYMBOL(numa_node);
> @@ -3082,15 +3093,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>   */
>  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
>  {
> -	unsigned long flags;
>  	int to_drain, batch;
>  
> -	local_lock_irqsave(&pagesets.lock, flags);
>  	batch = READ_ONCE(pcp->batch);
>  	to_drain = min(pcp->count, batch);
> -	if (to_drain > 0)
> +	if (to_drain > 0) {
> +		unsigned long flags;
> +
> +		/* free_pcppages_bulk expects IRQs disabled for zone->lock */
> +		local_irq_save(flags);
> +
> +		spin_lock(&pcp->lock);
>  		free_pcppages_bulk(zone, to_drain, pcp, 0);
> -	local_unlock_irqrestore(&pagesets.lock, flags);
> +		spin_unlock(&pcp->lock);
> +
> +		local_irq_restore(flags);
> +	}
>  }
>  #endif
>  
> @@ -3103,16 +3121,21 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
>   */
>  static void drain_pages_zone(unsigned int cpu, struct zone *zone)
>  {
> -	unsigned long flags;
>  	struct per_cpu_pages *pcp;
>  
> -	local_lock_irqsave(&pagesets.lock, flags);
> -
>  	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
> -	if (pcp->count)
> +	if (pcp->count) {
> +		unsigned long flags;
> +
> +		/* free_pcppages_bulk expects IRQs disabled for zone->lock */
> +		local_irq_save(flags);
> +
> +		spin_lock(&pcp->lock);
>  		free_pcppages_bulk(zone, pcp->count, pcp, 0);
> +		spin_unlock(&pcp->lock);
>  
> -	local_unlock_irqrestore(&pagesets.lock, flags);
> +		local_irq_restore(flags);
> +	}
>  }
>  
>  /*
> @@ -3380,18 +3403,30 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
>  	return min(READ_ONCE(pcp->batch) << 2, high);
>  }
>  
> -static void free_unref_page_commit(struct page *page, int migratetype,
> -				   unsigned int order)
> +/* Returns true if the page was committed to the per-cpu list. */
> +static bool free_unref_page_commit(struct page *page, int migratetype,
> +				   unsigned int order, bool locked)
>  {
>  	struct zone *zone = page_zone(page);
>  	struct per_cpu_pages *pcp;
>  	int high;
>  	int pindex;
>  	bool free_high;
> +	unsigned long __maybe_unused UP_flags;
>  
>  	__count_vm_event(PGFREE);
>  	pcp = this_cpu_ptr(zone->per_cpu_pageset);
>  	pindex = order_to_pindex(migratetype, order);
> +
> +	if (!locked) {
> +		/* Protect against a parallel drain. */
> +		pcp_trylock_prepare(UP_flags);
> +		if (!spin_trylock(&pcp->lock)) {
> +			pcp_trylock_finish(UP_flags);
> +			return false;
> +		}
> +	}
> +
>  	list_add(&page->pcp_list, &pcp->lists[pindex]);
>  	pcp->count += 1 << order;
>  
> @@ -3409,6 +3444,13 @@ static void free_unref_page_commit(struct page *page, int migratetype,
>  
>  		free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
>  	}
> +
> +	if (!locked) {
> +		spin_unlock(&pcp->lock);
> +		pcp_trylock_finish(UP_flags);
> +	}
> +
> +	return true;
>  }
>  
>  /*
> @@ -3419,6 +3461,7 @@ void free_unref_page(struct page *page, unsigned int order)
>  	unsigned long flags;
>  	unsigned long pfn = page_to_pfn(page);
>  	int migratetype;
> +	bool freed_pcp = false;
>  
>  	if (!free_unref_page_prepare(page, pfn, order))
>  		return;
> @@ -3440,8 +3483,11 @@ void free_unref_page(struct page *page, unsigned int order)
>  	}
>  
>  	local_lock_irqsave(&pagesets.lock, flags);
> -	free_unref_page_commit(page, migratetype, order);
> +	freed_pcp = free_unref_page_commit(page, migratetype, order, false);
>  	local_unlock_irqrestore(&pagesets.lock, flags);
> +
> +	if (unlikely(!freed_pcp))
> +		free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
>  }
>  
>  /*
> @@ -3450,10 +3496,19 @@ void free_unref_page(struct page *page, unsigned int order)
>  void free_unref_page_list(struct list_head *list)
>  {
>  	struct page *page, *next;
> +	struct per_cpu_pages *pcp;
> +	struct zone *locked_zone;
>  	unsigned long flags;
>  	int batch_count = 0;
>  	int migratetype;
>  
> +	/*
> +	 * An empty list is possible. Check early so that the later
> +	 * lru_to_page() does not potentially read garbage.
> +	 */
> +	if (list_empty(list))
> +		return;
> +
>  	/* Prepare pages for freeing */
>  	list_for_each_entry_safe(page, next, list, lru) {
>  		unsigned long pfn = page_to_pfn(page);
> @@ -3474,8 +3529,26 @@ void free_unref_page_list(struct list_head *list)
>  		}
>  	}
>  
> +	VM_BUG_ON(in_hardirq());

You need to check list_empty() here again and bail out if the list is empty.

> +
>  	local_lock_irqsave(&pagesets.lock, flags);
> +
> +	page = lru_to_page(list);
Mel Gorman April 29, 2022, 9:05 a.m. UTC | #6
On Tue, Apr 26, 2022 at 12:24:56PM -0700, Minchan Kim wrote:
> > @@ -3450,10 +3496,19 @@ void free_unref_page(struct page *page, unsigned int order)
> >  void free_unref_page_list(struct list_head *list)
> >  {
> >  	struct page *page, *next;
> > +	struct per_cpu_pages *pcp;
> > +	struct zone *locked_zone;
> >  	unsigned long flags;
> >  	int batch_count = 0;
> >  	int migratetype;
> >  
> > +	/*
> > +	 * An empty list is possible. Check early so that the later
> > +	 * lru_to_page() does not potentially read garbage.
> > +	 */
> > +	if (list_empty(list))
> > +		return;
> > +
> >  	/* Prepare pages for freeing */
> >  	list_for_each_entry_safe(page, next, list, lru) {
> >  		unsigned long pfn = page_to_pfn(page);
> > @@ -3474,8 +3529,26 @@ void free_unref_page_list(struct list_head *list)
> >  		}
> >  	}
> >  
> > +	VM_BUG_ON(in_hardirq());
> 
> You need to check list_empty() here again and bail out if the list is empty.
> 

You're right, every page could have failed to prepare or been isolated.
Mel Gorman April 29, 2022, 9:13 a.m. UTC | #7
On Tue, Apr 26, 2022 at 06:42:43PM +0200, Nicolas Saenz Julienne wrote:
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index dc0fdeb3795c..813c84b67c65 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -132,6 +132,17 @@ static DEFINE_PER_CPU(struct pagesets, pagesets) __maybe_unused = {
> >  	.lock = INIT_LOCAL_LOCK(lock),
> >  };
> >  
> > +#ifdef CONFIG_SMP
> > +/* On SMP, spin_trylock is sufficient protection */
> > +#define pcp_trylock_prepare(flags)	do { } while (0)
> > +#define pcp_trylock_finish(flag)	do { } while (0)
> > +#else
> > +
> > +/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
> 
> This is only needed on non-RT kernels. RT UP builds go through
> rt_spin_trylock(), which behaves like its SMP counterpart.
> 

I missed that.

> > +#define pcp_trylock_prepare(flags)	local_irq_save(flags)
> > +#define pcp_trylock_finish(flags)	local_irq_restore(flags)
> > +#endif
> > +
> >  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
> >  DEFINE_PER_CPU(int, numa_node);
> >  EXPORT_PER_CPU_SYMBOL(numa_node);
> > @@ -3082,15 +3093,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
> >   */
> >  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
> >  {
> > -	unsigned long flags;
> >  	int to_drain, batch;
> >  
> > -	local_lock_irqsave(&pagesets.lock, flags);
> >  	batch = READ_ONCE(pcp->batch);
> >  	to_drain = min(pcp->count, batch);
> > -	if (to_drain > 0)
> > +	if (to_drain > 0) {
> > +		unsigned long flags;
> > +
> > +		/* free_pcppages_bulk expects IRQs disabled for zone->lock */
> > +		local_irq_save(flags);
> 
> Why drop the local_lock? That approach is nicer for RT builds, and I don't
> think it makes a difference from a non-RT perspective.
> 

The local_lock was dropped because local_lock stabilises the lookup of
the local pcp but, in this context, we are looking at a remote pcp and
local_lock does not protect that pcp reference. Keeping it would confuse
what exactly local_lock is protecting.
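
To make that concrete, a rough sketch of the two cases (illustrative only,
not the exact code; the remote form matches drain_pages_zone in the delta
below):

/* Local fast paths: local_lock pins this CPU, so the this_cpu_ptr()
 * lookup is stable for as long as the local_lock is held. */
local_lock_irqsave(&pagesets.lock, flags);
pcp = this_cpu_ptr(zone->per_cpu_pageset);
/* ... allocate/free against pcp->lists ... */
local_unlock_irqrestore(&pagesets.lock, flags);

/* Remote drain: this pcp belongs to another CPU, so this CPU's
 * local_lock says nothing about it; only pcp->lock protects the lists. */
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
spin_lock_irqsave(&pcp->lock, flags);
free_pcppages_bulk(zone, pcp->count, pcp, 0);
spin_unlock_irqrestore(&pcp->lock, flags);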

> That said, IIUC, this will eventually disappear with subsequent patches, right?
> 

It would have, but this patch is also wrong, as pointed out by Vlastimil.
So even if it was eventually correct, it would not be safe to add this
patch as-is, and spin_lock_irqsave is required.

> > +
> > +		spin_lock(&pcp->lock);
> >  		free_pcppages_bulk(zone, to_drain, pcp, 0);
> > -	local_unlock_irqrestore(&pagesets.lock, flags);
> > +		spin_unlock(&pcp->lock);
> > +
> > +		local_irq_restore(flags);
> > +	}
> >  }
> >  #endif
> >  
> 
> [...]
> 
> > @@ -3668,9 +3748,30 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
> >  			int migratetype,
> >  			unsigned int alloc_flags,
> >  			struct per_cpu_pages *pcp,
> > -			struct list_head *list)
> > +			struct list_head *list,
> > +			bool locked)
> >  {
> >  	struct page *page;
> > +	unsigned long __maybe_unused UP_flags;
> > +
> > +	/*
> > +	 * spin_trylock is not necessary right now due to
> > +	 * local_lock_irqsave and is a preparation step for
> > +	 * a conversion to local_lock using the trylock to prevent
> > +	 * IRQ re-entrancy. If pcp->lock cannot be acquired, the caller
> > +	 * uses rmqueue_buddy.
> > +	 *
> > +	 * TODO: Convert local_lock_irqsave to local_lock. Care
> > +	 * 	 is needed as the type of local_lock would need a
> > +	 * 	 PREEMPT_RT version due to threaded IRQs.
> > +	 */
> 
> I missed this in our previous conversation, but as far as RT is concerned
> local_lock_irqsave() is the same as local_lock(), see in local_lock_internal.h:
> 
> #define __local_lock_irqsave(lock, flags)			\
> 	do {							\
> 		typecheck(unsigned long, flags);		\
> 		flags = 0;					\
> 		__local_lock(lock);				\
> 	} while (0)
> 
> Something similar happens with spin_lock_irqsave(), see in spinlock_rt.h:
> 
> #define spin_lock_irqsave(lock, flags)			 \
> 	do {						 \
> 		typecheck(unsigned long, flags);	 \
> 		flags = 0;				 \
> 		spin_lock(lock);			 \
> 	} while (0)
> 
> IIUC, RT will run these code paths in threaded IRQ context, where we can think
> of RT spinlocks as mutexes (plus some extra priority inheritance magic).
> 

Ah, thanks for the explanation.

Based on your feedback, as well as Vlastimil's and Minchan's, the current
delta between this series and what I'm preparing for testing is:

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17d11eb0413e..4ac39d30ec8f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -132,8 +132,11 @@ static DEFINE_PER_CPU(struct pagesets, pagesets) __maybe_unused = {
 	.lock = INIT_LOCAL_LOCK(lock),
 };
 
-#ifdef CONFIG_SMP
-/* On SMP, spin_trylock is sufficient protection */
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * On SMP, spin_trylock is sufficient protection.
+ * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ */
 #define pcp_trylock_prepare(flags)	do { } while (0)
 #define pcp_trylock_finish(flag)	do { } while (0)
 #else
@@ -3091,14 +3094,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 	if (to_drain > 0) {
 		unsigned long flags;
 
-		/* free_pcppages_bulk expects IRQs disabled for zone->lock */
-		local_irq_save(flags);
-
-		spin_lock(&pcp->lock);
+		/*
+		 * free_pcppages_bulk expects IRQs disabled for zone->lock
+		 * so even though pcp->lock is not intended to be IRQ-safe,
+		 * it's needed in this context.
+		 */
+		spin_lock_irqsave(&pcp->lock, flags);
 		free_pcppages_bulk(zone, to_drain, pcp, 0);
-		spin_unlock(&pcp->lock);
-
-		local_irq_restore(flags);
+		spin_unlock_irqrestore(&pcp->lock, flags);
 	}
 }
 #endif
@@ -3114,14 +3117,10 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 	if (pcp->count) {
 		unsigned long flags;
 
-		/* free_pcppages_bulk expects IRQs disabled for zone->lock */
-		local_irq_save(flags);
-
-		spin_lock(&pcp->lock);
+		/* See drain_zone_pages on why this is disabling IRQs */
+		spin_lock_irqsave(&pcp->lock, flags);
 		free_pcppages_bulk(zone, pcp->count, pcp, 0);
-		spin_unlock(&pcp->lock);
-
-		local_irq_restore(flags);
+		spin_unlock_irqrestore(&pcp->lock, flags);
 	}
 }
 
@@ -3480,6 +3479,13 @@ void free_unref_page_list(struct list_head *list)
 		}
 	}
 
+	/*
+	 * Preparation could have drained the list due to failing to prepare
+	 * or all pages are being isolated.
+	 */
+	if (list_empty(list))
+		return;
+
 	VM_BUG_ON(in_hardirq());
 
 	local_lock_irqsave(&pagesets.lock, flags);
@@ -3515,6 +3521,9 @@ void free_unref_page_list(struct list_head *list)
 		 * allocator directly. This is expensive as the zone lock will
 		 * be acquired multiple times but if a drain is in progress
 		 * then an expensive operation is already taking place.
+		 *
+		 * TODO: Always false at the moment due to local_lock_irqsave
+		 *       and is preparation for converting to local_lock.
 		 */
 		if (unlikely(!free_unref_page_commit(page, migratetype, 0, true)))
 			free_one_page(page_zone(page), page, page_to_pfn(page), 0, migratetype, FPI_NONE);
@@ -3717,9 +3726,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 	 * IRQ re-entrancy. If pcp->lock cannot be acquired, the caller
 	 * uses rmqueue_buddy.
 	 *
-	 * TODO: Convert local_lock_irqsave to local_lock. Care
-	 * 	 is needed as the type of local_lock would need a
-	 * 	 PREEMPT_RT version due to threaded IRQs.
+	 * TODO: Convert local_lock_irqsave to local_lock.
 	 */
 	if (unlikely(!locked)) {
 		pcp_trylock_prepare(UP_flags);

Patch

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index abe530748de6..8b5757735428 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -385,6 +385,7 @@  enum zone_watermarks {
 
 /* Fields and list protected by pagesets local_lock in page_alloc.c */
 struct per_cpu_pages {
+	spinlock_t lock;	/* Protects lists field */
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dc0fdeb3795c..813c84b67c65 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -132,6 +132,17 @@  static DEFINE_PER_CPU(struct pagesets, pagesets) __maybe_unused = {
 	.lock = INIT_LOCAL_LOCK(lock),
 };
 
+#ifdef CONFIG_SMP
+/* On SMP, spin_trylock is sufficient protection */
+#define pcp_trylock_prepare(flags)	do { } while (0)
+#define pcp_trylock_finish(flag)	do { } while (0)
+#else
+
+/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
+#define pcp_trylock_prepare(flags)	local_irq_save(flags)
+#define pcp_trylock_finish(flags)	local_irq_restore(flags)
+#endif
+
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -3082,15 +3093,22 @@  static int rmqueue_bulk(struct zone *zone, unsigned int order,
  */
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
-	unsigned long flags;
 	int to_drain, batch;
 
-	local_lock_irqsave(&pagesets.lock, flags);
 	batch = READ_ONCE(pcp->batch);
 	to_drain = min(pcp->count, batch);
-	if (to_drain > 0)
+	if (to_drain > 0) {
+		unsigned long flags;
+
+		/* free_pcppages_bulk expects IRQs disabled for zone->lock */
+		local_irq_save(flags);
+
+		spin_lock(&pcp->lock);
 		free_pcppages_bulk(zone, to_drain, pcp, 0);
-	local_unlock_irqrestore(&pagesets.lock, flags);
+		spin_unlock(&pcp->lock);
+
+		local_irq_restore(flags);
+	}
 }
 #endif
 
@@ -3103,16 +3121,21 @@  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
  */
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
-	unsigned long flags;
 	struct per_cpu_pages *pcp;
 
-	local_lock_irqsave(&pagesets.lock, flags);
-
 	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-	if (pcp->count)
+	if (pcp->count) {
+		unsigned long flags;
+
+		/* free_pcppages_bulk expects IRQs disabled for zone->lock */
+		local_irq_save(flags);
+
+		spin_lock(&pcp->lock);
 		free_pcppages_bulk(zone, pcp->count, pcp, 0);
+		spin_unlock(&pcp->lock);
 
-	local_unlock_irqrestore(&pagesets.lock, flags);
+		local_irq_restore(flags);
+	}
 }
 
 /*
@@ -3380,18 +3403,30 @@  static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 	return min(READ_ONCE(pcp->batch) << 2, high);
 }
 
-static void free_unref_page_commit(struct page *page, int migratetype,
-				   unsigned int order)
+/* Returns true if the page was committed to the per-cpu list. */
+static bool free_unref_page_commit(struct page *page, int migratetype,
+				   unsigned int order, bool locked)
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
 	int high;
 	int pindex;
 	bool free_high;
+	unsigned long __maybe_unused UP_flags;
 
 	__count_vm_event(PGFREE);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pindex = order_to_pindex(migratetype, order);
+
+	if (!locked) {
+		/* Protect against a parallel drain. */
+		pcp_trylock_prepare(UP_flags);
+		if (!spin_trylock(&pcp->lock)) {
+			pcp_trylock_finish(UP_flags);
+			return false;
+		}
+	}
+
 	list_add(&page->pcp_list, &pcp->lists[pindex]);
 	pcp->count += 1 << order;
 
@@ -3409,6 +3444,13 @@  static void free_unref_page_commit(struct page *page, int migratetype,
 
 		free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
 	}
+
+	if (!locked) {
+		spin_unlock(&pcp->lock);
+		pcp_trylock_finish(UP_flags);
+	}
+
+	return true;
 }
 
 /*
@@ -3419,6 +3461,7 @@  void free_unref_page(struct page *page, unsigned int order)
 	unsigned long flags;
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
+	bool freed_pcp = false;
 
 	if (!free_unref_page_prepare(page, pfn, order))
 		return;
@@ -3440,8 +3483,11 @@  void free_unref_page(struct page *page, unsigned int order)
 	}
 
 	local_lock_irqsave(&pagesets.lock, flags);
-	free_unref_page_commit(page, migratetype, order);
+	freed_pcp = free_unref_page_commit(page, migratetype, order, false);
 	local_unlock_irqrestore(&pagesets.lock, flags);
+
+	if (unlikely(!freed_pcp))
+		free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
 }
 
 /*
@@ -3450,10 +3496,19 @@  void free_unref_page(struct page *page, unsigned int order)
 void free_unref_page_list(struct list_head *list)
 {
 	struct page *page, *next;
+	struct per_cpu_pages *pcp;
+	struct zone *locked_zone;
 	unsigned long flags;
 	int batch_count = 0;
 	int migratetype;
 
+	/*
+	 * An empty list is possible. Check early so that the later
+	 * lru_to_page() does not potentially read garbage.
+	 */
+	if (list_empty(list))
+		return;
+
 	/* Prepare pages for freeing */
 	list_for_each_entry_safe(page, next, list, lru) {
 		unsigned long pfn = page_to_pfn(page);
@@ -3474,8 +3529,26 @@  void free_unref_page_list(struct list_head *list)
 		}
 	}
 
+	VM_BUG_ON(in_hardirq());
+
 	local_lock_irqsave(&pagesets.lock, flags);
+
+	page = lru_to_page(list);
+	locked_zone = page_zone(page);
+	pcp = this_cpu_ptr(locked_zone->per_cpu_pageset);
+	spin_lock(&pcp->lock);
+
 	list_for_each_entry_safe(page, next, list, lru) {
+		struct zone *zone = page_zone(page);
+
+		/* Different zone, different pcp lock. */
+		if (zone != locked_zone) {
+			spin_unlock(&pcp->lock);
+			locked_zone = zone;
+			pcp = this_cpu_ptr(zone->per_cpu_pageset);
+			spin_lock(&pcp->lock);
+		}
+
 		/*
 		 * Non-isolated types over MIGRATE_PCPTYPES get added
 		 * to the MIGRATE_MOVABLE pcp list.
@@ -3485,18 +3558,25 @@  void free_unref_page_list(struct list_head *list)
 			migratetype = MIGRATE_MOVABLE;
 
 		trace_mm_page_free_batched(page);
-		free_unref_page_commit(page, migratetype, 0);
+
+		/* True is dead code at the moment due to local_lock_irqsave. */
+		if (unlikely(!free_unref_page_commit(page, migratetype, 0, true)))
+			free_one_page(page_zone(page), page, page_to_pfn(page), 0, migratetype, FPI_NONE);
 
 		/*
 		 * Guard against excessive IRQ disabled times when we get
 		 * a large list of pages to free.
 		 */
 		if (++batch_count == SWAP_CLUSTER_MAX) {
+			spin_unlock(&pcp->lock);
 			local_unlock_irqrestore(&pagesets.lock, flags);
 			batch_count = 0;
 			local_lock_irqsave(&pagesets.lock, flags);
+			pcp = this_cpu_ptr(locked_zone->per_cpu_pageset);
+			spin_lock(&pcp->lock);
 		}
 	}
+	spin_unlock(&pcp->lock);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
@@ -3668,9 +3748,30 @@  struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 			int migratetype,
 			unsigned int alloc_flags,
 			struct per_cpu_pages *pcp,
-			struct list_head *list)
+			struct list_head *list,
+			bool locked)
 {
 	struct page *page;
+	unsigned long __maybe_unused UP_flags;
+
+	/*
+	 * spin_trylock is not necessary right now due to
+	 * local_lock_irqsave and is a preparation step for
+	 * a conversion to local_lock using the trylock to prevent
+	 * IRQ re-entrancy. If pcp->lock cannot be acquired, the caller
+	 * uses rmqueue_buddy.
+	 *
+	 * TODO: Convert local_lock_irqsave to local_lock. Care
+	 * 	 is needed as the type of local_lock would need a
+	 * 	 PREEMPT_RT version due to threaded IRQs.
+	 */
+	if (unlikely(!locked)) {
+		pcp_trylock_prepare(UP_flags);
+		if (!spin_trylock(&pcp->lock)) {
+			pcp_trylock_finish(UP_flags);
+			return NULL;
+		}
+	}
 
 	do {
 		if (list_empty(list)) {
@@ -3691,8 +3792,10 @@  struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 					migratetype, alloc_flags);
 
 			pcp->count += alloced << order;
-			if (unlikely(list_empty(list)))
-				return NULL;
+			if (unlikely(list_empty(list))) {
+				page = NULL;
+				goto out;
+			}
 		}
 
 		page = list_first_entry(list, struct page, lru);
@@ -3700,6 +3803,12 @@  struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 		pcp->count -= 1 << order;
 	} while (check_new_pcp(page, order));
 
+out:
+	if (!locked) {
+		spin_unlock(&pcp->lock);
+		pcp_trylock_finish(UP_flags);
+	}
+
 	return page;
 }
 
@@ -3724,7 +3833,7 @@  static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp->free_factor >>= 1;
 	list = &pcp->lists[order_to_pindex(migratetype, order)];
-	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
+	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list, false);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
@@ -3759,7 +3868,8 @@  struct page *rmqueue(struct zone *preferred_zone,
 				migratetype != MIGRATE_MOVABLE) {
 			page = rmqueue_pcplist(preferred_zone, zone, order,
 					gfp_flags, migratetype, alloc_flags);
-			goto out;
+			if (likely(page))
+				goto out;
 		}
 	}
 
@@ -5326,6 +5436,7 @@  unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 	local_lock_irqsave(&pagesets.lock, flags);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
+	spin_lock(&pcp->lock);
 
 	while (nr_populated < nr_pages) {
 
@@ -5336,11 +5447,13 @@  unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		}
 
 		page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
-								pcp, pcp_list);
+							pcp, pcp_list, true);
 		if (unlikely(!page)) {
 			/* Try and get at least one page */
-			if (!nr_populated)
+			if (!nr_populated) {
+				spin_unlock(&pcp->lock);
 				goto failed_irq;
+			}
 			break;
 		}
 		nr_account++;
@@ -5353,6 +5466,7 @@  unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		nr_populated++;
 	}
 
+	spin_unlock(&pcp->lock);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 
 	__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
@@ -6992,6 +7106,7 @@  static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
 	memset(pcp, 0, sizeof(*pcp));
 	memset(pzstats, 0, sizeof(*pzstats));
 
+	spin_lock_init(&pcp->lock);
 	for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
 		INIT_LIST_HEAD(&pcp->lists[pindex]);