
mm/page_alloc: try oom if reclaim is unable to make forward progress

Message ID 20210315165837.789593-1-atomlin@redhat.com (mailing list archive)
State New, archived
Series mm/page_alloc: try oom if reclaim is unable to make forward progress

Commit Message

Aaron Tomlin March 15, 2021, 4:58 p.m. UTC
In the situation where direct reclaim is required to make progress for
compaction, but no_progress_loops is already over the limit of
MAX_RECLAIM_RETRIES, consider invoking the oom killer.

Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
---
 mm/page_alloc.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)
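
For reference, here is a sketch of what the patch intends, reconstructed
from the posted diff with its syntax errors (flagged by the build reports
below) corrected; the names and logic are from the posted patch, the
cleanup is mine:

	static inline bool
	should_try_oom(int no_progress_loops,
			enum compact_result last_compact_result)
	{
		if (no_progress_loops > MAX_RECLAIM_RETRIES &&
		    last_compact_result == COMPACT_SKIPPED)
			return true;
		return false;
	}

	/* in should_reclaim_retry(): */
	if (*no_progress_loops > MAX_RECLAIM_RETRIES)
		return false;
	/* Last chance before OOM, try draining highatomic_reserve once */
	else if (*no_progress_loops == MAX_RECLAIM_RETRIES)
		return unreserve_highatomic_pageblock(ac, true);

	/* in __alloc_pages_slowpath(): */
	if (should_try_oom(no_progress_loops, compact_result))
		goto oom;	/* the posted diff reads "goto oom:", a typo */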

Comments

kernel test robot March 15, 2021, 7:54 p.m. UTC | #1
Hi Aaron,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on hnaz-linux-mm/master]

url:    https://github.com/0day-ci/linux/commits/Aaron-Tomlin/mm-page_alloc-try-oom-if-reclaim-is-unable-to-make-forward-progress/20210316-010203
base:   https://github.com/hnaz/linux-mm master
config: powerpc64-randconfig-r012-20210315 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project a28facba1ccdc957f386b7753f4958307f1bfde8)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install powerpc64 cross compiling tool for clang build
        # apt-get install binutils-powerpc64-linux-gnu
        # https://github.com/0day-ci/linux/commit/77338aaff2606a7715c832545e79370e849e3b4e
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Aaron-Tomlin/mm-page_alloc-try-oom-if-reclaim-is-unable-to-make-forward-progress/20210316-010203
        git checkout 77338aaff2606a7715c832545e79370e849e3b4e
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=powerpc64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   mm/page_alloc.c:2538:5: warning: no previous prototype for function 'find_suitable_fallback' [-Wmissing-prototypes]
   int find_suitable_fallback(struct free_area *area, unsigned int order,
       ^
   mm/page_alloc.c:2538:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   int find_suitable_fallback(struct free_area *area, unsigned int order,
   ^
   static 
>> mm/page_alloc.c:4444:3: error: use of undeclared identifier 'result'
                   result false;
                   ^
>> mm/page_alloc.c:4447:50: error: expected ';' after return statement
                   return unreserve_highatomic_pageblock(ac, true)
                                                                  ^
                                                                  ;
>> mm/page_alloc.c:4507:2: error: expected expression
           else
           ^
>> mm/page_alloc.c:4719:6: error: implicit declaration of function 'should_try_oom' [-Werror,-Wimplicit-function-declaration]
           if (should_try_oom(no_progress_loops, compact_result))
               ^
>> mm/page_alloc.c:4720:11: error: expected ';' after goto statement
                   goto oom:
                           ^
                           ;
   mm/page_alloc.c:6136:23: warning: no previous prototype for function 'memmap_init' [-Wmissing-prototypes]
   void __meminit __weak memmap_init(unsigned long size, int nid,
                         ^
   mm/page_alloc.c:6136:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void __meminit __weak memmap_init(unsigned long size, int nid,
   ^
   static 
   2 warnings and 5 errors generated.


vim +/result +4444 mm/page_alloc.c

  4409	
  4410	/*
  4411	 * Checks whether it makes sense to retry the reclaim to make a forward progress
  4412	 * for the given allocation request.
  4413	 *
  4414	 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
  4415	 * without success, or when we couldn't even meet the watermark if we
  4416	 * reclaimed all remaining pages on the LRU lists.
  4417	 *
  4418	 * Returns true if a retry is viable or false to enter the oom path.
  4419	 */
  4420	static inline bool
  4421	should_reclaim_retry(gfp_t gfp_mask, unsigned order,
  4422			     struct alloc_context *ac, int alloc_flags,
  4423			     bool did_some_progress, int *no_progress_loops)
  4424	{
  4425		struct zone *zone;
  4426		struct zoneref *z;
  4427		bool ret = false;
  4428	
  4429		/*
  4430		 * Costly allocations might have made a progress but this doesn't mean
  4431		 * their order will become available due to high fragmentation so
  4432		 * always increment the no progress counter for them
  4433		 */
  4434		if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
  4435			*no_progress_loops = 0;
  4436		else
  4437			(*no_progress_loops)++;
  4438	
  4439		/*
  4440		 * Make sure we converge to OOM if we cannot make any progress
  4441		 * several times in the row.
  4442		 */
  4443		if (*no_progress_loops > MAX_RECLAIM_RETRIES)
> 4444			result false;
  4445		/* Last chance before OOM, try draining highatomic_reserve once */
  4446		else if (*no_progress_loops == MAX_RECLAIM_RETRIES)
> 4447			return unreserve_highatomic_pageblock(ac, true)
  4448	
  4449		/*
  4450		 * Keep reclaiming pages while there is a chance this will lead
  4451		 * somewhere.  If none of the target zones can satisfy our allocation
  4452		 * request even if all reclaimable pages are considered then we are
  4453		 * screwed and have to go OOM.
  4454		 */
  4455		for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
  4456					ac->highest_zoneidx, ac->nodemask) {
  4457			unsigned long available;
  4458			unsigned long reclaimable;
  4459			unsigned long min_wmark = min_wmark_pages(zone);
  4460			bool wmark;
  4461	
  4462			available = reclaimable = zone_reclaimable_pages(zone);
  4463			available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
  4464	
  4465			/*
  4466			 * Would the allocation succeed if we reclaimed all
  4467			 * reclaimable pages?
  4468			 */
  4469			wmark = __zone_watermark_ok(zone, order, min_wmark,
  4470					ac->highest_zoneidx, alloc_flags, available);
  4471			trace_reclaim_retry_zone(z, order, reclaimable,
  4472					available, min_wmark, *no_progress_loops, wmark);
  4473			if (wmark) {
  4474				/*
  4475				 * If we didn't make any progress and have a lot of
  4476				 * dirty + writeback pages then we should wait for
  4477				 * an IO to complete to slow down the reclaim and
  4478				 * prevent from pre mature OOM
  4479				 */
  4480				if (!did_some_progress) {
  4481					unsigned long write_pending;
  4482	
  4483					write_pending = zone_page_state_snapshot(zone,
  4484								NR_ZONE_WRITE_PENDING);
  4485	
  4486					if (2 * write_pending > reclaimable) {
  4487						congestion_wait(BLK_RW_ASYNC, HZ/10);
  4488						return true;
  4489					}
  4490				}
  4491	
  4492				ret = true;
  4493				goto out;
  4494			}
  4495		}
  4496	
  4497	out:
  4498		/*
  4499		 * Memory allocation/reclaim might be called from a WQ context and the
  4500		 * current implementation of the WQ concurrency control doesn't
  4501		 * recognize that a particular WQ is congested if the worker thread is
  4502		 * looping without ever sleeping. Therefore we have to do a short sleep
  4503		 * here rather than calling cond_resched().
  4504		 */
  4505		if (current->flags & PF_WQ_WORKER)
  4506			schedule_timeout_uninterruptible(1);
> 4507		else
  4508			cond_resched();
  4509		return ret;
  4510	}
  4511	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot March 15, 2021, 7:54 p.m. UTC | #2
Hi Aaron,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on hnaz-linux-mm/master]

url:    https://github.com/0day-ci/linux/commits/Aaron-Tomlin/mm-page_alloc-try-oom-if-reclaim-is-unable-to-make-forward-progress/20210316-010203
base:   https://github.com/hnaz/linux-mm master
config: riscv-randconfig-r026-20210315 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project a28facba1ccdc957f386b7753f4958307f1bfde8)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install riscv cross compiling tool for clang build
        # apt-get install binutils-riscv64-linux-gnu
        # https://github.com/0day-ci/linux/commit/77338aaff2606a7715c832545e79370e849e3b4e
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Aaron-Tomlin/mm-page_alloc-try-oom-if-reclaim-is-unable-to-make-forward-progress/20210316-010203
        git checkout 77338aaff2606a7715c832545e79370e849e3b4e
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=riscv 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from mm/page_alloc.c:20:
   In file included from include/linux/highmem.h:10:
   In file included from include/linux/hardirq.h:10:
   In file included from ./arch/riscv/include/generated/asm/hardirq.h:1:
   In file included from include/asm-generic/hardirq.h:13:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/riscv/include/asm/io.h:148:
   include/asm-generic/io.h:572:9: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           return inl(addr);
                  ^~~~~~~~~
   arch/riscv/include/asm/io.h:56:76: note: expanded from macro 'inl'
   #define inl(c)          ({ u32 __v; __io_pbr(); __v = readl_cpu((void*)(PCI_IOBASE + (c))); __io_par(__v); __v; })
                                                                           ~~~~~~~~~~ ^
   arch/riscv/include/asm/mmio.h:89:76: note: expanded from macro 'readl_cpu'
   #define readl_cpu(c)            ({ u32 __r = le32_to_cpu((__force __le32)__raw_readl(c)); __r; })
                                                                                        ^
   include/uapi/linux/byteorder/little_endian.h:34:51: note: expanded from macro '__le32_to_cpu'
   #define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
                                                     ^
   In file included from mm/page_alloc.c:20:
   In file included from include/linux/highmem.h:10:
   In file included from include/linux/hardirq.h:10:
   In file included from ./arch/riscv/include/generated/asm/hardirq.h:1:
   In file included from include/asm-generic/hardirq.h:13:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/riscv/include/asm/io.h:148:
   include/asm-generic/io.h:580:2: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           outb(value, addr);
           ^~~~~~~~~~~~~~~~~
   arch/riscv/include/asm/io.h:58:68: note: expanded from macro 'outb'
   #define outb(v,c)       ({ __io_pbw(); writeb_cpu((v),(void*)(PCI_IOBASE + (c))); __io_paw(); })
                                                                 ~~~~~~~~~~ ^
   arch/riscv/include/asm/mmio.h:91:52: note: expanded from macro 'writeb_cpu'
   #define writeb_cpu(v, c)        ((void)__raw_writeb((v), (c)))
                                                             ^
   In file included from mm/page_alloc.c:20:
   In file included from include/linux/highmem.h:10:
   In file included from include/linux/hardirq.h:10:
   In file included from ./arch/riscv/include/generated/asm/hardirq.h:1:
   In file included from include/asm-generic/hardirq.h:13:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/riscv/include/asm/io.h:148:
   include/asm-generic/io.h:588:2: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           outw(value, addr);
           ^~~~~~~~~~~~~~~~~
   arch/riscv/include/asm/io.h:59:68: note: expanded from macro 'outw'
   #define outw(v,c)       ({ __io_pbw(); writew_cpu((v),(void*)(PCI_IOBASE + (c))); __io_paw(); })
                                                                 ~~~~~~~~~~ ^
   arch/riscv/include/asm/mmio.h:92:76: note: expanded from macro 'writew_cpu'
   #define writew_cpu(v, c)        ((void)__raw_writew((__force u16)cpu_to_le16(v), (c)))
                                                                                     ^
   In file included from mm/page_alloc.c:20:
   In file included from include/linux/highmem.h:10:
   In file included from include/linux/hardirq.h:10:
   In file included from ./arch/riscv/include/generated/asm/hardirq.h:1:
   In file included from include/asm-generic/hardirq.h:13:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/riscv/include/asm/io.h:148:
   include/asm-generic/io.h:596:2: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           outl(value, addr);
           ^~~~~~~~~~~~~~~~~
   arch/riscv/include/asm/io.h:60:68: note: expanded from macro 'outl'
   #define outl(v,c)       ({ __io_pbw(); writel_cpu((v),(void*)(PCI_IOBASE + (c))); __io_paw(); })
                                                                 ~~~~~~~~~~ ^
   arch/riscv/include/asm/mmio.h:93:76: note: expanded from macro 'writel_cpu'
   #define writel_cpu(v, c)        ((void)__raw_writel((__force u32)cpu_to_le32(v), (c)))
                                                                                     ^
   In file included from mm/page_alloc.c:20:
   In file included from include/linux/highmem.h:10:
   In file included from include/linux/hardirq.h:10:
   In file included from ./arch/riscv/include/generated/asm/hardirq.h:1:
   In file included from include/asm-generic/hardirq.h:13:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/riscv/include/asm/io.h:148:
   include/asm-generic/io.h:1017:55: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           return (port > MMIO_UPPER_LIMIT) ? NULL : PCI_IOBASE + port;
                                                     ~~~~~~~~~~ ^
   mm/page_alloc.c:2538:5: warning: no previous prototype for function 'find_suitable_fallback' [-Wmissing-prototypes]
   int find_suitable_fallback(struct free_area *area, unsigned int order,
       ^
   mm/page_alloc.c:2538:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   int find_suitable_fallback(struct free_area *area, unsigned int order,
   ^
   static 
   mm/page_alloc.c:4444:3: error: use of undeclared identifier 'result'
                   result false;
                   ^
   mm/page_alloc.c:4447:50: error: expected ';' after return statement
                   return unreserve_highatomic_pageblock(ac, true)
                                                                  ^
                                                                  ;
   mm/page_alloc.c:4507:2: error: expected expression
           else
           ^
>> mm/page_alloc.c:4719:6: error: implicit declaration of function 'should_try_oom' [-Werror,-Wimplicit-function-declaration]
           if (should_try_oom(no_progress_loops, compact_result))
               ^
   mm/page_alloc.c:4720:11: error: expected ';' after goto statement
                   goto oom:
                           ^
                           ;
   mm/page_alloc.c:6136:23: warning: no previous prototype for function 'memmap_init' [-Wmissing-prototypes]
   void __meminit __weak memmap_init(unsigned long size, int nid,
                         ^
   mm/page_alloc.c:6136:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void __meminit __weak memmap_init(unsigned long size, int nid,
   ^
   static 
   9 warnings and 5 errors generated.


vim +/should_try_oom +4719 mm/page_alloc.c

  4544	
  4545	static inline struct page *
  4546	__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
  4547							struct alloc_context *ac)
  4548	{
  4549		bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
  4550		const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
  4551		struct page *page = NULL;
  4552		unsigned int alloc_flags;
  4553		unsigned long did_some_progress;
  4554		enum compact_priority compact_priority;
  4555		enum compact_result compact_result;
  4556		int compaction_retries;
  4557		int no_progress_loops;
  4558		unsigned int cpuset_mems_cookie;
  4559		int reserve_flags;
  4560	
  4561		/*
  4562		 * We also sanity check to catch abuse of atomic reserves being used by
  4563		 * callers that are not in atomic context.
  4564		 */
  4565		if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
  4566					(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
  4567			gfp_mask &= ~__GFP_ATOMIC;
  4568	
  4569	retry_cpuset:
  4570		compaction_retries = 0;
  4571		no_progress_loops = 0;
  4572		compact_priority = DEF_COMPACT_PRIORITY;
  4573		cpuset_mems_cookie = read_mems_allowed_begin();
  4574	
  4575		/*
  4576		 * The fast path uses conservative alloc_flags to succeed only until
  4577		 * kswapd needs to be woken up, and to avoid the cost of setting up
  4578		 * alloc_flags precisely. So we do that now.
  4579		 */
  4580		alloc_flags = gfp_to_alloc_flags(gfp_mask);
  4581	
  4582		/*
  4583		 * We need to recalculate the starting point for the zonelist iterator
  4584		 * because we might have used different nodemask in the fast path, or
  4585		 * there was a cpuset modification and we are retrying - otherwise we
  4586		 * could end up iterating over non-eligible zones endlessly.
  4587		 */
  4588		ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
  4589						ac->highest_zoneidx, ac->nodemask);
  4590		if (!ac->preferred_zoneref->zone)
  4591			goto nopage;
  4592	
  4593		if (alloc_flags & ALLOC_KSWAPD)
  4594			wake_all_kswapds(order, gfp_mask, ac);
  4595	
  4596		/*
  4597		 * The adjusted alloc_flags might result in immediate success, so try
  4598		 * that first
  4599		 */
  4600		page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
  4601		if (page)
  4602			goto got_pg;
  4603	
  4604		/*
  4605		 * For costly allocations, try direct compaction first, as it's likely
  4606		 * that we have enough base pages and don't need to reclaim. For non-
  4607		 * movable high-order allocations, do that as well, as compaction will
  4608		 * try prevent permanent fragmentation by migrating from blocks of the
  4609		 * same migratetype.
  4610		 * Don't try this for allocations that are allowed to ignore
  4611		 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
  4612		 */
  4613		if (can_direct_reclaim &&
  4614				(costly_order ||
  4615				   (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
  4616				&& !gfp_pfmemalloc_allowed(gfp_mask)) {
  4617			page = __alloc_pages_direct_compact(gfp_mask, order,
  4618							alloc_flags, ac,
  4619							INIT_COMPACT_PRIORITY,
  4620							&compact_result);
  4621			if (page)
  4622				goto got_pg;
  4623	
  4624			/*
  4625			 * Checks for costly allocations with __GFP_NORETRY, which
  4626			 * includes some THP page fault allocations
  4627			 */
  4628			if (costly_order && (gfp_mask & __GFP_NORETRY)) {
  4629				/*
  4630				 * If allocating entire pageblock(s) and compaction
  4631				 * failed because all zones are below low watermarks
  4632				 * or is prohibited because it recently failed at this
  4633				 * order, fail immediately unless the allocator has
  4634				 * requested compaction and reclaim retry.
  4635				 *
  4636				 * Reclaim is
  4637				 *  - potentially very expensive because zones are far
  4638				 *    below their low watermarks or this is part of very
  4639				 *    bursty high order allocations,
  4640				 *  - not guaranteed to help because isolate_freepages()
  4641				 *    may not iterate over freed pages as part of its
  4642				 *    linear scan, and
  4643				 *  - unlikely to make entire pageblocks free on its
  4644				 *    own.
  4645				 */
  4646				if (compact_result == COMPACT_SKIPPED ||
  4647				    compact_result == COMPACT_DEFERRED)
  4648					goto nopage;
  4649	
  4650				/*
  4651				 * Looks like reclaim/compaction is worth trying, but
  4652				 * sync compaction could be very expensive, so keep
  4653				 * using async compaction.
  4654				 */
  4655				compact_priority = INIT_COMPACT_PRIORITY;
  4656			}
  4657		}
  4658	
  4659	retry:
  4660		/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
  4661		if (alloc_flags & ALLOC_KSWAPD)
  4662			wake_all_kswapds(order, gfp_mask, ac);
  4663	
  4664		reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
  4665		if (reserve_flags)
  4666			alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
  4667	
  4668		/*
  4669		 * Reset the nodemask and zonelist iterators if memory policies can be
  4670		 * ignored. These allocations are high priority and system rather than
  4671		 * user oriented.
  4672		 */
  4673		if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
  4674			ac->nodemask = NULL;
  4675			ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
  4676						ac->highest_zoneidx, ac->nodemask);
  4677		}
  4678	
  4679		/* Attempt with potentially adjusted zonelist and alloc_flags */
  4680		page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
  4681		if (page)
  4682			goto got_pg;
  4683	
  4684		/* Caller is not willing to reclaim, we can't balance anything */
  4685		if (!can_direct_reclaim)
  4686			goto nopage;
  4687	
  4688		/* Avoid recursion of direct reclaim */
  4689		if (current->flags & PF_MEMALLOC)
  4690			goto nopage;
  4691	
  4692		/* Try direct reclaim and then allocating */
  4693		page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
  4694								&did_some_progress);
  4695		if (page)
  4696			goto got_pg;
  4697	
  4698		/* Try direct compaction and then allocating */
  4699		page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
  4700						compact_priority, &compact_result);
  4701		if (page)
  4702			goto got_pg;
  4703	
  4704		/* Do not loop if specifically requested */
  4705		if (gfp_mask & __GFP_NORETRY)
  4706			goto nopage;
  4707	
  4708		/*
  4709		 * Do not retry costly high order allocations unless they are
  4710		 * __GFP_RETRY_MAYFAIL
  4711		 */
  4712		if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
  4713			goto nopage;
  4714	
  4715		if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
  4716					 did_some_progress > 0, &no_progress_loops))
  4717			goto retry;
  4718	
> 4719		if (should_try_oom(no_progress_loops, compact_result))
  4720			goto oom:
  4721		/*
  4722		 * It doesn't make any sense to retry for the compaction if the order-0
  4723		 * reclaim is not able to make any progress because the current
  4724		 * implementation of the compaction depends on the sufficient amount
  4725		 * of free memory (see __compaction_suitable)
  4726		 */
  4727		if (did_some_progress > 0 &&
  4728				should_compact_retry(ac, order, alloc_flags,
  4729					compact_result, &compact_priority,
  4730					&compaction_retries))
  4731			goto retry;
  4732	
  4733	
  4734		/* Deal with possible cpuset update races before we start OOM killing */
  4735		if (check_retry_cpuset(cpuset_mems_cookie, ac))
  4736			goto retry_cpuset;
  4737	
  4738	oom:
  4739		/* Reclaim has failed us, start killing things */
  4740		page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
  4741		if (page)
  4742			goto got_pg;
  4743	
  4744		/* Avoid allocations with no watermarks from looping endlessly */
  4745		if (tsk_is_oom_victim(current) &&
  4746		    (alloc_flags & ALLOC_OOM ||
  4747		     (gfp_mask & __GFP_NOMEMALLOC)))
  4748			goto nopage;
  4749	
  4750		/* Retry as long as the OOM killer is making progress */
  4751		if (did_some_progress) {
  4752			no_progress_loops = 0;
  4753			goto retry;
  4754		}
  4755	
  4756	nopage:
  4757		/* Deal with possible cpuset update races before we fail */
  4758		if (check_retry_cpuset(cpuset_mems_cookie, ac))
  4759			goto retry_cpuset;
  4760	
  4761		/*
  4762		 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
  4763		 * we always retry
  4764		 */
  4765		if (gfp_mask & __GFP_NOFAIL) {
  4766			/*
  4767			 * All existing users of the __GFP_NOFAIL are blockable, so warn
  4768			 * of any new users that actually require GFP_NOWAIT
  4769			 */
  4770			if (WARN_ON_ONCE(!can_direct_reclaim))
  4771				goto fail;
  4772	
  4773			/*
  4774			 * PF_MEMALLOC request from this context is rather bizarre
  4775			 * because we cannot reclaim anything and only can loop waiting
  4776			 * for somebody to do a work for us
  4777			 */
  4778			WARN_ON_ONCE(current->flags & PF_MEMALLOC);
  4779	
  4780			/*
  4781			 * non failing costly orders are a hard requirement which we
  4782			 * are not prepared for much so let's warn about these users
  4783			 * so that we can identify them and convert them to something
  4784			 * else.
  4785			 */
  4786			WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
  4787	
  4788			/*
  4789			 * Help non-failing allocations by giving them access to memory
  4790			 * reserves but do not use ALLOC_NO_WATERMARKS because this
  4791			 * could deplete whole memory reserves which would just make
  4792			 * the situation worse
  4793			 */
  4794			page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
  4795			if (page)
  4796				goto got_pg;
  4797	
  4798			cond_resched();
  4799			goto retry;
  4800		}
  4801	fail:
  4802		warn_alloc(gfp_mask, ac->nodemask,
  4803				"page allocation failure: order:%u", order);
  4804	got_pg:
  4805		return page;
  4806	}
  4807	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot March 15, 2021, 7:54 p.m. UTC | #3
Hi Aaron,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on hnaz-linux-mm/master]

url:    https://github.com/0day-ci/linux/commits/Aaron-Tomlin/mm-page_alloc-try-oom-if-reclaim-is-unable-to-make-forward-progress/20210316-010203
base:   https://github.com/hnaz/linux-mm master
config: arc-randconfig-r024-20210315 (attached as .config)
compiler: arceb-elf-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/77338aaff2606a7715c832545e79370e849e3b4e
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Aaron-Tomlin/mm-page_alloc-try-oom-if-reclaim-is-unable-to-make-forward-progress/20210316-010203
        git checkout 77338aaff2606a7715c832545e79370e849e3b4e
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=arc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All error/warnings (new ones prefixed by >>):

   mm/page_alloc.c: In function 'should_reclaim_retry':
>> mm/page_alloc.c:4444:3: error: 'result' undeclared (first use in this function)
    4444 |   result false;
         |   ^~~~~~
   mm/page_alloc.c:4444:3: note: each undeclared identifier is reported only once for each function it appears in
>> mm/page_alloc.c:4444:9: error: expected ';' before 'false'
    4444 |   result false;
         |         ^~~~~~
         |         ;
>> mm/page_alloc.c:4447:50: error: expected ';' before 'for'
    4447 |   return unreserve_highatomic_pageblock(ac, true)
         |                                                  ^
         |                                                  ;
   mm/page_alloc.c:4426:18: warning: unused variable 'z' [-Wunused-variable]
    4426 |  struct zoneref *z;
         |                  ^
   mm/page_alloc.c:4425:15: warning: unused variable 'zone' [-Wunused-variable]
    4425 |  struct zone *zone;
         |               ^~~~
   mm/page_alloc.c: In function '__alloc_pages_slowpath':
>> mm/page_alloc.c:4720:11: error: expected ';' before ':' token
    4720 |   goto oom:
         |           ^
         |           ;
>> mm/page_alloc.c:4556:6: warning: variable 'compaction_retries' set but not used [-Wunused-but-set-variable]
    4556 |  int compaction_retries;
         |      ^~~~~~~~~~~~~~~~~~
   mm/page_alloc.c: At top level:
   mm/page_alloc.c:6136:23: warning: no previous prototype for 'memmap_init' [-Wmissing-prototypes]
    6136 | void __meminit __weak memmap_init(unsigned long size, int nid,
         |                       ^~~~~~~~~~~


vim +/result +4444 mm/page_alloc.c

  4409	
  4410	/*
  4411	 * Checks whether it makes sense to retry the reclaim to make a forward progress
  4412	 * for the given allocation request.
  4413	 *
  4414	 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
  4415	 * without success, or when we couldn't even meet the watermark if we
  4416	 * reclaimed all remaining pages on the LRU lists.
  4417	 *
  4418	 * Returns true if a retry is viable or false to enter the oom path.
  4419	 */
  4420	static inline bool
  4421	should_reclaim_retry(gfp_t gfp_mask, unsigned order,
  4422			     struct alloc_context *ac, int alloc_flags,
  4423			     bool did_some_progress, int *no_progress_loops)
  4424	{
  4425		struct zone *zone;
  4426		struct zoneref *z;
  4427		bool ret = false;
  4428	
  4429		/*
  4430		 * Costly allocations might have made a progress but this doesn't mean
  4431		 * their order will become available due to high fragmentation so
  4432		 * always increment the no progress counter for them
  4433		 */
  4434		if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
  4435			*no_progress_loops = 0;
  4436		else
  4437			(*no_progress_loops)++;
  4438	
  4439		/*
  4440		 * Make sure we converge to OOM if we cannot make any progress
  4441		 * several times in the row.
  4442		 */
  4443		if (*no_progress_loops > MAX_RECLAIM_RETRIES)
> 4444			result false;
  4445		/* Last chance before OOM, try draining highatomic_reserve once */
  4446		else if (*no_progress_loops == MAX_RECLAIM_RETRIES)
> 4447			return unreserve_highatomic_pageblock(ac, true)
  4448	
  4449		/*
  4450		 * Keep reclaiming pages while there is a chance this will lead
  4451		 * somewhere.  If none of the target zones can satisfy our allocation
  4452		 * request even if all reclaimable pages are considered then we are
  4453		 * screwed and have to go OOM.
  4454		 */
  4455		for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
  4456					ac->highest_zoneidx, ac->nodemask) {
  4457			unsigned long available;
  4458			unsigned long reclaimable;
  4459			unsigned long min_wmark = min_wmark_pages(zone);
  4460			bool wmark;
  4461	
  4462			available = reclaimable = zone_reclaimable_pages(zone);
  4463			available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
  4464	
  4465			/*
  4466			 * Would the allocation succeed if we reclaimed all
  4467			 * reclaimable pages?
  4468			 */
  4469			wmark = __zone_watermark_ok(zone, order, min_wmark,
  4470					ac->highest_zoneidx, alloc_flags, available);
  4471			trace_reclaim_retry_zone(z, order, reclaimable,
  4472					available, min_wmark, *no_progress_loops, wmark);
  4473			if (wmark) {
  4474				/*
  4475				 * If we didn't make any progress and have a lot of
  4476				 * dirty + writeback pages then we should wait for
  4477				 * an IO to complete to slow down the reclaim and
  4478				 * prevent from pre mature OOM
  4479				 */
  4480				if (!did_some_progress) {
  4481					unsigned long write_pending;
  4482	
  4483					write_pending = zone_page_state_snapshot(zone,
  4484								NR_ZONE_WRITE_PENDING);
  4485	
  4486					if (2 * write_pending > reclaimable) {
  4487						congestion_wait(BLK_RW_ASYNC, HZ/10);
  4488						return true;
  4489					}
  4490				}
  4491	
  4492				ret = true;
  4493				goto out;
  4494			}
  4495		}
  4496	
  4497	out:
  4498		/*
  4499		 * Memory allocation/reclaim might be called from a WQ context and the
  4500		 * current implementation of the WQ concurrency control doesn't
  4501		 * recognize that a particular WQ is congested if the worker thread is
  4502		 * looping without ever sleeping. Therefore we have to do a short sleep
  4503		 * here rather than calling cond_resched().
  4504		 */
  4505		if (current->flags & PF_WQ_WORKER)
  4506			schedule_timeout_uninterruptible(1);
  4507		else
  4508			cond_resched();
  4509		return ret;
  4510	}
  4511	
  4512	static inline bool
  4513	check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
  4514	{
  4515		/*
  4516		 * It's possible that cpuset's mems_allowed and the nodemask from
  4517		 * mempolicy don't intersect. This should be normally dealt with by
  4518		 * policy_nodemask(), but it's possible to race with cpuset update in
  4519		 * such a way the check therein was true, and then it became false
  4520		 * before we got our cpuset_mems_cookie here.
  4521		 * This assumes that for all allocations, ac->nodemask can come only
  4522		 * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
  4523		 * when it does not intersect with the cpuset restrictions) or the
  4524		 * caller can deal with a violated nodemask.
  4525		 */
  4526		if (cpusets_enabled() && ac->nodemask &&
  4527				!cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
  4528			ac->nodemask = NULL;
  4529			return true;
  4530		}
  4531	
  4532		/*
  4533		 * When updating a task's mems_allowed or mempolicy nodemask, it is
  4534		 * possible to race with parallel threads in such a way that our
  4535		 * allocation can fail while the mask is being updated. If we are about
  4536		 * to fail, check if the cpuset changed during allocation and if so,
  4537		 * retry.
  4538		 */
  4539		if (read_mems_allowed_retry(cpuset_mems_cookie))
  4540			return true;
  4541	
  4542		return false;
  4543	}
  4544	
  4545	static inline struct page *
  4546	__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
  4547							struct alloc_context *ac)
  4548	{
  4549		bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
  4550		const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
  4551		struct page *page = NULL;
  4552		unsigned int alloc_flags;
  4553		unsigned long did_some_progress;
  4554		enum compact_priority compact_priority;
  4555		enum compact_result compact_result;
> 4556		int compaction_retries;
  4557		int no_progress_loops;
  4558		unsigned int cpuset_mems_cookie;
  4559		int reserve_flags;
  4560	
  4561		/*
  4562		 * We also sanity check to catch abuse of atomic reserves being used by
  4563		 * callers that are not in atomic context.
  4564		 */
  4565		if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
  4566					(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
  4567			gfp_mask &= ~__GFP_ATOMIC;
  4568	
  4569	retry_cpuset:
  4570		compaction_retries = 0;
  4571		no_progress_loops = 0;
  4572		compact_priority = DEF_COMPACT_PRIORITY;
  4573		cpuset_mems_cookie = read_mems_allowed_begin();
  4574	
  4575		/*
  4576		 * The fast path uses conservative alloc_flags to succeed only until
  4577		 * kswapd needs to be woken up, and to avoid the cost of setting up
  4578		 * alloc_flags precisely. So we do that now.
  4579		 */
  4580		alloc_flags = gfp_to_alloc_flags(gfp_mask);
  4581	
  4582		/*
  4583		 * We need to recalculate the starting point for the zonelist iterator
  4584		 * because we might have used different nodemask in the fast path, or
  4585		 * there was a cpuset modification and we are retrying - otherwise we
  4586		 * could end up iterating over non-eligible zones endlessly.
  4587		 */
  4588		ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
  4589						ac->highest_zoneidx, ac->nodemask);
  4590		if (!ac->preferred_zoneref->zone)
  4591			goto nopage;
  4592	
  4593		if (alloc_flags & ALLOC_KSWAPD)
  4594			wake_all_kswapds(order, gfp_mask, ac);
  4595	
  4596		/*
  4597		 * The adjusted alloc_flags might result in immediate success, so try
  4598		 * that first
  4599		 */
  4600		page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
  4601		if (page)
  4602			goto got_pg;
  4603	
  4604		/*
  4605		 * For costly allocations, try direct compaction first, as it's likely
  4606		 * that we have enough base pages and don't need to reclaim. For non-
  4607		 * movable high-order allocations, do that as well, as compaction will
  4608		 * try prevent permanent fragmentation by migrating from blocks of the
  4609		 * same migratetype.
  4610		 * Don't try this for allocations that are allowed to ignore
  4611		 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
  4612		 */
  4613		if (can_direct_reclaim &&
  4614				(costly_order ||
  4615				   (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
  4616				&& !gfp_pfmemalloc_allowed(gfp_mask)) {
  4617			page = __alloc_pages_direct_compact(gfp_mask, order,
  4618							alloc_flags, ac,
  4619							INIT_COMPACT_PRIORITY,
  4620							&compact_result);
  4621			if (page)
  4622				goto got_pg;
  4623	
  4624			/*
  4625			 * Checks for costly allocations with __GFP_NORETRY, which
  4626			 * includes some THP page fault allocations
  4627			 */
  4628			if (costly_order && (gfp_mask & __GFP_NORETRY)) {
  4629				/*
  4630				 * If allocating entire pageblock(s) and compaction
  4631				 * failed because all zones are below low watermarks
  4632				 * or is prohibited because it recently failed at this
  4633				 * order, fail immediately unless the allocator has
  4634				 * requested compaction and reclaim retry.
  4635				 *
  4636				 * Reclaim is
  4637				 *  - potentially very expensive because zones are far
  4638				 *    below their low watermarks or this is part of very
  4639				 *    bursty high order allocations,
  4640				 *  - not guaranteed to help because isolate_freepages()
  4641				 *    may not iterate over freed pages as part of its
  4642				 *    linear scan, and
  4643				 *  - unlikely to make entire pageblocks free on its
  4644				 *    own.
  4645				 */
  4646				if (compact_result == COMPACT_SKIPPED ||
  4647				    compact_result == COMPACT_DEFERRED)
  4648					goto nopage;
  4649	
  4650				/*
  4651				 * Looks like reclaim/compaction is worth trying, but
  4652				 * sync compaction could be very expensive, so keep
  4653				 * using async compaction.
  4654				 */
  4655				compact_priority = INIT_COMPACT_PRIORITY;
  4656			}
  4657		}
  4658	
  4659	retry:
  4660		/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
  4661		if (alloc_flags & ALLOC_KSWAPD)
  4662			wake_all_kswapds(order, gfp_mask, ac);
  4663	
  4664		reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
  4665		if (reserve_flags)
  4666			alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
  4667	
  4668		/*
  4669		 * Reset the nodemask and zonelist iterators if memory policies can be
  4670		 * ignored. These allocations are high priority and system rather than
  4671		 * user oriented.
  4672		 */
  4673		if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
  4674			ac->nodemask = NULL;
  4675			ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
  4676						ac->highest_zoneidx, ac->nodemask);
  4677		}
  4678	
  4679		/* Attempt with potentially adjusted zonelist and alloc_flags */
  4680		page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
  4681		if (page)
  4682			goto got_pg;
  4683	
  4684		/* Caller is not willing to reclaim, we can't balance anything */
  4685		if (!can_direct_reclaim)
  4686			goto nopage;
  4687	
  4688		/* Avoid recursion of direct reclaim */
  4689		if (current->flags & PF_MEMALLOC)
  4690			goto nopage;
  4691	
  4692		/* Try direct reclaim and then allocating */
  4693		page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
  4694								&did_some_progress);
  4695		if (page)
  4696			goto got_pg;
  4697	
  4698		/* Try direct compaction and then allocating */
  4699		page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
  4700						compact_priority, &compact_result);
  4701		if (page)
  4702			goto got_pg;
  4703	
  4704		/* Do not loop if specifically requested */
  4705		if (gfp_mask & __GFP_NORETRY)
  4706			goto nopage;
  4707	
  4708		/*
  4709		 * Do not retry costly high order allocations unless they are
  4710		 * __GFP_RETRY_MAYFAIL
  4711		 */
  4712		if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
  4713			goto nopage;
  4714	
  4715		if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
  4716					 did_some_progress > 0, &no_progress_loops))
  4717			goto retry;
  4718	
  4719		if (should_try_oom(no_progress_loops, compact_result))
  4720			goto oom:
  4721		/*
  4722		 * It doesn't make any sense to retry for the compaction if the order-0
  4723		 * reclaim is not able to make any progress because the current
  4724		 * implementation of the compaction depends on the sufficient amount
  4725		 * of free memory (see __compaction_suitable)
  4726		 */
  4727		if (did_some_progress > 0 &&
  4728				should_compact_retry(ac, order, alloc_flags,
  4729					compact_result, &compact_priority,
  4730					&compaction_retries))
  4731			goto retry;
  4732	
  4733	
  4734		/* Deal with possible cpuset update races before we start OOM killing */
  4735		if (check_retry_cpuset(cpuset_mems_cookie, ac))
  4736			goto retry_cpuset;
  4737	
  4738	oom:
  4739		/* Reclaim has failed us, start killing things */
  4740		page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
  4741		if (page)
  4742			goto got_pg;
  4743	
  4744		/* Avoid allocations with no watermarks from looping endlessly */
  4745		if (tsk_is_oom_victim(current) &&
  4746		    (alloc_flags & ALLOC_OOM ||
  4747		     (gfp_mask & __GFP_NOMEMALLOC)))
  4748			goto nopage;
  4749	
  4750		/* Retry as long as the OOM killer is making progress */
  4751		if (did_some_progress) {
  4752			no_progress_loops = 0;
  4753			goto retry;
  4754		}
  4755	
  4756	nopage:
  4757		/* Deal with possible cpuset update races before we fail */
  4758		if (check_retry_cpuset(cpuset_mems_cookie, ac))
  4759			goto retry_cpuset;
  4760	
  4761		/*
  4762		 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
  4763		 * we always retry
  4764		 */
  4765		if (gfp_mask & __GFP_NOFAIL) {
  4766			/*
  4767			 * All existing users of the __GFP_NOFAIL are blockable, so warn
  4768			 * of any new users that actually require GFP_NOWAIT
  4769			 */
  4770			if (WARN_ON_ONCE(!can_direct_reclaim))
  4771				goto fail;
  4772	
  4773			/*
  4774			 * PF_MEMALLOC request from this context is rather bizarre
  4775			 * because we cannot reclaim anything and only can loop waiting
  4776			 * for somebody to do a work for us
  4777			 */
  4778			WARN_ON_ONCE(current->flags & PF_MEMALLOC);
  4779	
  4780			/*
  4781			 * non failing costly orders are a hard requirement which we
  4782			 * are not prepared for much so let's warn about these users
  4783			 * so that we can identify them and convert them to something
  4784			 * else.
  4785			 */
  4786			WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
  4787	
  4788			/*
  4789			 * Help non-failing allocations by giving them access to memory
  4790			 * reserves but do not use ALLOC_NO_WATERMARKS because this
  4791			 * could deplete whole memory reserves which would just make
  4792			 * the situation worse
  4793			 */
  4794			page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
  4795			if (page)
  4796				goto got_pg;
  4797	
  4798			cond_resched();
  4799			goto retry;
  4800		}
  4801	fail:
  4802		warn_alloc(gfp_mask, ac->nodemask,
  4803				"page allocation failure: order:%u", order);
  4804	got_pg:
  4805		return page;
  4806	}
  4807	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Michal Hocko March 18, 2021, 4:16 p.m. UTC | #4
On Mon 15-03-21 16:58:37, Aaron Tomlin wrote:
> In the situation where direct reclaim is required to make progress for
> compaction, but no_progress_loops is already over the limit of
> MAX_RECLAIM_RETRIES, consider invoking the oom killer.

What is the problem you are trying to fix?

> 
> Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
> ---
>  mm/page_alloc.c | 22 ++++++++++++++++++----
>  1 file changed, 18 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 7a2c89b21115..8d748b1b8d1e 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -4181,6 +4181,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>  	return NULL;
>  }
>  
> +static inline bool
> +should_try_oom(int no_progress_loops,
> +		enum compact_result last_compact_result)
> +{
> +	if (no_progress_loops > MAX_RECLAIM_RETRIES && last_compact_result
> +			== COMPACT_SKIPPED)
> +		return true;
> +	return false;
> +}
> +
>  static inline bool
>  should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
>  		     enum compact_result compact_result,
> @@ -4547,10 +4557,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
>  	 * Make sure we converge to OOM if we cannot make any progress
>  	 * several times in the row.
>  	 */
> -	if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
> -		/* Before OOM, exhaust highatomic_reserve */
> -		return unreserve_highatomic_pageblock(ac, true);
> -	}
> +	if (*no_progress_loops > MAX_RECLAIM_RETRIES)
> +		result false;
> +	/* Last chance before OOM, try draining highatomic_reserve once */
> +	else if (*no_progress_loops == MAX_RECLAIM_RETRIES)
> +		return unreserve_highatomic_pageblock(ac, true)
>  
>  	/*
>  	 * Keep reclaiming pages while there is a chance this will lead
> @@ -4822,6 +4833,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>  				 did_some_progress > 0, &no_progress_loops))
>  		goto retry;
>  
> +	if (should_try_oom(no_progress_loops, compact_result))
> +		goto oom:
>  	/*
>  	 * It doesn't make any sense to retry for the compaction if the order-0
>  	 * reclaim is not able to make any progress because the current
> @@ -4839,6 +4852,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>  	if (check_retry_cpuset(cpuset_mems_cookie, ac))
>  		goto retry_cpuset;
>  
> +oom:
>  	/* Reclaim has failed us, start killing things */
>  	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
>  	if (page)
> -- 
> 2.26.2
>
Aaron Tomlin March 19, 2021, 5:29 p.m. UTC | #5
Hi Michal,

On Thu 2021-03-18 17:16 +0100, Michal Hocko wrote:
> On Mon 15-03-21 16:58:37, Aaron Tomlin wrote:
> > In the situation where direct reclaim is required to make progress for
> > compaction, but no_progress_loops is already over the limit of
> > MAX_RECLAIM_RETRIES, consider invoking the oom killer.

Firstly, thank you for your response.

> What is the problem you are trying to fix?

If I understand correctly, in the case of a "costly" order allocation
request that is permitted to repeatedly retry, it is possible to exceed the
maximum reclaim retry threshold as long as "some" progress is being made,
even at the highest compaction priority. Furthermore, if the allocating
task has a fatal signal pending, this is not considered.

In my opinion, it might be better to just give up straight away, or to try
and use the OOM killer only in the non-costly order allocation scenario to
assist reclaim. Looking at __alloc_pages_may_oom(), the current logic is to
entirely skip the OOM killer for a costly order request, which makes sense.
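
The check in question (paraphrased from __alloc_pages_may_oom(); the
surrounding context is elided here) is roughly:

	/* The OOM killer won't necessarily free lower order pages */
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		goto out;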


Regards,
Michal Hocko March 22, 2021, 10:47 a.m. UTC | #6
On Fri 19-03-21 17:29:01, Aaron Tomlin wrote:
> Hi Michal,
> 
> On Thu 2021-03-18 17:16 +0100, Michal Hocko wrote:
> > On Mon 15-03-21 16:58:37, Aaron Tomlin wrote:
> > > In the situation where direct reclaim is required to make progress for
> > > compaction, but no_progress_loops is already over the limit of
> > > MAX_RECLAIM_RETRIES, consider invoking the oom killer.
> 
> Firstly, thank you for your response.
> 
> > What is the problem you are trying to fix?
> 
> If I understand correctly, in the case of a "costly" order allocation
> request that is permitted to repeatedly retry, it is possible to exceed the
> maximum reclaim retry threshold as long as "some" progress is being made
> even at the highest compaction priority.

Costly orders already have retry heuristics in place. Could you be more
specific about what kind of problem you see with those?

> Furthermore, if the allocator has a fatal signal pending, this is not
> considered.

A pending fatal signal is usually not a strong reason to cut the retry
count or fail allocations.

> In my opinion, it might be better to just give up straight away, or to try
> and use the OOM killer only in the non-costly order allocation scenario to
> assist reclaim. Looking at __alloc_pages_may_oom(), the current logic is to
> entirely skip the OOM killer for a costly order request, which makes sense.

Well, opinions might differ of course. The main question is whether
there are workloads which are unhappy about the existing behavior.
Aaron Tomlin March 25, 2021, 9:01 p.m. UTC | #7
On Mon 2021-03-22 11:47 +0100, Michal Hocko wrote:
> Costly orders already have retry heuristics in place. Could you be more
> specific about what kind of problem you see with those?

If I understand correctly, when the gfp_mask consists of
GFP_KERNEL | __GFP_RETRY_MAYFAIL in particular, an allocation request will
fail if and only if reclaim is unable to make progress.

The costly order allocation retry logic is handled primarily in
should_reclaim_retry(). Looking at should_reclaim_retry(), we see that the
no-progress counter is always incremented in the costly order case, even
when "some" progress has been made, as represented by the value stored in
did_some_progress.

        if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
                goto nopage;

        if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
                                 did_some_progress > 0, &no_progress_loops))
                goto retry;

I think that after we have tried MAX_RECLAIM_RETRIES times in a row without
success, and the last known compaction attempt was "skipped", it would
perhaps be better to invoke the OOM killer or fail the allocation attempt.

I encountered a situation in which the value of no_progress_loops was found
to be 31,611,688, i.e. significantly over MAX_RECLAIM_RETRIES; the
allocation order was 5. The gfp_mask contained the following:

     #define ___GFP_HIGHMEM          0x02
     #define ___GFP_IO               0x40
     #define ___GFP_FS               0x80
     #define ___GFP_NOWARN           0x200
     #define ___GFP_RETRY_MAYFAIL    0x400
     #define ___GFP_COMP             0x4000
     #define ___GFP_HARDWALL         0x20000
     #define ___GFP_DIRECT_RECLAIM   0x200000
     #define ___GFP_KSWAPD_RECLAIM   0x400000
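
For reference (my arithmetic, assuming no other bits were set), these
flags OR together to:

     gfp_mask = 0x6246c2
              = GFP_HIGHUSER | __GFP_NOWARN | __GFP_RETRY_MAYFAIL | __GFP_COMP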
Michal Hocko March 26, 2021, 8:16 a.m. UTC | #8
[Cc Vlastimil]

On Thu 25-03-21 21:01:59, Aaron Tomlin wrote:
> On Mon 2021-03-22 11:47 +0100, Michal Hocko wrote:
> > Costly orders already do have heuristics for the retry in place. Could
> > you be more specific what kind of problem you see with those?
> 
> If I understand correctly, when the gfp_mask consists of
> GFP_KERNEL | __GFP_RETRY_MAYFAIL in particular, an allocation request will
> fail if and only if reclaim is unable to make progress.
> 
> The costly order allocation retry logic is handled primarily in
> should_reclaim_retry(). Looking at should_reclaim_retry(), we see that the
> no-progress counter is always incremented in the costly order case, even
> when "some" progress has been made, as represented by the value stored in
> did_some_progress.
> 
>         if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
>                 goto nopage;
> 
>         if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
>                                  did_some_progress > 0, &no_progress_loops))
>                 goto retry;
> 
> I think that after we have tried MAX_RECLAIM_RETRIES times in a row without
> success, and the last known compaction attempt was "skipped", it would
> perhaps be better to invoke the OOM killer or fail the allocation attempt.

The oom killer is never triggered for costly allocation request. Both
reclaim and compaction maintain their own retries counters as they are
targeting a different operation. Although the compaction really depends
on the reclaim to do some progress.

> I encountered a situation in which the value of no_progress_loops was found
> to be 31,611,688, i.e. significantly over MAX_RECLAIM_RETRIES; the
> allocation order was 5. The gfp_mask contained the following:

OK, this sounds unexpected, as it indicates that reclaim is able to make
forward progress but compaction doesn't want to give up and keeps retrying.
Are you able to reproduce this, or could you find out which specific
condition keeps compaction retrying? I would expect that it is one of the 3
conditions before max_retries is checked.
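
Roughly, those three conditions in should_compact_retry() look like this
(paraphrased from memory, not verbatim - see the source for the
authoritative version):

	if (compaction_failed(compact_result))
		goto check_priority;	/* only retry at a higher priority */

	/* COMPACT_SKIPPED: retry only if it looks like reclaim can help */
	if (compaction_needs_reclaim(compact_result)) {
		ret = compaction_zonelist_suitable(ac, order, alloc_flags);
		goto out;
	}

	/* compaction was deferred or bailed out early, e.g. on contention */
	if (compaction_withdrawn(compact_result)) {
		ret = compaction_zonelist_suitable(ac, order, alloc_flags);
		goto out;
	}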

>      #define ___GFP_HIGHMEM          0x02
>      #define ___GFP_IO               0x40
>      #define ___GFP_FS               0x80
>      #define ___GFP_NOWARN           0x200
>      #define ___GFP_RETRY_MAYFAIL    0x400
>      #define ___GFP_COMP             0x4000
>      #define ___GFP_HARDWALL         0x20000
>      #define ___GFP_DIRECT_RECLAIM   0x200000
>      #define ___GFP_KSWAPD_RECLAIM   0x400000
> 
> 
> 
> -- 
> Aaron Tomlin
>
Aaron Tomlin March 26, 2021, 11:22 a.m. UTC | #9
Hi Michal,

On Fri 2021-03-26 09:16 +0100, Michal Hocko wrote:
> The oom killer is never triggered for costly allocation request.

Yes - I agree. Looking at __alloc_pages_may_oom(), I can see that for a
costly order allocation request the OOM killer is explicitly not used.
If I understand correctly, the patch I proposed was for the following
scenarios:

  1.    A costly order allocation request should fail when
        only "some" (i.e. order-0) progress is made and the
        last compaction attempt was "skipped"

  2.    A non-costly order allocation request that is unable
        to make any progress, and has failed more than the
        maximum reclaim retry count in a row while the last
        known compaction result was "skipped", should consider
        using the OOM killer for assistance

> Both reclaim and compaction maintain their own retries counters as they
> are targeting a different operation. Although the compaction really
> depends on the reclaim to do some progress.

Yes. Looking at should_compact_retry(): if the last known compaction result
was "skipped", that suggests there were not enough order-0 pages to support
compaction, so assistance is needed from reclaim
(see __compaction_suitable()).
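
Roughly, the skip decision in __compaction_suitable() looks like this
(again paraphrased, not verbatim):

	/*
	 * The order-0 watermark plus a gap must be met; otherwise
	 * compaction has no free base pages to migrate into and is
	 * skipped.
	 */
	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
				low_wmark_pages(zone) : min_wmark_pages(zone);
	watermark += compact_gap(order);
	if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
				 ALLOC_CMA, wmark_target))
		return COMPACT_SKIPPED;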

I noticed that the values of compaction_retries, compact_result and
compact_priority were 0, COMPACT_SKIPPED and 1 (i.e.
COMPACT_PRIO_SYNC_LIGHT), respectively.

> OK, this sounds unexpected, as it indicates that reclaim is able to make
> forward progress but compaction doesn't want to give up and keeps retrying.
> Are you able to reproduce this, or could you find out which specific
> condition keeps compaction retrying? I would expect that it is one of the 3
> conditions before max_retries is checked.

Unfortunately, I have been told it is not entirely reproducible.
I suspect it is the following in should_compact_retry() - as I indicated
above, the last known value stored in compaction_retries was 0:


        if (order > PAGE_ALLOC_COSTLY_ORDER)
                max_retries /= 4;
        if (*compaction_retries <= max_retries) {
                ret = true;
                goto out;
        }
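
For what it's worth (my arithmetic, assuming MAX_COMPACT_RETRIES is still
16): with order = 5, max_retries becomes 16 / 4 = 4 here, and since a
"skipped" compaction attempt makes no progress, compaction_retries would
stay at 0, so the check 0 <= 4 would pass on every iteration if this
branch were reached.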




Kind regards,
Michal Hocko March 26, 2021, 3:36 p.m. UTC | #10
On Fri 26-03-21 11:22:54, Aaron Tomlin wrote:
[...]
> > Both reclaim and compaction maintain their own retry counters as they
> > are targeting different operations. Compaction, however, really
> > depends on reclaim making some progress.
> 
> Yes. Looking at should_compact_retry(): if the last known compaction result
> was skipped, i.e. suggesting there were not enough order-0 pages to support
> compaction, then assistance is needed from reclaim
> (see __compaction_suitable()).
> 
> I noticed that the values of compaction_retries, compact_result and
> compact_priority were 0, COMPACT_SKIPPED and 1 (i.e.
> COMPACT_PRIO_SYNC_LIGHT), respectively.
> 
> > OK, this sounds unexpected as it indicates that reclaim is able to
> > make forward progress but compaction doesn't want to give up and keeps
> > retrying. Are you able to reproduce this, or could you find out which
> > specific condition keeps compaction retrying? I would expect it to be
> > one of the 3 conditions before max_retries is checked.
> 
> Unfortunately, I have been told it is not entirely reproducible.
> I suspect it is the following check in should_compact_retry() - as I
> indicated above, the last known value stored in compaction_retries was 0:
> 
> 
>         if (order > PAGE_ALLOC_COSTLY_ORDER)
>                 max_retries /= 4;
>         if (*compaction_retries <= max_retries) {
>                 ret = true;
>                 goto out;
>         }

OK, I kinda expected this would be not easily reproducible. The reason I
dislike your patch is that it adds yet another criterion for oom while
we already have 2, which doesn't make the resulting code easier to
reason about. We should be focusing on the compaction retry logic and
see whether we can have some "run away" scenarios there. Seeing so many
retries without compaction bailing out sounds like a bug in that retry
logic. Vlastimil is much more familiar with that.
Aaron Tomlin March 26, 2021, 5 p.m. UTC | #11
On Fri 2021-03-26 16:36 +0100, Michal Hocko wrote:
> We should be focusing on the compaction retry logic and see whether we
> can have some "run away" scenarios there.

Fair enough.

> Seeing so many retries without compaction bailing out sounds like a bug
> in that retry logic.

Agreed - I will continue to look into this.



Kind regards,
Aaron Tomlin May 18, 2021, 2:05 p.m. UTC | #12
Michal,

On Fri 2021-03-26 16:36 +0100, Michal Hocko wrote:
> OK, I kinda expected this would be not easily reproducible.

Unfortunately, I'm still waiting for feedback on this.

> We should be focusing on the compaction retry logic and see whether we
> can have some "run away" scenarios there. Seeing so many retries without
> compaction bailing out sounds like a bug in that retry logic.

I suspect so.

This is indeed a case of excessive reclaim/compaction retries (i.e. the
last known value stored in the no_progress_loops variable was 31,611,688).

What might be particularly unique about this situation is that a fatal
signal was found pending. In this context, if I understand correctly, it
does not make sense to retry compaction when the last known compact result
was skipped and a fatal signal is pending.

Looking at try_to_compact_pages(), COMPACT_SKIPPED can indeed be returned;
albeit, not every zone on the zonelist would be considered when a fatal
signal is found to be pending. Yet, in should_compact_retry(), given the
last known compaction result, each zone on the zonelist can still be
checked (see compaction_zonelist_suitable()). If a zone is e.g. deemed
suitable, then reclaim/compaction would be tried again (notwithstanding
the above).
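
In other words, the suspected interaction is roughly the following (a
condensed paraphrase of the relevant logic, not verbatim kernel code):

        /*
         * Condensed paraphrase of the zone walk in try_to_compact_pages():
         * a pending fatal signal ends the walk early, so the combined
         * result can remain COMPACT_SKIPPED.
         */
        for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
                                        ac->highest_zoneidx, ac->nodemask) {
                status = compact_zone_order(zone, order, gfp_mask, prio,
                                            alloc_flags, ac->highest_zoneidx,
                                            capture);
                rc = max(status, rc);

                /* Remaining zones are never attempted ... */
                if (fatal_signal_pending(current))
                        break;
        }
        /*
         * ... yet should_compact_retry() treats COMPACT_SKIPPED as
         * "reclaim can help first" and asks compaction_zonelist_suitable(),
         * which walks every zone and can keep answering true - so the
         * allocation retries indefinitely and the fatal signal is never
         * consulted in the retry decision.
         */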
Michal Hocko May 19, 2021, 11:10 a.m. UTC | #13
On Tue 18-05-21 15:05:54, Aaron Tomlin wrote:
> Michal,
> 
> On Fri 2021-03-26 16:36 +0100, Michal Hocko wrote:
> > OK, I kinda expected this would be not easily reproducible.
> 
> Unfortunately, I'm still waiting for feedback on this.
> 
> > We should be focusing on the compaction retry logic and see whether we
> > can have some "run away" scenarios there. Seeing so many retries without
> > compaction bailing out sounds like a bug in that retry logic.
> 
> I suspect so.
> 
> This is indeed a case of excessive reclaim/compaction retries (i.e. the
> last known value stored in the no_progress_loops variable was 31,611,688).
> 
> What might be particularly unique about this situation is that a fatal
> signal was found pending. In this context, if I understand correctly, it
> does not make sense to retry compaction when the last known compact result
> was skipped and a fatal signal is pending.

OK, this might be an interesting lead.

> Looking at try_to_compact_pages(), COMPACT_SKIPPED can indeed be returned;
> albeit, not every zone on the zonelist would be considered when a fatal
> signal is found to be pending. Yet, in should_compact_retry(), given the
> last known compaction result, each zone on the zonelist can still be
> checked (see compaction_zonelist_suitable()). If a zone is e.g. deemed
> suitable, then reclaim/compaction would be tried again (notwithstanding
> the above).

I believe Vlastimil would be a much better fit to look into those
details, but it smells like pending fatal signals can indeed lead to
an unbounded retry.

Thanks!
Aaron Tomlin May 19, 2021, 1:06 p.m. UTC | #14
Michal,

On Wed 2021-05-19 13:10 +0200, Michal Hocko wrote:
> > Looking at try_to_compact_pages(), COMPACT_SKIPPED can indeed be returned;
> > albeit, not every zone on the zonelist would be considered when a fatal
> > signal is found to be pending. Yet, in should_compact_retry(), given the
> > last known compaction result, each zone on the zonelist can still be
> > checked (see compaction_zonelist_suitable()). If a zone is e.g. deemed
> > suitable, then reclaim/compaction would be tried again (notwithstanding
> > the above).
> 
> I believe Vlastimil would be a much better fit to look into those
> details, but it smells like pending fatal signals can indeed lead to
> an unbounded retry.

Understood.

I will shortly post a trivial patch, for further discussion, that will
hopefully address this particular condition in the compaction retry code
path.
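
Roughly, the shape of such a check - a sketch only, not the final
patch - would be to bail out of should_compact_retry() early when a
fatal signal is pending:

        /*
         * Sketch only (not the final patch): give up on compaction
         * retries when a fatal signal is pending, so a COMPACT_SKIPPED
         * result cannot keep the reclaim/compaction loop spinning while
         * the task is dying.
         */
        static inline bool
        should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
                             enum compact_result compact_result,
                             enum compact_priority *compact_priority,
                             int *compaction_retries)
        {
                if (!order)
                        return false;

                /* The allocating task is dying: do not retry compaction. */
                if (fatal_signal_pending(current))
                        return false;

                /* ... existing retry logic unchanged below ... */
        }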


Kind regards,
diff mbox series

Patch

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7a2c89b21115..8d748b1b8d1e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4181,6 +4181,16 @@  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	return NULL;
 }
 
+static inline bool
+should_try_oom(int no_progress_loops,
+		enum compact_result last_compact_result)
+{
+	if (no_progress_loops > MAX_RECLAIM_RETRIES &&
+	    last_compact_result == COMPACT_SKIPPED)
+		return true;
+	return false;
+}
+
 static inline bool
 should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 		     enum compact_result compact_result,
@@ -4547,10 +4557,11 @@  should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 	 * Make sure we converge to OOM if we cannot make any progress
 	 * several times in the row.
 	 */
-	if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
-		/* Before OOM, exhaust highatomic_reserve */
-		return unreserve_highatomic_pageblock(ac, true);
-	}
+	if (*no_progress_loops > MAX_RECLAIM_RETRIES)
+		return false;
+	/* Last chance before OOM, try draining highatomic_reserve once */
+	else if (*no_progress_loops == MAX_RECLAIM_RETRIES)
+		return unreserve_highatomic_pageblock(ac, true);
 
 	/*
 	 * Keep reclaiming pages while there is a chance this will lead
@@ -4822,6 +4833,8 @@  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 				 did_some_progress > 0, &no_progress_loops))
 		goto retry;
 
+	if (should_try_oom(no_progress_loops, compact_result))
+		goto oom;
 	/*
 	 * It doesn't make any sense to retry for the compaction if the order-0
 	 * reclaim is not able to make any progress because the current
@@ -4839,6 +4852,7 @@  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	if (check_retry_cpuset(cpuset_mems_cookie, ac))
 		goto retry_cpuset;
 
+oom:
 	/* Reclaim has failed us, start killing things */
 	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
 	if (page)