
reply: [PATCH] Revert "mm: skip CMA pages when they are not available"

Message ID 1710488498897.75752@unisoc.com (mailing list archive)
State New
Series reply: [PATCH] Revert "mm: skip CMA pages when they are not available"

Commit Message

zhaoyang.huang March 15, 2024, 7:41 a.m. UTC
On Thu, Mar 14, 2024 at 10:15 AM <liuhailong@oppo.com> wrote:
>
> From: "Hailong.Liu" <liuhailong@oppo.com>
>
> This reverts commit 5da226dbfce3a2f44978c2c7cf88166e69a6788b.
>
> This patch may cause the system to stop responding. If many CMA pages sit in
> the lru_list and the system is low on memory, many tasks enter direct reclaim
> and waste CPU time scanning pages only to skip and return them. Tested on an
> android-5.15 device; the task call stacks are shown below.
>
> Task name: UsbFfs-worker [affinity: 0xff] pid: 3374 cpu: 7 prio: 120 start: ffffff8897a35c80
> state: 0x0[R] exit_state: 0x0 stack base: 0xffffffc01eaa0000
> Last_enqueued_ts:       0.000000000 Last_sleep_ts:       0.000000000
> Stack:
> [<ffffffd32ee7d910>] __switch_to+0x180
> [<ffffffd3302022fc>] __schedule+0x4dc
> [<ffffffd330201e08>] preempt_schedule+0x5c
> [<ffffffd33020a4d0>] _raw_spin_unlock_irq+0x54
> [<ffffffd32f14906c>] shrink_inactive_list+0x1d0
> [<ffffffd32f143998>] shrink_lruvec+0x1bc
> [<ffffffd32f147c0c>] shrink_node_memcgs+0x184
> [<ffffffd32f147414>] shrink_node+0x2d0
> [<ffffffd32f146d38>] shrink_zones+0x14c
> [<ffffffd32f142e84>] do_try_to_free_pages+0xe8
> [<ffffffd32f142b08>] try_to_free_pages+0x2e0
> [<ffffffd32f1a8e44>] __alloc_pages_direct_reclaim+0x84
> [<ffffffd32f1a2d58>] __alloc_pages_slowpath+0x4d0
> [<ffffffd32f1a23bc>] __alloc_pages_nodemask[jt]+0x124
> [<ffffffd32f19a220>] __vmalloc_area_node+0x188
> [<ffffffd32f19a540>] __vmalloc_node+0x148
> [<ffffffd32f19a60c>] vmalloc+0x4c
> [<ffffffd32f910218>] ffs_epfile_io+0x258
> [<ffffffd330033780>] kretprobe_trampoline[jt]+0x0
> [<ffffffd330033780>] kretprobe_trampoline[jt]+0x0
> [<ffffffd32f28129c>] __io_submit_one+0x1c0
> [<ffffffd32f280e38>] io_submit_one+0x88
> [<ffffffd32f280c88>] __do_sys_io_submit+0x178
> [<ffffffd32f27eac0>] __arm64_sys_io_submit+0x20
> [<ffffffd32eeabb74>] el0_svc_common.llvm.9961749221945255377+0xd0
> [<ffffffd32eeaba34>] do_el0_svc+0x28
> [<ffffffd32ff21be8>] el0_svc+0x14
> [<ffffffd32ff21b70>] el0_sync_handler+0x88
> [<ffffffd32ee128b8>] el0_sync+0x1b8
>
> Task name: kthreadd [affinity: 0xff] pid: 2 cpu: 7 prio: 120 start: ffffff87808c0000
> state: 0x0[R] exit_state: 0x0 stack base: 0xffffffc008078000
> Last_enqueued_ts:       0.000000000 Last_sleep_ts:       0.000000000
> Stack:
> [<ffffffd32ee7d910>] __switch_to+0x180
> [<ffffffd3302022fc>] __schedule+0x4dc
> [<ffffffd330201e08>] preempt_schedule+0x5c
> [<ffffffd33020a4d0>] _raw_spin_unlock_irq+0x54
> [<ffffffd32f149168>] shrink_inactive_list+0x2cc
> [<ffffffd32f143998>] shrink_lruvec+0x1bc
> [<ffffffd32f147c0c>] shrink_node_memcgs+0x184
> [<ffffffd32f147414>] shrink_node+0x2d0
> [<ffffffd32f146d38>] shrink_zones+0x14c
> [<ffffffd32f142e84>] do_try_to_free_pages+0xe8
> [<ffffffd32f142b08>] try_to_free_pages+0x2e0
> [<ffffffd32f1a8e44>] __alloc_pages_direct_reclaim+0x84
> [<ffffffd32f1a2d58>] __alloc_pages_slowpath+0x4d0
> [<ffffffd32f1a23bc>] __alloc_pages_nodemask[jt]+0x124
> [<ffffffd32f19a220>] __vmalloc_area_node+0x188
> [<ffffffd32f19a044>] __vmalloc_node_range+0x88
> [<ffffffd32f0fb430>] scs_alloc+0x1b8
> [<ffffffd32f0fb62c>] scs_prepare+0x20
> [<ffffffd32ef2ce04>] dup_task_struct+0xd4
> [<ffffffd32ef2a77c>] copy_process+0x144
> [<ffffffd32ef2bae4>] kernel_clone+0xb4
> [<ffffffd32ef2c040>] kernel_thread+0x5c
> [<ffffffd32ef618d0>] kthreadd+0x184
>
> Without that patch, the tasks will reclaim CMA pages and wake up the
> oom-killer rather than spin on the CPUs.
>
> Signed-off-by: Hailong.Liu <liuhailong@oppo.com>
> ---
>  mm/vmscan.c | 22 +---------------------
>  1 file changed, 1 insertion(+), 21 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 2fe4a11d63f4..197ddf62019f 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2261,25 +2261,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
>
>  }
>
> -#ifdef CONFIG_CMA
> -/*
> - * It is waste of effort to scan and reclaim CMA pages if it is not available
> - * for current allocation context. Kswapd can not be enrolled as it can not
> - * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL
> - */
> -static bool skip_cma(struct folio *folio, struct scan_control *sc)
> -{
> -       return !current_is_kswapd() &&
> -                       gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
> -                       get_pageblock_migratetype(&folio->page) == MIGRATE_CMA;
> -}
> -#else
> -static bool skip_cma(struct folio *folio, struct scan_control *sc)
> -{
> -       return false;
> -}
> -#endif
> -

> NAK.

> +Charan Teja Kalla -- This can cause build errors when CONFIG_LRU_GEN=y.

> If you plan to post a v2, please include a reproducer. Thanks.

Could you please retest the case with the patch below, which has not been merged into AOSP yet?

From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>

According to the current CMA utilization policy, an alloc_pages(GFP_USER)
call can 'steal' UNMOVABLE & RECLAIMABLE page blocks with the help of
CMA (it passes zone_watermark_ok by counting CMA pages in, but takes U&R
pages in rmqueue), which can lead to a subsequent alloc_pages(GFP_KERNEL)
failure. Solve this by introducing a second watermark check for GFP_MOVABLE
allocations, which lets such allocations use CMA when appropriate.

-- Free_pages(30MB)
|
|
-- WMARK_LOW(25MB)
|
-- Free_CMA(12MB)
|
|
--
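
To make the diagram concrete, below is a minimal user-space sketch of the
proposed decision (an illustration only, using the sizes from the diagram; the
real zone_watermark_ok() also accounts for order, lowmem reserves and the other
alloc flags):

#include <stdbool.h>
#include <stdio.h>

/* Rough model of the proposed use_cma_first(): re-run the watermark check
 * without counting CMA pages, and fall back to the old balancing rule only
 * when that check still passes. Sizes are in MB. */
static bool use_cma_first(long free_mb, long wmark_low_mb, long free_cma_mb)
{
	/* watermark check without ALLOC_CMA: only U&R free pages count */
	if (free_mb - free_cma_mb <= wmark_low_mb)
		return true;	/* U&R blocks alone are short: take CMA first */

	/* existing rule: use CMA when it holds over half of the free memory */
	return free_cma_mb > free_mb / 2;
}

int main(void)
{
	/* Diagram numbers: 30MB free, WMARK_LOW at 25MB, 12MB of the free
	 * memory in CMA. The old "over half" rule alone says no (12 <= 15),
	 * but only 18MB of non-CMA memory is free, below the watermark, so
	 * the proposed check says yes. */
	printf("cma_first = %d\n", use_cma_first(30, 25, 12));	/* 1 */
	printf("cma_first = %d\n", use_cma_first(30, 25, 3));	/* 0: 27MB above watermark, 3MB <= 15MB */
	return 0;
}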

Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
---
v6: update comments
---
---
 mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

--


>  /*
>   * Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
>   *
> @@ -2326,8 +2307,7 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
>                 nr_pages = folio_nr_pages(folio);
>                 total_scan += nr_pages;
>
> -               if (folio_zonenum(folio) > sc->reclaim_idx ||
> -                               skip_cma(folio, sc)) {
> +               if (folio_zonenum(folio) > sc->reclaim_idx) {
>                         nr_skipped[folio_zonenum(folio)] += nr_pages;
>                         move_to = &folios_skipped;
>                         goto move;
> --
> 2.34.1
>
>

Comments

刘海龙(LaoLiu) March 15, 2024, 10:36 a.m. UTC | #1
On 2024/3/15 15:41, 黄朝阳 (Zhaoyang Huang) wrote:
> 
> 
> On Thu, Mar 14, 2024 at 10:15 AM <liuhailong@oppo.com> wrote:
>>
>> From: "Hailong.Liu" <liuhailong@oppo.com>
>>
>> This reverts commit 5da226dbfce3a2f44978c2c7cf88166e69a6788b.
>>
>> This patch may cause the system to stop responding. If many CMA pages sit in
>> the lru_list and the system is low on memory, many tasks enter direct reclaim
>> and waste CPU time scanning pages only to skip and return them. Tested on an
>> android-5.15 device; the task call stacks are shown below.
>>
>> Task name: UsbFfs-worker [affinity: 0xff] pid: 3374 cpu: 7 prio: 120 start: ffffff8897a35c80
>> state: 0x0[R] exit_state: 0x0 stack base: 0xffffffc01eaa0000
>> Last_enqueued_ts:       0.000000000 Last_sleep_ts:       0.000000000
>> Stack:
>> [<ffffffd32ee7d910>] __switch_to+0x180
>> [<ffffffd3302022fc>] __schedule+0x4dc
>> [<ffffffd330201e08>] preempt_schedule+0x5c
>> [<ffffffd33020a4d0>] _raw_spin_unlock_irq+0x54
>> [<ffffffd32f14906c>] shrink_inactive_list+0x1d0
>> [<ffffffd32f143998>] shrink_lruvec+0x1bc
>> [<ffffffd32f147c0c>] shrink_node_memcgs+0x184
>> [<ffffffd32f147414>] shrink_node+0x2d0
>> [<ffffffd32f146d38>] shrink_zones+0x14c
>> [<ffffffd32f142e84>] do_try_to_free_pages+0xe8
>> [<ffffffd32f142b08>] try_to_free_pages+0x2e0
>> [<ffffffd32f1a8e44>] __alloc_pages_direct_reclaim+0x84
>> [<ffffffd32f1a2d58>] __alloc_pages_slowpath+0x4d0
>> [<ffffffd32f1a23bc>] __alloc_pages_nodemask[jt]+0x124
>> [<ffffffd32f19a220>] __vmalloc_area_node+0x188
>> [<ffffffd32f19a540>] __vmalloc_node+0x148
>> [<ffffffd32f19a60c>] vmalloc+0x4c
>> [<ffffffd32f910218>] ffs_epfile_io+0x258
>> [<ffffffd330033780>] kretprobe_trampoline[jt]+0x0
>> [<ffffffd330033780>] kretprobe_trampoline[jt]+0x0
>> [<ffffffd32f28129c>] __io_submit_one+0x1c0
>> [<ffffffd32f280e38>] io_submit_one+0x88
>> [<ffffffd32f280c88>] __do_sys_io_submit+0x178
>> [<ffffffd32f27eac0>] __arm64_sys_io_submit+0x20
>> [<ffffffd32eeabb74>] el0_svc_common.llvm.9961749221945255377+0xd0
>> [<ffffffd32eeaba34>] do_el0_svc+0x28
>> [<ffffffd32ff21be8>] el0_svc+0x14
>> [<ffffffd32ff21b70>] el0_sync_handler+0x88
>> [<ffffffd32ee128b8>] el0_sync+0x1b8
>>
>> Task name: kthreadd [affinity: 0xff] pid: 2 cpu: 7 prio: 120 start: ffffff87808c0000
>> state: 0x0[R] exit_state: 0x0 stack base: 0xffffffc008078000
>> Last_enqueued_ts:       0.000000000 Last_sleep_ts:       0.000000000
>> Stack:
>> [<ffffffd32ee7d910>] __switch_to+0x180
>> [<ffffffd3302022fc>] __schedule+0x4dc
>> [<ffffffd330201e08>] preempt_schedule+0x5c
>> [<ffffffd33020a4d0>] _raw_spin_unlock_irq+0x54
>> [<ffffffd32f149168>] shrink_inactive_list+0x2cc
>> [<ffffffd32f143998>] shrink_lruvec+0x1bc
>> [<ffffffd32f147c0c>] shrink_node_memcgs+0x184
>> [<ffffffd32f147414>] shrink_node+0x2d0
>> [<ffffffd32f146d38>] shrink_zones+0x14c
>> [<ffffffd32f142e84>] do_try_to_free_pages+0xe8
>> [<ffffffd32f142b08>] try_to_free_pages+0x2e0
>> [<ffffffd32f1a8e44>] __alloc_pages_direct_reclaim+0x84
>> [<ffffffd32f1a2d58>] __alloc_pages_slowpath+0x4d0
>> [<ffffffd32f1a23bc>] __alloc_pages_nodemask[jt]+0x124
>> [<ffffffd32f19a220>] __vmalloc_area_node+0x188
>> [<ffffffd32f19a044>] __vmalloc_node_range+0x88
>> [<ffffffd32f0fb430>] scs_alloc+0x1b8
>> [<ffffffd32f0fb62c>] scs_prepare+0x20
>> [<ffffffd32ef2ce04>] dup_task_struct+0xd4
>> [<ffffffd32ef2a77c>] copy_process+0x144
>> [<ffffffd32ef2bae4>] kernel_clone+0xb4
>> [<ffffffd32ef2c040>] kernel_thread+0x5c
>> [<ffffffd32ef618d0>] kthreadd+0x184
>>
>> Without that patch, the tasks will reclaim CMA pages and wake up the
>> oom-killer rather than spin on the CPUs.
>>
>> Signed-off-by: Hailong.Liu <liuhailong@oppo.com>
>> ---
>>  mm/vmscan.c | 22 +---------------------
>>  1 file changed, 1 insertion(+), 21 deletions(-)
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 2fe4a11d63f4..197ddf62019f 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -2261,25 +2261,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
>>
>>  }
>>
>> -#ifdef CONFIG_CMA
>> -/*
>> - * It is waste of effort to scan and reclaim CMA pages if it is not available
>> - * for current allocation context. Kswapd can not be enrolled as it can not
>> - * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL
>> - */
>> -static bool skip_cma(struct folio *folio, struct scan_control *sc)
>> -{
>> -       return !current_is_kswapd() &&
>> -                       gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
>> -                       get_pageblock_migratetype(&folio->page) == MIGRATE_CMA;
>> -}
>> -#else
>> -static bool skip_cma(struct folio *folio, struct scan_control *sc)
>> -{
>> -       return false;
>> -}
>> -#endif
>> -
> 
>> NAK.
> 
>> +Charan Teja Kalla -- This can cause build errors when CONFIG_LRU_GEN=y.
> 
>> If you plan to post a v2, please include a reproducer. Thanks.
> 
> Could you please retest the case with the patch below, which has not been merged into AOSP yet?
> 
> From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> 
> According to the current CMA utilization policy, an alloc_pages(GFP_USER)
> call can 'steal' UNMOVABLE & RECLAIMABLE page blocks with the help of
> CMA (it passes zone_watermark_ok by counting CMA pages in, but takes U&R
> pages in rmqueue), which can lead to a subsequent alloc_pages(GFP_KERNEL)
> failure. Solve this by introducing a second watermark check for GFP_MOVABLE
> allocations, which lets such allocations use CMA when appropriate.
> 
> -- Free_pages(30MB)
> |
> |
> -- WMARK_LOW(25MB)
> |
> -- Free_CMA(12MB)
> |
> |
> --
> 
> Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> ---
> v6: update comments
> ---
> ---
>  mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 40 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 452459836b71..5a146aa7c0aa 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2078,6 +2078,43 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
> 
>  }
> 
> +#ifdef CONFIG_CMA
> +/*
> + * A GFP_MOVABLE allocation could drain UNMOVABLE & RECLAIMABLE page blocks
> + * with the help of CMA, which can make GFP_KERNEL fail. Check zone_watermark_ok
> + * again without ALLOC_CMA to decide whether to use CMA first.
> + */
> +static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags)
> +{
> +       unsigned long watermark;
> +       bool cma_first = false;
> +
> +       watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
> +       /* did GFP_MOVABLE pass the previous zone_watermark_ok only with the help of CMA? */
> +       if (zone_watermark_ok(zone, order, watermark, 0, alloc_flags & (~ALLOC_CMA))) {
> +               /*
> +                * Balance movable allocations between regular and CMA areas by
> +                * allocating from CMA when over half of the zone's free memory
> +                * is in the CMA area.
> +                */
> +               cma_first = (zone_page_state(zone, NR_FREE_CMA_PAGES) >
> +                               zone_page_state(zone, NR_FREE_PAGES) / 2);
> +       } else {
> +               /*
> +                * A failed watermark means UNMOVABLE & RECLAIMABLE pages are
> +                * running short; use CMA first so that they stay around the
> +                * corresponding watermark
> +                */
> +               cma_first = true;
> +       }
> +       return cma_first;
> +}
> +#else
> +static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags)
> +{
> +       return false;
> +}
> +#endif
>  /*
>   * Do the hard work of removing an element from the buddy allocator.
>   * Call me with the zone->lock already held.
> @@ -2091,12 +2128,11 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
>         if (IS_ENABLED(CONFIG_CMA)) {
>                 /*
>                  * Balance movable allocations between regular and CMA areas by
> -                * allocating from CMA when over half of the zone's free memory
> -                * is in the CMA area.
> +                * allocating from CMA based on a second zone_watermark_ok check,
> +                * to see whether the previous check passed only with the help of CMA
>                  */
>                 if (alloc_flags & ALLOC_CMA &&
> -                   zone_page_state(zone, NR_FREE_CMA_PAGES) >
> -                   zone_page_state(zone, NR_FREE_PAGES) / 2) {
> +                       use_cma_first(zone, order, alloc_flags)) {
>                         page = __rmqueue_cma_fallback(zone, order);
>                         if (page)
>                                 return page;
> --
> 
Hi Zhaoyang:

I included a reproducer in the v2 patch; this may not solve the case, because
if the system is in low memory and the LRU lists are full of CMA pages, direct
reclaim still wastes time scanning and skipping them. For now we cannot tell how
many CMA pages are on the LRU, and adding a heuristic for that seems odd.
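
A toy user-space model of that overhead (the LRU length and CMA percentages
below are assumptions for illustration, not measurements):

#include <stdio.h>

/* Model one isolate_lru_folios() pass for a non-movable direct reclaimer
 * with skip_cma(): CMA folios are still scanned, but only skipped. */
static void one_pass(long lru_pages, int cma_percent)
{
	long isolated = 0, skipped = 0;

	for (long i = 0; i < lru_pages; i++) {
		if (i % 100 < cma_percent)
			skipped++;	/* skip_cma(): scanned, useless for this context */
		else
			isolated++;
	}
	printf("cma=%3d%%: scanned=%ld isolated=%ld skipped=%ld\n",
	       cma_percent, lru_pages, isolated, skipped);
}

int main(void)
{
	one_pass(100000, 50);	/* half of the scan budget is wasted */
	one_pass(100000, 100);	/* the whole pass isolates nothing */
	return 0;
}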

Brs,
Hailong.

Patch

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 452459836b71..5a146aa7c0aa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2078,6 +2078,43 @@  __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,

 }

+#ifdef CONFIG_CMA
+/*
+ * A GFP_MOVABLE allocation could drain UNMOVABLE & RECLAIMABLE page blocks
+ * with the help of CMA, which can make GFP_KERNEL fail. Check zone_watermark_ok
+ * again without ALLOC_CMA to decide whether to use CMA first.
+ */
+static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags)
+{
+       unsigned long watermark;
+       bool cma_first = false;
+
+       watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
+       /* did GFP_MOVABLE pass the previous zone_watermark_ok only with the help of CMA? */
+       if (zone_watermark_ok(zone, order, watermark, 0, alloc_flags & (~ALLOC_CMA))) {
+               /*
+                * Balance movable allocations between regular and CMA areas by
+                * allocating from CMA when over half of the zone's free memory
+                * is in the CMA area.
+                */
+               cma_first = (zone_page_state(zone, NR_FREE_CMA_PAGES) >
+                               zone_page_state(zone, NR_FREE_PAGES) / 2);
+       } else {
+               /*
+                * A failed watermark means UNMOVABLE & RECLAIMABLE pages are
+                * running short; use CMA first so that they stay around the
+                * corresponding watermark
+                */
+               cma_first = true;
+       }
+       return cma_first;
+}
+#else
+static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags)
+{
+       return false;
+}
+#endif
 /*
  * Do the hard work of removing an element from the buddy allocator.
  * Call me with the zone->lock already held.
@@ -2091,12 +2128,11 @@  __rmqueue(struct zone *zone, unsigned int order, int migratetype,
        if (IS_ENABLED(CONFIG_CMA)) {
                /*
                 * Balance movable allocations between regular and CMA areas by
-                * allocating from CMA when over half of the zone's free memory
-                * is in the CMA area.
+                * allocating from CMA based on a second zone_watermark_ok check,
+                * to see whether the previous check passed only with the help of CMA
                 */
                if (alloc_flags & ALLOC_CMA &&
-                   zone_page_state(zone, NR_FREE_CMA_PAGES) >
-                   zone_page_state(zone, NR_FREE_PAGES) / 2) {
+                       use_cma_first(zone, order, alloc_flags)) {
                        page = __rmqueue_cma_fallback(zone, order);
                        if (page)
                                return page;