
[v3,4/6] mm: Introduce a pageflag for partially mapped folios

Message ID 20240813120328.1275952-5-usamaarif642@gmail.com (mailing list archive)
State New
Series mm: split underutilized THPs

Commit Message

Usama Arif Aug. 13, 2024, 12:02 p.m. UTC
Currently folio->_deferred_list is used to keep track of
partially_mapped folios that are going to be split under memory
pressure. In the next patch, all THPs that are faulted in and collapsed
by khugepaged will also be tracked using _deferred_list.

This patch introduces a pageflag to distinguish partially mapped
folios from the others on the deferred_list at split time in
deferred_split_scan. It is needed because __folio_remove_rmap decrements
_mapcount, _large_mapcount and _entire_mapcount, so the mapcounts alone
cannot be used to tell partially mapped folios apart from the others in
deferred_split_scan.

Even though this introduces an extra flag to track whether the folio is
partially mapped, there is no functional change intended with this
patch. The flag is not useful in this patch itself; it becomes useful
in the next patch, when _deferred_list also holds non-partially-mapped
folios.

Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
 include/linux/huge_mm.h    |  4 ++--
 include/linux/page-flags.h |  3 +++
 mm/huge_memory.c           | 21 +++++++++++++--------
 mm/hugetlb.c               |  1 +
 mm/internal.h              |  4 +++-
 mm/memcontrol.c            |  3 ++-
 mm/migrate.c               |  3 ++-
 mm/page_alloc.c            |  5 +++--
 mm/rmap.c                  |  3 ++-
 mm/vmscan.c                |  3 ++-
 10 files changed, 33 insertions(+), 17 deletions(-)
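
For reference, the FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE) line added
in include/linux/page-flags.h generates the folio_test/set/clear helpers used
throughout the patch. A simplified, illustrative sketch of roughly what they
expand to (the real macros in page-flags.h are more involved):

static inline bool folio_test_partially_mapped(struct folio *folio)
{
	/* the flag lives in the flags word of the folio's second page */
	return test_bit(PG_partially_mapped, folio_flags(folio, FOLIO_SECOND_PAGE));
}

static inline void folio_set_partially_mapped(struct folio *folio)
{
	set_bit(PG_partially_mapped, folio_flags(folio, FOLIO_SECOND_PAGE));
}

static inline void folio_clear_partially_mapped(struct folio *folio)
{
	clear_bit(PG_partially_mapped, folio_flags(folio, FOLIO_SECOND_PAGE));
}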

Comments

Yu Zhao Aug. 14, 2024, 3:30 a.m. UTC | #1
On Tue, Aug 13, 2024 at 6:03 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
> Currently folio->_deferred_list is used to keep track of
> partially_mapped folios that are going to be split under memory
> pressure. In the next patch, all THPs that are faulted in and collapsed
> by khugepaged are also going to be tracked using _deferred_list.
>
> This patch introduces a pageflag to be able to distinguish between
> partially mapped folios and others in the deferred_list at split time in
> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> possible to distinguish between partially mapped folios and others in
> deferred_split_scan.
>
> Eventhough it introduces an extra flag to track if the folio is
> partially mapped, there is no functional change intended with this
> patch and the flag is not useful in this patch itself, it will
> become useful in the next patch when _deferred_list has non partially
> mapped folios.
>
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> ---
>  include/linux/huge_mm.h    |  4 ++--
>  include/linux/page-flags.h |  3 +++
>  mm/huge_memory.c           | 21 +++++++++++++--------
>  mm/hugetlb.c               |  1 +
>  mm/internal.h              |  4 +++-
>  mm/memcontrol.c            |  3 ++-
>  mm/migrate.c               |  3 ++-
>  mm/page_alloc.c            |  5 +++--
>  mm/rmap.c                  |  3 ++-
>  mm/vmscan.c                |  3 ++-
>  10 files changed, 33 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 4c32058cacfe..969f11f360d2 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
>  {
>         return split_huge_page_to_list_to_order(page, NULL, 0);
>  }
> -void deferred_split_folio(struct folio *folio);
> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
>
>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>                 unsigned long address, bool freeze, struct folio *folio);
> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
>  {
>         return 0;
>  }
> -static inline void deferred_split_folio(struct folio *folio) {}
> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
>  #define split_huge_pmd(__vma, __pmd, __address)        \
>         do { } while (0)
>
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index a0a29bd092f8..cecc1bad7910 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -182,6 +182,7 @@ enum pageflags {
>         /* At least one page in this folio has the hwpoison flag set */
>         PG_has_hwpoisoned = PG_active,
>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
> +       PG_partially_mapped, /* was identified to be partially mapped */
>  };
>
>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
> @@ -861,8 +862,10 @@ static inline void ClearPageCompound(struct page *page)
>         ClearPageHead(page);
>  }
>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>  #else
>  FOLIO_FLAG_FALSE(large_rmappable)
> +FOLIO_FLAG_FALSE(partially_mapped)
>  #endif
>
>  #define PG_head_mask ((1UL << PG_head))
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 6df0e9f4f56c..c024ab0f745c 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3397,6 +3397,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>                          * page_deferred_list.
>                          */
>                         list_del_init(&folio->_deferred_list);
> +                       folio_clear_partially_mapped(folio);
>                 }
>                 spin_unlock(&ds_queue->split_queue_lock);
>                 if (mapping) {
> @@ -3453,11 +3454,12 @@ void __folio_undo_large_rmappable(struct folio *folio)
>         if (!list_empty(&folio->_deferred_list)) {
>                 ds_queue->split_queue_len--;
>                 list_del_init(&folio->_deferred_list);
> +               folio_clear_partially_mapped(folio);
>         }
>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>  }
>
> -void deferred_split_folio(struct folio *folio)
> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  {
>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>  #ifdef CONFIG_MEMCG
> @@ -3485,14 +3487,17 @@ void deferred_split_folio(struct folio *folio)
>         if (folio_test_swapcache(folio))
>                 return;
>
> -       if (!list_empty(&folio->_deferred_list))
> -               return;
> -
>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> +       if (partially_mapped)
> +               folio_set_partially_mapped(folio);
> +       else
> +               folio_clear_partially_mapped(folio);
>         if (list_empty(&folio->_deferred_list)) {
> -               if (folio_test_pmd_mappable(folio))
> -                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> -               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> +               if (partially_mapped) {
> +                       if (folio_test_pmd_mappable(folio))
> +                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> +                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> +               }
>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>                 ds_queue->split_queue_len++;
>  #ifdef CONFIG_MEMCG
> @@ -3541,6 +3546,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>                 } else {
>                         /* We lost race with folio_put() */
>                         list_del_init(&folio->_deferred_list);
> +                       folio_clear_partially_mapped(folio);
>                         ds_queue->split_queue_len--;
>                 }
>                 if (!--sc->nr_to_scan)
> @@ -3558,7 +3564,6 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>  next:
>                 folio_put(folio);
>         }
> -

Unintentional change above?

>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>         list_splice_tail(&list, &ds_queue->split_queue);
>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 1fdd9eab240c..2ae2d9a18e40 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1758,6 +1758,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
>                 free_gigantic_folio(folio, huge_page_order(h));
>         } else {
>                 INIT_LIST_HEAD(&folio->_deferred_list);
> +               folio_clear_partially_mapped(folio);

Why does it need to clear a flag that should never be set on hugeTLB folios?

HugeTLB doesn't really use _deferred_list -- it clears it only to avoid
bad_page() because of the overlapping fields:
                        void *_hugetlb_subpool;
                        void *_hugetlb_cgroup;
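
(For context, an abridged and purely illustrative sketch of the union in
struct folio where these hugeTLB-only pointers share storage with
_deferred_list on the second tail page; surrounding fields are omitted:)

	union {
		struct {
			/* ... */
			void *_hugetlb_subpool;
			void *_hugetlb_cgroup;
			/* more hugeTLB-only fields ... */
		};
		struct {
			/* ... */
			struct list_head _deferred_list; /* overlaps the pointers above */
		};
	};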
Usama Arif Aug. 14, 2024, 10:20 a.m. UTC | #2
On 14/08/2024 04:30, Yu Zhao wrote:
>> @@ -3558,7 +3564,6 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>>  next:
>>                 folio_put(folio);
>>         }
>> -
> 
> Unintentional change above?

Yes, unintended new line, will fix it.
> 
>>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>>         list_splice_tail(&list, &ds_queue->split_queue);
>>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 1fdd9eab240c..2ae2d9a18e40 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -1758,6 +1758,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
>>                 free_gigantic_folio(folio, huge_page_order(h));
>>         } else {
>>                 INIT_LIST_HEAD(&folio->_deferred_list);
>> +               folio_clear_partially_mapped(folio);
> 
> Why does it need to clear a flag that should never be set on hugeTLB folios?
> 
> HugeTLB does really use _deferred_list -- it clears it only to avoid
> bad_page() because of the overlapping fields:
>                         void *_hugetlb_subpool;
>                         void *_hugetlb_cgroup;

Yes, that's right, will remove it. Thanks!
Barry Song Aug. 14, 2024, 10:44 a.m. UTC | #3
On Wed, Aug 14, 2024 at 12:03 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
> Currently folio->_deferred_list is used to keep track of
> partially_mapped folios that are going to be split under memory
> pressure. In the next patch, all THPs that are faulted in and collapsed
> by khugepaged are also going to be tracked using _deferred_list.
>
> This patch introduces a pageflag to be able to distinguish between
> partially mapped folios and others in the deferred_list at split time in
> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> possible to distinguish between partially mapped folios and others in
> deferred_split_scan.
>
> Eventhough it introduces an extra flag to track if the folio is
> partially mapped, there is no functional change intended with this
> patch and the flag is not useful in this patch itself, it will
> become useful in the next patch when _deferred_list has non partially
> mapped folios.
>
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> ---
>  include/linux/huge_mm.h    |  4 ++--
>  include/linux/page-flags.h |  3 +++
>  mm/huge_memory.c           | 21 +++++++++++++--------
>  mm/hugetlb.c               |  1 +
>  mm/internal.h              |  4 +++-
>  mm/memcontrol.c            |  3 ++-
>  mm/migrate.c               |  3 ++-
>  mm/page_alloc.c            |  5 +++--
>  mm/rmap.c                  |  3 ++-
>  mm/vmscan.c                |  3 ++-
>  10 files changed, 33 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 4c32058cacfe..969f11f360d2 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
>  {
>         return split_huge_page_to_list_to_order(page, NULL, 0);
>  }
> -void deferred_split_folio(struct folio *folio);
> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
>
>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>                 unsigned long address, bool freeze, struct folio *folio);
> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
>  {
>         return 0;
>  }
> -static inline void deferred_split_folio(struct folio *folio) {}
> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
>  #define split_huge_pmd(__vma, __pmd, __address)        \
>         do { } while (0)
>
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index a0a29bd092f8..cecc1bad7910 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -182,6 +182,7 @@ enum pageflags {
>         /* At least one page in this folio has the hwpoison flag set */
>         PG_has_hwpoisoned = PG_active,
>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
> +       PG_partially_mapped, /* was identified to be partially mapped */
>  };
>
>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
> @@ -861,8 +862,10 @@ static inline void ClearPageCompound(struct page *page)
>         ClearPageHead(page);
>  }
>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>  #else
>  FOLIO_FLAG_FALSE(large_rmappable)
> +FOLIO_FLAG_FALSE(partially_mapped)
>  #endif
>
>  #define PG_head_mask ((1UL << PG_head))
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 6df0e9f4f56c..c024ab0f745c 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3397,6 +3397,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>                          * page_deferred_list.
>                          */
>                         list_del_init(&folio->_deferred_list);
> +                       folio_clear_partially_mapped(folio);
>                 }
>                 spin_unlock(&ds_queue->split_queue_lock);
>                 if (mapping) {
> @@ -3453,11 +3454,12 @@ void __folio_undo_large_rmappable(struct folio *folio)
>         if (!list_empty(&folio->_deferred_list)) {
>                 ds_queue->split_queue_len--;
>                 list_del_init(&folio->_deferred_list);
> +               folio_clear_partially_mapped(folio);
>         }
>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>  }
>
> -void deferred_split_folio(struct folio *folio)
> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  {
>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>  #ifdef CONFIG_MEMCG
> @@ -3485,14 +3487,17 @@ void deferred_split_folio(struct folio *folio)
>         if (folio_test_swapcache(folio))
>                 return;
>
> -       if (!list_empty(&folio->_deferred_list))
> -               return;
> -
>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> +       if (partially_mapped)
> +               folio_set_partially_mapped(folio);
> +       else
> +               folio_clear_partially_mapped(folio);

Hi Usama,

Do we need this? When can a partially_mapped folio on the deferred_list become
non-partially-mapped and need a clear? I understand the transition from
entirely_mapped to partially_mapped is a one-way process, i.e. partially_mapped
folios can't go back to entirely_mapped?

I am trying to rebase my NR_SPLIT_DEFERRED counter on top of your work,
but this "clear" makes that job quite tricky, as I am not sure if this
patch is going to clear the partially_mapped flag for folios on the
deferred_list.

Otherwise, I can simply do the below whenever a folio is leaving the deferred_list:

        ds_queue->split_queue_len--;
        list_del_init(&folio->_deferred_list);
        if (folio_test_clear_partially_mapped(folio))
                mod_mthp_stat(folio_order(folio),
                              MTHP_STAT_NR_SPLIT_DEFERRED, -1);

>         if (list_empty(&folio->_deferred_list)) {
> -               if (folio_test_pmd_mappable(folio))
> -                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> -               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> +               if (partially_mapped) {
> +                       if (folio_test_pmd_mappable(folio))
> +                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> +                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> +               }
>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>                 ds_queue->split_queue_len++;
>  #ifdef CONFIG_MEMCG
> @@ -3541,6 +3546,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>                 } else {
>                         /* We lost race with folio_put() */
>                         list_del_init(&folio->_deferred_list);
> +                       folio_clear_partially_mapped(folio);
>                         ds_queue->split_queue_len--;
>                 }
>                 if (!--sc->nr_to_scan)
> @@ -3558,7 +3564,6 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>  next:
>                 folio_put(folio);
>         }
> -
>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>         list_splice_tail(&list, &ds_queue->split_queue);
>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 1fdd9eab240c..2ae2d9a18e40 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1758,6 +1758,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
>                 free_gigantic_folio(folio, huge_page_order(h));
>         } else {
>                 INIT_LIST_HEAD(&folio->_deferred_list);
> +               folio_clear_partially_mapped(folio);
>                 folio_put(folio);
>         }
>  }
> diff --git a/mm/internal.h b/mm/internal.h
> index 52f7fc4e8ac3..d64546b8d377 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -662,8 +662,10 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
>         atomic_set(&folio->_entire_mapcount, -1);
>         atomic_set(&folio->_nr_pages_mapped, 0);
>         atomic_set(&folio->_pincount, 0);
> -       if (order > 1)
> +       if (order > 1) {
>                 INIT_LIST_HEAD(&folio->_deferred_list);
> +               folio_clear_partially_mapped(folio);
> +       }
>  }
>
>  static inline void prep_compound_tail(struct page *head, int tail_idx)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e1ffd2950393..0fd95daecf9a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4669,7 +4669,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
>         VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
>         VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
>                         !folio_test_hugetlb(folio) &&
> -                       !list_empty(&folio->_deferred_list), folio);
> +                       !list_empty(&folio->_deferred_list) &&
> +                       folio_test_partially_mapped(folio), folio);
>
>         /*
>          * Nobody should be changing or seriously looking at
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 3288ac041d03..6e32098ac2dc 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1734,7 +1734,8 @@ static int migrate_pages_batch(struct list_head *from,
>                          * use _deferred_list.
>                          */
>                         if (nr_pages > 2 &&
> -                          !list_empty(&folio->_deferred_list)) {
> +                          !list_empty(&folio->_deferred_list) &&
> +                          folio_test_partially_mapped(folio)) {
>                                 if (!try_split_folio(folio, split_folios, mode)) {
>                                         nr_failed++;
>                                         stats->nr_thp_failed += is_thp;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 408ef3d25cf5..a145c550dd2a 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -957,8 +957,9 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
>                 break;
>         case 2:
>                 /* the second tail page: deferred_list overlaps ->mapping */
> -               if (unlikely(!list_empty(&folio->_deferred_list))) {
> -                       bad_page(page, "on deferred list");
> +               if (unlikely(!list_empty(&folio->_deferred_list) &&
> +                   folio_test_partially_mapped(folio))) {
> +                       bad_page(page, "partially mapped folio on deferred list");
>                         goto out;
>                 }
>                 break;
> diff --git a/mm/rmap.c b/mm/rmap.c
> index a6b9cd0b2b18..9ad558c2bad0 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1579,7 +1579,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>          */
>         if (partially_mapped && folio_test_anon(folio) &&
>             list_empty(&folio->_deferred_list))
> -               deferred_split_folio(folio);
> +               deferred_split_folio(folio, true);
> +
>         __folio_mod_stat(folio, -nr, -nr_pmdmapped);
>
>         /*
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 25e43bb3b574..25f4e8403f41 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1233,7 +1233,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>                                          * Split partially mapped folios right away.
>                                          * We can free the unmapped pages without IO.
>                                          */
> -                                       if (data_race(!list_empty(&folio->_deferred_list)) &&
> +                                       if (data_race(!list_empty(&folio->_deferred_list) &&
> +                                           folio_test_partially_mapped(folio)) &&
>                                             split_folio_to_list(folio, folio_list))
>                                                 goto activate_locked;
>                                 }
> --
> 2.43.5
>

Thanks
Barry
Barry Song Aug. 14, 2024, 10:52 a.m. UTC | #4
On Wed, Aug 14, 2024 at 10:44 PM Barry Song <baohua@kernel.org> wrote:
>
> On Wed, Aug 14, 2024 at 12:03 AM Usama Arif <usamaarif642@gmail.com> wrote:
> >
> > Currently folio->_deferred_list is used to keep track of
> > partially_mapped folios that are going to be split under memory
> > pressure. In the next patch, all THPs that are faulted in and collapsed
> > by khugepaged are also going to be tracked using _deferred_list.
> >
> > This patch introduces a pageflag to be able to distinguish between
> > partially mapped folios and others in the deferred_list at split time in
> > deferred_split_scan. Its needed as __folio_remove_rmap decrements
> > _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> > possible to distinguish between partially mapped folios and others in
> > deferred_split_scan.
> >
> > Eventhough it introduces an extra flag to track if the folio is
> > partially mapped, there is no functional change intended with this
> > patch and the flag is not useful in this patch itself, it will
> > become useful in the next patch when _deferred_list has non partially
> > mapped folios.
> >
> > Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> > ---
> >  include/linux/huge_mm.h    |  4 ++--
> >  include/linux/page-flags.h |  3 +++
> >  mm/huge_memory.c           | 21 +++++++++++++--------
> >  mm/hugetlb.c               |  1 +
> >  mm/internal.h              |  4 +++-
> >  mm/memcontrol.c            |  3 ++-
> >  mm/migrate.c               |  3 ++-
> >  mm/page_alloc.c            |  5 +++--
> >  mm/rmap.c                  |  3 ++-
> >  mm/vmscan.c                |  3 ++-
> >  10 files changed, 33 insertions(+), 17 deletions(-)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 4c32058cacfe..969f11f360d2 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
> >  {
> >         return split_huge_page_to_list_to_order(page, NULL, 0);
> >  }
> > -void deferred_split_folio(struct folio *folio);
> > +void deferred_split_folio(struct folio *folio, bool partially_mapped);
> >
> >  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> >                 unsigned long address, bool freeze, struct folio *folio);
> > @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
> >  {
> >         return 0;
> >  }
> > -static inline void deferred_split_folio(struct folio *folio) {}
> > +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> >  #define split_huge_pmd(__vma, __pmd, __address)        \
> >         do { } while (0)
> >
> > diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> > index a0a29bd092f8..cecc1bad7910 100644
> > --- a/include/linux/page-flags.h
> > +++ b/include/linux/page-flags.h
> > @@ -182,6 +182,7 @@ enum pageflags {
> >         /* At least one page in this folio has the hwpoison flag set */
> >         PG_has_hwpoisoned = PG_active,
> >         PG_large_rmappable = PG_workingset, /* anon or file-backed */
> > +       PG_partially_mapped, /* was identified to be partially mapped */
> >  };
> >
> >  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
> > @@ -861,8 +862,10 @@ static inline void ClearPageCompound(struct page *page)
> >         ClearPageHead(page);
> >  }
> >  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> > +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >  #else
> >  FOLIO_FLAG_FALSE(large_rmappable)
> > +FOLIO_FLAG_FALSE(partially_mapped)
> >  #endif
> >
> >  #define PG_head_mask ((1UL << PG_head))
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 6df0e9f4f56c..c024ab0f745c 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -3397,6 +3397,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >                          * page_deferred_list.
> >                          */
> >                         list_del_init(&folio->_deferred_list);
> > +                       folio_clear_partially_mapped(folio);
> >                 }
> >                 spin_unlock(&ds_queue->split_queue_lock);
> >                 if (mapping) {
> > @@ -3453,11 +3454,12 @@ void __folio_undo_large_rmappable(struct folio *folio)
> >         if (!list_empty(&folio->_deferred_list)) {
> >                 ds_queue->split_queue_len--;
> >                 list_del_init(&folio->_deferred_list);
> > +               folio_clear_partially_mapped(folio);
> >         }
> >         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> >  }
> >
> > -void deferred_split_folio(struct folio *folio)
> > +void deferred_split_folio(struct folio *folio, bool partially_mapped)
> >  {
> >         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
> >  #ifdef CONFIG_MEMCG
> > @@ -3485,14 +3487,17 @@ void deferred_split_folio(struct folio *folio)
> >         if (folio_test_swapcache(folio))
> >                 return;
> >
> > -       if (!list_empty(&folio->_deferred_list))
> > -               return;
> > -
> >         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> > +       if (partially_mapped)
> > +               folio_set_partially_mapped(folio);
> > +       else
> > +               folio_clear_partially_mapped(folio);
>
> Hi Usama,
>
> Do we need this? When can a partially_mapped folio on deferred_list become
> non-partially-mapped and need a clear? I understand transferring from
> entirely_map
> to partially_mapped is a one way process? partially_mapped folios can't go back
> to entirely_mapped?
>
> I am trying to rebase my NR_SPLIT_DEFERRED counter on top of your
> work, but this "clear" makes that job quite tricky. as I am not sure
> if this patch
> is going to clear the partially_mapped flag for folios on deferred_list.
>
> Otherwise, I can simply do the below whenever folio is leaving deferred_list
>
>         ds_queue->split_queue_len--;
>         list_del_init(&folio->_deferred_list);
>         if (folio_test_clear_partially_mapped(folio))
>                 mod_mthp_stat(folio_order(folio),
> MTHP_STAT_NR_SPLIT_DEFERRED, -1);

On the other hand, I can still make things correct with the code below,
but it looks much more tricky. I believe we at least need the first
folio_test_set_partially_mapped(), because folios on the deferred_list
can become partially_mapped from entirely_mapped.

Not quite sure if we need the second folio_test_clear_partially_mapped(folio)
in deferred_split_folio(). My understanding is that this is impossible and the
folio_clear_partially_mapped() is probably redundant.

@@ -3515,10 +3522,13 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
                return;

        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
-       if (partially_mapped)
-               folio_set_partially_mapped(folio);
-       else
-               folio_clear_partially_mapped(folio);
+       if (partially_mapped) {
+               if (!folio_test_set_partially_mapped(folio))
+                       mod_mthp_stat(folio_order(folio),
+                               MTHP_STAT_NR_SPLIT_DEFERRED, 1);
+       } else if (folio_test_clear_partially_mapped(folio)) {
+               mod_mthp_stat(folio_order(folio),
+                             MTHP_STAT_NR_SPLIT_DEFERRED, -1);
+       }
        if (list_empty(&folio->_deferred_list)) {
                if (partially_mapped) {
                        if (folio_test_pmd_mappable(folio))

>
> >         if (list_empty(&folio->_deferred_list)) {
> > -               if (folio_test_pmd_mappable(folio))
> > -                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> > -               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> > +               if (partially_mapped) {
> > +                       if (folio_test_pmd_mappable(folio))
> > +                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> > +                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> > +               }
> >                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> >                 ds_queue->split_queue_len++;
> >  #ifdef CONFIG_MEMCG
> > @@ -3541,6 +3546,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> >                 } else {
> >                         /* We lost race with folio_put() */
> >                         list_del_init(&folio->_deferred_list);
> > +                       folio_clear_partially_mapped(folio);
> >                         ds_queue->split_queue_len--;
> >                 }
> >                 if (!--sc->nr_to_scan)
> > @@ -3558,7 +3564,6 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> >  next:
> >                 folio_put(folio);
> >         }
> > -
> >         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> >         list_splice_tail(&list, &ds_queue->split_queue);
> >         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 1fdd9eab240c..2ae2d9a18e40 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1758,6 +1758,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
> >                 free_gigantic_folio(folio, huge_page_order(h));
> >         } else {
> >                 INIT_LIST_HEAD(&folio->_deferred_list);
> > +               folio_clear_partially_mapped(folio);
> >                 folio_put(folio);
> >         }
> >  }
> > diff --git a/mm/internal.h b/mm/internal.h
> > index 52f7fc4e8ac3..d64546b8d377 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -662,8 +662,10 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
> >         atomic_set(&folio->_entire_mapcount, -1);
> >         atomic_set(&folio->_nr_pages_mapped, 0);
> >         atomic_set(&folio->_pincount, 0);
> > -       if (order > 1)
> > +       if (order > 1) {
> >                 INIT_LIST_HEAD(&folio->_deferred_list);
> > +               folio_clear_partially_mapped(folio);
> > +       }
> >  }
> >
> >  static inline void prep_compound_tail(struct page *head, int tail_idx)
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index e1ffd2950393..0fd95daecf9a 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -4669,7 +4669,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
> >         VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
> >         VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
> >                         !folio_test_hugetlb(folio) &&
> > -                       !list_empty(&folio->_deferred_list), folio);
> > +                       !list_empty(&folio->_deferred_list) &&
> > +                       folio_test_partially_mapped(folio), folio);
> >
> >         /*
> >          * Nobody should be changing or seriously looking at
> > diff --git a/mm/migrate.c b/mm/migrate.c
> > index 3288ac041d03..6e32098ac2dc 100644
> > --- a/mm/migrate.c
> > +++ b/mm/migrate.c
> > @@ -1734,7 +1734,8 @@ static int migrate_pages_batch(struct list_head *from,
> >                          * use _deferred_list.
> >                          */
> >                         if (nr_pages > 2 &&
> > -                          !list_empty(&folio->_deferred_list)) {
> > +                          !list_empty(&folio->_deferred_list) &&
> > +                          folio_test_partially_mapped(folio)) {
> >                                 if (!try_split_folio(folio, split_folios, mode)) {
> >                                         nr_failed++;
> >                                         stats->nr_thp_failed += is_thp;
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 408ef3d25cf5..a145c550dd2a 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -957,8 +957,9 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
> >                 break;
> >         case 2:
> >                 /* the second tail page: deferred_list overlaps ->mapping */
> > -               if (unlikely(!list_empty(&folio->_deferred_list))) {
> > -                       bad_page(page, "on deferred list");
> > +               if (unlikely(!list_empty(&folio->_deferred_list) &&
> > +                   folio_test_partially_mapped(folio))) {
> > +                       bad_page(page, "partially mapped folio on deferred list");
> >                         goto out;
> >                 }
> >                 break;
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index a6b9cd0b2b18..9ad558c2bad0 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -1579,7 +1579,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
> >          */
> >         if (partially_mapped && folio_test_anon(folio) &&
> >             list_empty(&folio->_deferred_list))
> > -               deferred_split_folio(folio);
> > +               deferred_split_folio(folio, true);
> > +
> >         __folio_mod_stat(folio, -nr, -nr_pmdmapped);
> >
> >         /*
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 25e43bb3b574..25f4e8403f41 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -1233,7 +1233,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> >                                          * Split partially mapped folios right away.
> >                                          * We can free the unmapped pages without IO.
> >                                          */
> > -                                       if (data_race(!list_empty(&folio->_deferred_list)) &&
> > +                                       if (data_race(!list_empty(&folio->_deferred_list) &&
> > +                                           folio_test_partially_mapped(folio)) &&
> >                                             split_folio_to_list(folio, folio_list))
> >                                                 goto activate_locked;
> >                                 }
> > --
> > 2.43.5
> >
>
> Thanks
> Barry
Barry Song Aug. 14, 2024, 11:10 a.m. UTC | #5
On Wed, Aug 14, 2024 at 12:03 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
> Currently folio->_deferred_list is used to keep track of
> partially_mapped folios that are going to be split under memory
> pressure. In the next patch, all THPs that are faulted in and collapsed
> by khugepaged are also going to be tracked using _deferred_list.
>
> This patch introduces a pageflag to be able to distinguish between
> partially mapped folios and others in the deferred_list at split time in
> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> possible to distinguish between partially mapped folios and others in
> deferred_split_scan.
>
> Eventhough it introduces an extra flag to track if the folio is
> partially mapped, there is no functional change intended with this
> patch and the flag is not useful in this patch itself, it will
> become useful in the next patch when _deferred_list has non partially
> mapped folios.
>
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> ---
>  include/linux/huge_mm.h    |  4 ++--
>  include/linux/page-flags.h |  3 +++
>  mm/huge_memory.c           | 21 +++++++++++++--------
>  mm/hugetlb.c               |  1 +
>  mm/internal.h              |  4 +++-
>  mm/memcontrol.c            |  3 ++-
>  mm/migrate.c               |  3 ++-
>  mm/page_alloc.c            |  5 +++--
>  mm/rmap.c                  |  3 ++-
>  mm/vmscan.c                |  3 ++-
>  10 files changed, 33 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 4c32058cacfe..969f11f360d2 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
>  {
>         return split_huge_page_to_list_to_order(page, NULL, 0);
>  }
> -void deferred_split_folio(struct folio *folio);
> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
>
>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>                 unsigned long address, bool freeze, struct folio *folio);
> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
>  {
>         return 0;
>  }
> -static inline void deferred_split_folio(struct folio *folio) {}
> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
>  #define split_huge_pmd(__vma, __pmd, __address)        \
>         do { } while (0)
>
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index a0a29bd092f8..cecc1bad7910 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -182,6 +182,7 @@ enum pageflags {
>         /* At least one page in this folio has the hwpoison flag set */
>         PG_has_hwpoisoned = PG_active,
>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
> +       PG_partially_mapped, /* was identified to be partially mapped */
>  };
>
>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
> @@ -861,8 +862,10 @@ static inline void ClearPageCompound(struct page *page)
>         ClearPageHead(page);
>  }
>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>  #else
>  FOLIO_FLAG_FALSE(large_rmappable)
> +FOLIO_FLAG_FALSE(partially_mapped)
>  #endif
>
>  #define PG_head_mask ((1UL << PG_head))
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 6df0e9f4f56c..c024ab0f745c 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3397,6 +3397,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>                          * page_deferred_list.
>                          */
>                         list_del_init(&folio->_deferred_list);
> +                       folio_clear_partially_mapped(folio);
>                 }
>                 spin_unlock(&ds_queue->split_queue_lock);
>                 if (mapping) {
> @@ -3453,11 +3454,12 @@ void __folio_undo_large_rmappable(struct folio *folio)
>         if (!list_empty(&folio->_deferred_list)) {
>                 ds_queue->split_queue_len--;
>                 list_del_init(&folio->_deferred_list);
> +               folio_clear_partially_mapped(folio);
>         }
>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>  }
>
> -void deferred_split_folio(struct folio *folio)
> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  {
>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>  #ifdef CONFIG_MEMCG
> @@ -3485,14 +3487,17 @@ void deferred_split_folio(struct folio *folio)
>         if (folio_test_swapcache(folio))
>                 return;
>
> -       if (!list_empty(&folio->_deferred_list))
> -               return;
> -
>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> +       if (partially_mapped)
> +               folio_set_partially_mapped(folio);
> +       else
> +               folio_clear_partially_mapped(folio);
>         if (list_empty(&folio->_deferred_list)) {
> -               if (folio_test_pmd_mappable(folio))
> -                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> -               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> +               if (partially_mapped) {
> +                       if (folio_test_pmd_mappable(folio))
> +                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> +                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);

This code completely broke MTHP_STAT_SPLIT_DEFERRED for PMD_ORDER. It
added the folio to the deferred_list as entirely_mapped
(partially_mapped == false). However, when partially_mapped later becomes
true, there is no opportunity to add it again and count it, as the folio
is already on the list. Are you consistently seeing the counter for
PMD_ORDER as 0?
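
One possible way to keep the counter working for folios that are already on
the list (an untested sketch; it assumes a folio_test_set_partially_mapped()
helper as in the earlier snippet, and counts on the first transition to
partially mapped rather than on list insertion):

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	if (partially_mapped && !folio_test_set_partially_mapped(folio)) {
		/* first transition to partially mapped: count it even if the
		 * folio was already queued as entirely mapped
		 */
		if (folio_test_pmd_mappable(folio))
			count_vm_event(THP_DEFERRED_SPLIT_PAGE);
		count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
	}
	if (list_empty(&folio->_deferred_list)) {
		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
		ds_queue->split_queue_len++;
	}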

> +               }
>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>                 ds_queue->split_queue_len++;
>  #ifdef CONFIG_MEMCG
> @@ -3541,6 +3546,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>                 } else {
>                         /* We lost race with folio_put() */
>                         list_del_init(&folio->_deferred_list);
> +                       folio_clear_partially_mapped(folio);
>                         ds_queue->split_queue_len--;
>                 }
>                 if (!--sc->nr_to_scan)
> @@ -3558,7 +3564,6 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>  next:
>                 folio_put(folio);
>         }
> -
>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>         list_splice_tail(&list, &ds_queue->split_queue);
>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 1fdd9eab240c..2ae2d9a18e40 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1758,6 +1758,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
>                 free_gigantic_folio(folio, huge_page_order(h));
>         } else {
>                 INIT_LIST_HEAD(&folio->_deferred_list);
> +               folio_clear_partially_mapped(folio);
>                 folio_put(folio);
>         }
>  }
> diff --git a/mm/internal.h b/mm/internal.h
> index 52f7fc4e8ac3..d64546b8d377 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -662,8 +662,10 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
>         atomic_set(&folio->_entire_mapcount, -1);
>         atomic_set(&folio->_nr_pages_mapped, 0);
>         atomic_set(&folio->_pincount, 0);
> -       if (order > 1)
> +       if (order > 1) {
>                 INIT_LIST_HEAD(&folio->_deferred_list);
> +               folio_clear_partially_mapped(folio);
> +       }
>  }
>
>  static inline void prep_compound_tail(struct page *head, int tail_idx)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e1ffd2950393..0fd95daecf9a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4669,7 +4669,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
>         VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
>         VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
>                         !folio_test_hugetlb(folio) &&
> -                       !list_empty(&folio->_deferred_list), folio);
> +                       !list_empty(&folio->_deferred_list) &&
> +                       folio_test_partially_mapped(folio), folio);
>
>         /*
>          * Nobody should be changing or seriously looking at
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 3288ac041d03..6e32098ac2dc 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1734,7 +1734,8 @@ static int migrate_pages_batch(struct list_head *from,
>                          * use _deferred_list.
>                          */
>                         if (nr_pages > 2 &&
> -                          !list_empty(&folio->_deferred_list)) {
> +                          !list_empty(&folio->_deferred_list) &&
> +                          folio_test_partially_mapped(folio)) {
>                                 if (!try_split_folio(folio, split_folios, mode)) {
>                                         nr_failed++;
>                                         stats->nr_thp_failed += is_thp;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 408ef3d25cf5..a145c550dd2a 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -957,8 +957,9 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
>                 break;
>         case 2:
>                 /* the second tail page: deferred_list overlaps ->mapping */
> -               if (unlikely(!list_empty(&folio->_deferred_list))) {
> -                       bad_page(page, "on deferred list");
> +               if (unlikely(!list_empty(&folio->_deferred_list) &&
> +                   folio_test_partially_mapped(folio))) {
> +                       bad_page(page, "partially mapped folio on deferred list");
>                         goto out;
>                 }
>                 break;
> diff --git a/mm/rmap.c b/mm/rmap.c
> index a6b9cd0b2b18..9ad558c2bad0 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1579,7 +1579,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>          */
>         if (partially_mapped && folio_test_anon(folio) &&
>             list_empty(&folio->_deferred_list))
> -               deferred_split_folio(folio);
> +               deferred_split_folio(folio, true);
> +
>         __folio_mod_stat(folio, -nr, -nr_pmdmapped);
>
>         /*
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 25e43bb3b574..25f4e8403f41 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1233,7 +1233,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>                                          * Split partially mapped folios right away.
>                                          * We can free the unmapped pages without IO.
>                                          */
> -                                       if (data_race(!list_empty(&folio->_deferred_list)) &&
> +                                       if (data_race(!list_empty(&folio->_deferred_list) &&
> +                                           folio_test_partially_mapped(folio)) &&
>                                             split_folio_to_list(folio, folio_list))
>                                                 goto activate_locked;
>                                 }
> --
> 2.43.5
>
Usama Arif Aug. 14, 2024, 11:11 a.m. UTC | #6
On 14/08/2024 11:44, Barry Song wrote:
> On Wed, Aug 14, 2024 at 12:03 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>
>> Currently folio->_deferred_list is used to keep track of
>> partially_mapped folios that are going to be split under memory
>> pressure. In the next patch, all THPs that are faulted in and collapsed
>> by khugepaged are also going to be tracked using _deferred_list.
>>
>> This patch introduces a pageflag to be able to distinguish between
>> partially mapped folios and others in the deferred_list at split time in
>> deferred_split_scan. Its needed as __folio_remove_rmap decrements
>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
>> possible to distinguish between partially mapped folios and others in
>> deferred_split_scan.
>>
>> Eventhough it introduces an extra flag to track if the folio is
>> partially mapped, there is no functional change intended with this
>> patch and the flag is not useful in this patch itself, it will
>> become useful in the next patch when _deferred_list has non partially
>> mapped folios.
>>
>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>> ---
>>  include/linux/huge_mm.h    |  4 ++--
>>  include/linux/page-flags.h |  3 +++
>>  mm/huge_memory.c           | 21 +++++++++++++--------
>>  mm/hugetlb.c               |  1 +
>>  mm/internal.h              |  4 +++-
>>  mm/memcontrol.c            |  3 ++-
>>  mm/migrate.c               |  3 ++-
>>  mm/page_alloc.c            |  5 +++--
>>  mm/rmap.c                  |  3 ++-
>>  mm/vmscan.c                |  3 ++-
>>  10 files changed, 33 insertions(+), 17 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 4c32058cacfe..969f11f360d2 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
>>  {
>>         return split_huge_page_to_list_to_order(page, NULL, 0);
>>  }
>> -void deferred_split_folio(struct folio *folio);
>> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
>>
>>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>>                 unsigned long address, bool freeze, struct folio *folio);
>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
>>  {
>>         return 0;
>>  }
>> -static inline void deferred_split_folio(struct folio *folio) {}
>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
>>  #define split_huge_pmd(__vma, __pmd, __address)        \
>>         do { } while (0)
>>
>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
>> index a0a29bd092f8..cecc1bad7910 100644
>> --- a/include/linux/page-flags.h
>> +++ b/include/linux/page-flags.h
>> @@ -182,6 +182,7 @@ enum pageflags {
>>         /* At least one page in this folio has the hwpoison flag set */
>>         PG_has_hwpoisoned = PG_active,
>>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
>> +       PG_partially_mapped, /* was identified to be partially mapped */
>>  };
>>
>>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
>> @@ -861,8 +862,10 @@ static inline void ClearPageCompound(struct page *page)
>>         ClearPageHead(page);
>>  }
>>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
>> +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>  #else
>>  FOLIO_FLAG_FALSE(large_rmappable)
>> +FOLIO_FLAG_FALSE(partially_mapped)
>>  #endif
>>
>>  #define PG_head_mask ((1UL << PG_head))
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 6df0e9f4f56c..c024ab0f745c 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3397,6 +3397,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>                          * page_deferred_list.
>>                          */
>>                         list_del_init(&folio->_deferred_list);
>> +                       folio_clear_partially_mapped(folio);
>>                 }
>>                 spin_unlock(&ds_queue->split_queue_lock);
>>                 if (mapping) {
>> @@ -3453,11 +3454,12 @@ void __folio_undo_large_rmappable(struct folio *folio)
>>         if (!list_empty(&folio->_deferred_list)) {
>>                 ds_queue->split_queue_len--;
>>                 list_del_init(&folio->_deferred_list);
>> +               folio_clear_partially_mapped(folio);
>>         }
>>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>>  }
>>
>> -void deferred_split_folio(struct folio *folio)
>> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>>  {
>>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>>  #ifdef CONFIG_MEMCG
>> @@ -3485,14 +3487,17 @@ void deferred_split_folio(struct folio *folio)
>>         if (folio_test_swapcache(folio))
>>                 return;
>>
>> -       if (!list_empty(&folio->_deferred_list))
>> -               return;
>> -
>>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>> +       if (partially_mapped)
>> +               folio_set_partially_mapped(folio);
>> +       else
>> +               folio_clear_partially_mapped(folio);
> 
> Hi Usama,
> 
> Do we need this? When can a partially_mapped folio on deferred_list become
> non-partially-mapped and need a clear? I understand transferring from
> entirely_map
> to partially_mapped is a one way process? partially_mapped folios can't go back
> to entirely_mapped?
> 
Hi Barry,

After this series, the deferred_split_folio function is called in 3 places: at fault, at collapse and on partial mapping. Partial mapping can only happen after fault/collapse, and we have FOLIO_FLAG_FALSE(partially_mapped), i.e. the flag is initialized to false, so technically the clear is not needed. A partially_mapped folio on the deferred list won't become non-partially-mapped.

I just did it as a precaution in case someone ever changes the kernel and calls deferred_split_folio with partially_mapped set to false after it had been true. The function arguments of deferred_split_folio make it seem that passing partially_mapped=false would clear the flag, which is why I cleared it as well. I could change the patch to something like the below if it makes things better, i.e. add a comment at the top of the function:


-void deferred_split_folio(struct folio *folio)
+/* partially_mapped=false won't clear PG_partially_mapped folio flag */
+void deferred_split_folio(struct folio *folio, bool partially_mapped)
 {
        struct deferred_split *ds_queue = get_deferred_split_queue(folio);
 #ifdef CONFIG_MEMCG
@@ -3485,14 +3488,15 @@ void deferred_split_folio(struct folio *folio)
        if (folio_test_swapcache(folio))
                return;
 
-       if (!list_empty(&folio->_deferred_list))
-               return;
-
        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+       if (partially_mapped)
+               folio_set_partially_mapped(folio);
        if (list_empty(&folio->_deferred_list)) {
-               if (folio_test_pmd_mappable(folio))
-                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
-               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
+               if (partially_mapped) {
+                       if (folio_test_pmd_mappable(folio))
+                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
+                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
+               }
                list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
                ds_queue->split_queue_len++;
 #ifdef CONFIG_MEMCG


> I am trying to rebase my NR_SPLIT_DEFERRED counter on top of your
> work, but this "clear" makes that job quite tricky. as I am not sure
> if this patch
> is going to clear the partially_mapped flag for folios on deferred_list.
> 
> Otherwise, I can simply do the below whenever folio is leaving deferred_list
> 
>         ds_queue->split_queue_len--;
>         list_del_init(&folio->_deferred_list);
>         if (folio_test_clear_partially_mapped(folio))
>                 mod_mthp_stat(folio_order(folio),
> MTHP_STAT_NR_SPLIT_DEFERRED, -1);
>
Barry Song Aug. 14, 2024, 11:20 a.m. UTC | #7
On Wed, Aug 14, 2024 at 11:11 PM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 14/08/2024 11:44, Barry Song wrote:
> > On Wed, Aug 14, 2024 at 12:03 AM Usama Arif <usamaarif642@gmail.com> wrote:
> >>
> >> Currently folio->_deferred_list is used to keep track of
> >> partially_mapped folios that are going to be split under memory
> >> pressure. In the next patch, all THPs that are faulted in and collapsed
> >> by khugepaged are also going to be tracked using _deferred_list.
> >>
> >> This patch introduces a pageflag to be able to distinguish between
> >> partially mapped folios and others in the deferred_list at split time in
> >> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> >> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> >> possible to distinguish between partially mapped folios and others in
> >> deferred_split_scan.
> >>
> >> Eventhough it introduces an extra flag to track if the folio is
> >> partially mapped, there is no functional change intended with this
> >> patch and the flag is not useful in this patch itself, it will
> >> become useful in the next patch when _deferred_list has non partially
> >> mapped folios.
> >>
> >> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >> ---
> >>  include/linux/huge_mm.h    |  4 ++--
> >>  include/linux/page-flags.h |  3 +++
> >>  mm/huge_memory.c           | 21 +++++++++++++--------
> >>  mm/hugetlb.c               |  1 +
> >>  mm/internal.h              |  4 +++-
> >>  mm/memcontrol.c            |  3 ++-
> >>  mm/migrate.c               |  3 ++-
> >>  mm/page_alloc.c            |  5 +++--
> >>  mm/rmap.c                  |  3 ++-
> >>  mm/vmscan.c                |  3 ++-
> >>  10 files changed, 33 insertions(+), 17 deletions(-)
> >>
> >> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >> index 4c32058cacfe..969f11f360d2 100644
> >> --- a/include/linux/huge_mm.h
> >> +++ b/include/linux/huge_mm.h
> >> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
> >>  {
> >>         return split_huge_page_to_list_to_order(page, NULL, 0);
> >>  }
> >> -void deferred_split_folio(struct folio *folio);
> >> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
> >>
> >>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> >>                 unsigned long address, bool freeze, struct folio *folio);
> >> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
> >>  {
> >>         return 0;
> >>  }
> >> -static inline void deferred_split_folio(struct folio *folio) {}
> >> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> >>  #define split_huge_pmd(__vma, __pmd, __address)        \
> >>         do { } while (0)
> >>
> >> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> >> index a0a29bd092f8..cecc1bad7910 100644
> >> --- a/include/linux/page-flags.h
> >> +++ b/include/linux/page-flags.h
> >> @@ -182,6 +182,7 @@ enum pageflags {
> >>         /* At least one page in this folio has the hwpoison flag set */
> >>         PG_has_hwpoisoned = PG_active,
> >>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
> >> +       PG_partially_mapped, /* was identified to be partially mapped */
> >>  };
> >>
> >>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
> >> @@ -861,8 +862,10 @@ static inline void ClearPageCompound(struct page *page)
> >>         ClearPageHead(page);
> >>  }
> >>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> >> +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >>  #else
> >>  FOLIO_FLAG_FALSE(large_rmappable)
> >> +FOLIO_FLAG_FALSE(partially_mapped)
> >>  #endif
> >>
> >>  #define PG_head_mask ((1UL << PG_head))
> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >> index 6df0e9f4f56c..c024ab0f745c 100644
> >> --- a/mm/huge_memory.c
> >> +++ b/mm/huge_memory.c
> >> @@ -3397,6 +3397,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >>                          * page_deferred_list.
> >>                          */
> >>                         list_del_init(&folio->_deferred_list);
> >> +                       folio_clear_partially_mapped(folio);
> >>                 }
> >>                 spin_unlock(&ds_queue->split_queue_lock);
> >>                 if (mapping) {
> >> @@ -3453,11 +3454,12 @@ void __folio_undo_large_rmappable(struct folio *folio)
> >>         if (!list_empty(&folio->_deferred_list)) {
> >>                 ds_queue->split_queue_len--;
> >>                 list_del_init(&folio->_deferred_list);
> >> +               folio_clear_partially_mapped(folio);
> >>         }
> >>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> >>  }
> >>
> >> -void deferred_split_folio(struct folio *folio)
> >> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
> >>  {
> >>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
> >>  #ifdef CONFIG_MEMCG
> >> @@ -3485,14 +3487,17 @@ void deferred_split_folio(struct folio *folio)
> >>         if (folio_test_swapcache(folio))
> >>                 return;
> >>
> >> -       if (!list_empty(&folio->_deferred_list))
> >> -               return;
> >> -
> >>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> >> +       if (partially_mapped)
> >> +               folio_set_partially_mapped(folio);
> >> +       else
> >> +               folio_clear_partially_mapped(folio);
> >
> > Hi Usama,
> >
> > Do we need this? When can a partially_mapped folio on deferred_list become
> > non-partially-mapped and need a clear? I understand transferring from
> > entirely_map
> > to partially_mapped is a one way process? partially_mapped folios can't go back
> > to entirely_mapped?
> >
> Hi Barry,
>
> deferred_split_folio function is called in 3 places after this series, at fault, collapse and partial mapping. partial mapping can only happen after fault/collapse, and we have FOLIO_FLAG_FALSE(partially_mapped), i.e. flag initialized to false, so technically its not needed. A partially_mapped folio on deferred list wont become non-partially mapped.
>
> I just did it as a precaution if someone ever changes the kernel and calls deferred_split_folio with partially_mapped set to false after it had been true. The function arguments of deferred_split_folio make it seem that passing partially_mapped=false as an argument would clear it, which is why I cleared it as well. I could change the patch to something like below if it makes things better? i.e. add a comment at the top of the function:
>

To me, it seems like a BUG to call with false after a folio has become
partially_mapped, so I'd rather do:
VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);

The below should also fix the MTHP_STAT_SPLIT_DEFERRED
counter this patch is breaking?

@@ -3515,16 +3522,18 @@ void deferred_split_folio(struct folio *folio,
bool partially_mapped)
                return;

        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
-       if (partially_mapped)
-               folio_set_partially_mapped(folio);
-       else
-               folio_clear_partially_mapped(folio);
-       if (list_empty(&folio->_deferred_list)) {
-               if (partially_mapped) {
+       if (partially_mapped) {
+               if (!folio_test_set_partially_mapped(folio)) {
+                       mod_mthp_stat(folio_order(folio),
+                               MTHP_STAT_NR_SPLIT_DEFERRED, 1);
                        if (folio_test_pmd_mappable(folio))
                                count_vm_event(THP_DEFERRED_SPLIT_PAGE);
                        count_mthp_stat(folio_order(folio),
MTHP_STAT_SPLIT_DEFERRED);
                }
+       }
+       VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
+
+       if (list_empty(&folio->_deferred_list)) {
                list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
                ds_queue->split_queue_len++;
 #ifdef CONFIG_MEMCG
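
A quick illustration of why the folio_test_set_partially_mapped() form above counts each folio at most once (a toy userspace model, not kernel code; all names are stand-ins, and a plain bool stands in for the atomic page-flag op): the helper sets the flag and returns its previous value, so the stats block only runs on the false -> true transition.

#include <stdbool.h>
#include <stdio.h>

struct toy_folio {
        bool partially_mapped;                  /* stands in for PG_partially_mapped */
};

static int split_deferred;                      /* stands in for MTHP_STAT_SPLIT_DEFERRED */

/* models folio_test_set_partially_mapped(): set the bit, return the old value */
static bool toy_test_set_partially_mapped(struct toy_folio *folio)
{
        bool old = folio->partially_mapped;

        folio->partially_mapped = true;
        return old;
}

static void toy_deferred_split_folio(struct toy_folio *folio, bool partially_mapped)
{
        if (partially_mapped && !toy_test_set_partially_mapped(folio))
                split_deferred++;               /* only on the false -> true transition */
}

int main(void)
{
        struct toy_folio folio = { .partially_mapped = false };

        toy_deferred_split_folio(&folio, false);        /* fault/collapse: no count */
        toy_deferred_split_folio(&folio, true);         /* first partial unmap: +1 */
        toy_deferred_split_folio(&folio, true);         /* repeat: not double counted */
        printf("SPLIT_DEFERRED = %d\n", split_deferred);        /* prints 1 */
        return 0;
}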


>
> -void deferred_split_folio(struct folio *folio)
> +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  {
>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>  #ifdef CONFIG_MEMCG
> @@ -3485,14 +3488,15 @@ void deferred_split_folio(struct folio *folio)
>         if (folio_test_swapcache(folio))
>                 return;
>
> -       if (!list_empty(&folio->_deferred_list))
> -               return;
> -
>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> +       if (partially_mapped)
> +               folio_set_partially_mapped(folio);
>         if (list_empty(&folio->_deferred_list)) {
> -               if (folio_test_pmd_mappable(folio))
> -                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> -               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> +               if (partially_mapped) {
> +                       if (folio_test_pmd_mappable(folio))
> +                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> +                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> +               }
>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>                 ds_queue->split_queue_len++;
>  #ifdef CONFIG_MEMCG
>
>
> > I am trying to rebase my NR_SPLIT_DEFERRED counter on top of your
> > work, but this "clear" makes that job quite tricky. as I am not sure
> > if this patch
> > is going to clear the partially_mapped flag for folios on deferred_list.
> >
> > Otherwise, I can simply do the below whenever folio is leaving deferred_list
> >
> >         ds_queue->split_queue_len--;
> >         list_del_init(&folio->_deferred_list);
> >         if (folio_test_clear_partially_mapped(folio))
> >                 mod_mthp_stat(folio_order(folio),
> > MTHP_STAT_NR_SPLIT_DEFERRED, -1);
> >
>

Thanks
Barry
Usama Arif Aug. 14, 2024, 11:20 a.m. UTC | #8
On 14/08/2024 12:10, Barry Song wrote:
> On Wed, Aug 14, 2024 at 12:03 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>
>> Currently folio->_deferred_list is used to keep track of
>> partially_mapped folios that are going to be split under memory
>> pressure. In the next patch, all THPs that are faulted in and collapsed
>> by khugepaged are also going to be tracked using _deferred_list.
>>
>> This patch introduces a pageflag to be able to distinguish between
>> partially mapped folios and others in the deferred_list at split time in
>> deferred_split_scan. Its needed as __folio_remove_rmap decrements
>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
>> possible to distinguish between partially mapped folios and others in
>> deferred_split_scan.
>>
>> Eventhough it introduces an extra flag to track if the folio is
>> partially mapped, there is no functional change intended with this
>> patch and the flag is not useful in this patch itself, it will
>> become useful in the next patch when _deferred_list has non partially
>> mapped folios.
>>
>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>> ---
>>  include/linux/huge_mm.h    |  4 ++--
>>  include/linux/page-flags.h |  3 +++
>>  mm/huge_memory.c           | 21 +++++++++++++--------
>>  mm/hugetlb.c               |  1 +
>>  mm/internal.h              |  4 +++-
>>  mm/memcontrol.c            |  3 ++-
>>  mm/migrate.c               |  3 ++-
>>  mm/page_alloc.c            |  5 +++--
>>  mm/rmap.c                  |  3 ++-
>>  mm/vmscan.c                |  3 ++-
>>  10 files changed, 33 insertions(+), 17 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 4c32058cacfe..969f11f360d2 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
>>  {
>>         return split_huge_page_to_list_to_order(page, NULL, 0);
>>  }
>> -void deferred_split_folio(struct folio *folio);
>> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
>>
>>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>>                 unsigned long address, bool freeze, struct folio *folio);
>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
>>  {
>>         return 0;
>>  }
>> -static inline void deferred_split_folio(struct folio *folio) {}
>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
>>  #define split_huge_pmd(__vma, __pmd, __address)        \
>>         do { } while (0)
>>
>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
>> index a0a29bd092f8..cecc1bad7910 100644
>> --- a/include/linux/page-flags.h
>> +++ b/include/linux/page-flags.h
>> @@ -182,6 +182,7 @@ enum pageflags {
>>         /* At least one page in this folio has the hwpoison flag set */
>>         PG_has_hwpoisoned = PG_active,
>>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
>> +       PG_partially_mapped, /* was identified to be partially mapped */
>>  };
>>
>>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
>> @@ -861,8 +862,10 @@ static inline void ClearPageCompound(struct page *page)
>>         ClearPageHead(page);
>>  }
>>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
>> +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>  #else
>>  FOLIO_FLAG_FALSE(large_rmappable)
>> +FOLIO_FLAG_FALSE(partially_mapped)
>>  #endif
>>
>>  #define PG_head_mask ((1UL << PG_head))
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 6df0e9f4f56c..c024ab0f745c 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3397,6 +3397,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>                          * page_deferred_list.
>>                          */
>>                         list_del_init(&folio->_deferred_list);
>> +                       folio_clear_partially_mapped(folio);
>>                 }
>>                 spin_unlock(&ds_queue->split_queue_lock);
>>                 if (mapping) {
>> @@ -3453,11 +3454,12 @@ void __folio_undo_large_rmappable(struct folio *folio)
>>         if (!list_empty(&folio->_deferred_list)) {
>>                 ds_queue->split_queue_len--;
>>                 list_del_init(&folio->_deferred_list);
>> +               folio_clear_partially_mapped(folio);
>>         }
>>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>>  }
>>
>> -void deferred_split_folio(struct folio *folio)
>> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>>  {
>>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>>  #ifdef CONFIG_MEMCG
>> @@ -3485,14 +3487,17 @@ void deferred_split_folio(struct folio *folio)
>>         if (folio_test_swapcache(folio))
>>                 return;
>>
>> -       if (!list_empty(&folio->_deferred_list))
>> -               return;
>> -
>>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>> +       if (partially_mapped)
>> +               folio_set_partially_mapped(folio);
>> +       else
>> +               folio_clear_partially_mapped(folio);
>>         if (list_empty(&folio->_deferred_list)) {
>> -               if (folio_test_pmd_mappable(folio))
>> -                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>> -               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>> +               if (partially_mapped) {
>> +                       if (folio_test_pmd_mappable(folio))
>> +                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>> +                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> 
> This code completely broke MTHP_STAT_SPLIT_DEFERRED for PMD_ORDER. It
> added the folio to the deferred_list as entirely_mapped
> (partially_mapped == false).
> However, when partially_mapped becomes true, there's no opportunity to
> add it again
> as it has been there on the list. Are you consistently seeing the counter for
> PMD_ORDER as 0?
> 

Ah I see it, this should fix it?

-void deferred_split_folio(struct folio *folio)
+/* partially_mapped=false won't clear PG_partially_mapped folio flag */
+void deferred_split_folio(struct folio *folio, bool partially_mapped)
 {
        struct deferred_split *ds_queue = get_deferred_split_queue(folio);
 #ifdef CONFIG_MEMCG
@@ -3485,14 +3488,14 @@ void deferred_split_folio(struct folio *folio)
        if (folio_test_swapcache(folio))
                return;
 
-       if (!list_empty(&folio->_deferred_list))
-               return;
-
        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
-       if (list_empty(&folio->_deferred_list)) {
+       if (partially_mapped) {
+               folio_set_partially_mapped(folio);
                if (folio_test_pmd_mappable(folio))
                        count_vm_event(THP_DEFERRED_SPLIT_PAGE);
                count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
+       }
+       if (list_empty(&folio->_deferred_list)) {
                list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
                ds_queue->split_queue_len++;
 #ifdef CONFIG_MEMCG
Barry Song Aug. 14, 2024, 11:23 a.m. UTC | #9
On Wed, Aug 14, 2024 at 11:20 PM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 14/08/2024 12:10, Barry Song wrote:
> > On Wed, Aug 14, 2024 at 12:03 AM Usama Arif <usamaarif642@gmail.com> wrote:
> >>
> >> Currently folio->_deferred_list is used to keep track of
> >> partially_mapped folios that are going to be split under memory
> >> pressure. In the next patch, all THPs that are faulted in and collapsed
> >> by khugepaged are also going to be tracked using _deferred_list.
> >>
> >> This patch introduces a pageflag to be able to distinguish between
> >> partially mapped folios and others in the deferred_list at split time in
> >> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> >> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> >> possible to distinguish between partially mapped folios and others in
> >> deferred_split_scan.
> >>
> >> Eventhough it introduces an extra flag to track if the folio is
> >> partially mapped, there is no functional change intended with this
> >> patch and the flag is not useful in this patch itself, it will
> >> become useful in the next patch when _deferred_list has non partially
> >> mapped folios.
> >>
> >> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >> ---
> >>  include/linux/huge_mm.h    |  4 ++--
> >>  include/linux/page-flags.h |  3 +++
> >>  mm/huge_memory.c           | 21 +++++++++++++--------
> >>  mm/hugetlb.c               |  1 +
> >>  mm/internal.h              |  4 +++-
> >>  mm/memcontrol.c            |  3 ++-
> >>  mm/migrate.c               |  3 ++-
> >>  mm/page_alloc.c            |  5 +++--
> >>  mm/rmap.c                  |  3 ++-
> >>  mm/vmscan.c                |  3 ++-
> >>  10 files changed, 33 insertions(+), 17 deletions(-)
> >>
> >> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >> index 4c32058cacfe..969f11f360d2 100644
> >> --- a/include/linux/huge_mm.h
> >> +++ b/include/linux/huge_mm.h
> >> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
> >>  {
> >>         return split_huge_page_to_list_to_order(page, NULL, 0);
> >>  }
> >> -void deferred_split_folio(struct folio *folio);
> >> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
> >>
> >>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> >>                 unsigned long address, bool freeze, struct folio *folio);
> >> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
> >>  {
> >>         return 0;
> >>  }
> >> -static inline void deferred_split_folio(struct folio *folio) {}
> >> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> >>  #define split_huge_pmd(__vma, __pmd, __address)        \
> >>         do { } while (0)
> >>
> >> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> >> index a0a29bd092f8..cecc1bad7910 100644
> >> --- a/include/linux/page-flags.h
> >> +++ b/include/linux/page-flags.h
> >> @@ -182,6 +182,7 @@ enum pageflags {
> >>         /* At least one page in this folio has the hwpoison flag set */
> >>         PG_has_hwpoisoned = PG_active,
> >>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
> >> +       PG_partially_mapped, /* was identified to be partially mapped */
> >>  };
> >>
> >>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
> >> @@ -861,8 +862,10 @@ static inline void ClearPageCompound(struct page *page)
> >>         ClearPageHead(page);
> >>  }
> >>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> >> +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >>  #else
> >>  FOLIO_FLAG_FALSE(large_rmappable)
> >> +FOLIO_FLAG_FALSE(partially_mapped)
> >>  #endif
> >>
> >>  #define PG_head_mask ((1UL << PG_head))
> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >> index 6df0e9f4f56c..c024ab0f745c 100644
> >> --- a/mm/huge_memory.c
> >> +++ b/mm/huge_memory.c
> >> @@ -3397,6 +3397,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >>                          * page_deferred_list.
> >>                          */
> >>                         list_del_init(&folio->_deferred_list);
> >> +                       folio_clear_partially_mapped(folio);
> >>                 }
> >>                 spin_unlock(&ds_queue->split_queue_lock);
> >>                 if (mapping) {
> >> @@ -3453,11 +3454,12 @@ void __folio_undo_large_rmappable(struct folio *folio)
> >>         if (!list_empty(&folio->_deferred_list)) {
> >>                 ds_queue->split_queue_len--;
> >>                 list_del_init(&folio->_deferred_list);
> >> +               folio_clear_partially_mapped(folio);
> >>         }
> >>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> >>  }
> >>
> >> -void deferred_split_folio(struct folio *folio)
> >> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
> >>  {
> >>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
> >>  #ifdef CONFIG_MEMCG
> >> @@ -3485,14 +3487,17 @@ void deferred_split_folio(struct folio *folio)
> >>         if (folio_test_swapcache(folio))
> >>                 return;
> >>
> >> -       if (!list_empty(&folio->_deferred_list))
> >> -               return;
> >> -
> >>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> >> +       if (partially_mapped)
> >> +               folio_set_partially_mapped(folio);
> >> +       else
> >> +               folio_clear_partially_mapped(folio);
> >>         if (list_empty(&folio->_deferred_list)) {
> >> -               if (folio_test_pmd_mappable(folio))
> >> -                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> >> -               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> >> +               if (partially_mapped) {
> >> +                       if (folio_test_pmd_mappable(folio))
> >> +                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> >> +                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> >
> > This code completely broke MTHP_STAT_SPLIT_DEFERRED for PMD_ORDER. It
> > added the folio to the deferred_list as entirely_mapped
> > (partially_mapped == false).
> > However, when partially_mapped becomes true, there's no opportunity to
> > add it again
> > as it has been there on the list. Are you consistently seeing the counter for
> > PMD_ORDER as 0?
> >
>
> Ah I see it, this should fix it?
>
> -void deferred_split_folio(struct folio *folio)
> +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  {
>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>  #ifdef CONFIG_MEMCG
> @@ -3485,14 +3488,14 @@ void deferred_split_folio(struct folio *folio)
>         if (folio_test_swapcache(folio))
>                 return;
>
> -       if (!list_empty(&folio->_deferred_list))
> -               return;
> -
>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> -       if (list_empty(&folio->_deferred_list)) {
> +       if (partially_mapped) {
> +               folio_set_partially_mapped(folio);
>                 if (folio_test_pmd_mappable(folio))
>                         count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>                 count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> +       }
> +       if (list_empty(&folio->_deferred_list)) {
>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>                 ds_queue->split_queue_len++;
>  #ifdef CONFIG_MEMCG
>

Not enough, as deferred_split_folio(true) won't be called if the folio is
already on the deferred_list, because of this check in __folio_remove_rmap():

        if (partially_mapped && folio_test_anon(folio) &&
            list_empty(&folio->_deferred_list))
                deferred_split_folio(folio, true);

so you will still see 0.
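
To spell that sequence out (a toy userspace walk-through, not kernel code; names are illustrative stand-ins): under the v3 ordering the fault path queues the folio first, so the later partially-mapped call is filtered out by the list_empty() check and the PMD-order counter never moves.

#include <stdbool.h>
#include <stdio.h>

struct toy_folio {
        bool on_deferred_list;          /* stands in for !list_empty(&folio->_deferred_list) */
        bool partially_mapped;          /* stands in for PG_partially_mapped */
};

static int thp_deferred_split_page;     /* stands in for THP_DEFERRED_SPLIT_PAGE */

/* v3 behaviour: stats are only bumped when the folio is first queued */
static void deferred_split_folio_v3(struct toy_folio *f, bool partially_mapped)
{
        f->partially_mapped = partially_mapped;
        if (!f->on_deferred_list) {
                if (partially_mapped)
                        thp_deferred_split_page++;
                f->on_deferred_list = true;
        }
}

/* v3 rmap side: only calls in while the folio is not queued yet */
static void folio_remove_rmap_partial_v3(struct toy_folio *f)
{
        if (!f->on_deferred_list)
                deferred_split_folio_v3(f, true);
}

int main(void)
{
        struct toy_folio f = { 0 };

        deferred_split_folio_v3(&f, false);     /* THP fault: queued, no stat */
        folio_remove_rmap_partial_v3(&f);       /* first partial unmap: skipped */
        printf("THP_DEFERRED_SPLIT_PAGE = %d\n", thp_deferred_split_page);      /* prints 0 */
        return 0;
}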

Thanks
Barry
Barry Song Aug. 14, 2024, 11:26 a.m. UTC | #10
On Wed, Aug 14, 2024 at 11:20 PM Barry Song <baohua@kernel.org> wrote:
>
> On Wed, Aug 14, 2024 at 11:11 PM Usama Arif <usamaarif642@gmail.com> wrote:
> >
> >
> >
> > On 14/08/2024 11:44, Barry Song wrote:
> > > On Wed, Aug 14, 2024 at 12:03 AM Usama Arif <usamaarif642@gmail.com> wrote:
> > >>
> > >> Currently folio->_deferred_list is used to keep track of
> > >> partially_mapped folios that are going to be split under memory
> > >> pressure. In the next patch, all THPs that are faulted in and collapsed
> > >> by khugepaged are also going to be tracked using _deferred_list.
> > >>
> > >> This patch introduces a pageflag to be able to distinguish between
> > >> partially mapped folios and others in the deferred_list at split time in
> > >> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> > >> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> > >> possible to distinguish between partially mapped folios and others in
> > >> deferred_split_scan.
> > >>
> > >> Eventhough it introduces an extra flag to track if the folio is
> > >> partially mapped, there is no functional change intended with this
> > >> patch and the flag is not useful in this patch itself, it will
> > >> become useful in the next patch when _deferred_list has non partially
> > >> mapped folios.
> > >>
> > >> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> > >> ---
> > >>  include/linux/huge_mm.h    |  4 ++--
> > >>  include/linux/page-flags.h |  3 +++
> > >>  mm/huge_memory.c           | 21 +++++++++++++--------
> > >>  mm/hugetlb.c               |  1 +
> > >>  mm/internal.h              |  4 +++-
> > >>  mm/memcontrol.c            |  3 ++-
> > >>  mm/migrate.c               |  3 ++-
> > >>  mm/page_alloc.c            |  5 +++--
> > >>  mm/rmap.c                  |  3 ++-
> > >>  mm/vmscan.c                |  3 ++-
> > >>  10 files changed, 33 insertions(+), 17 deletions(-)
> > >>
> > >> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > >> index 4c32058cacfe..969f11f360d2 100644
> > >> --- a/include/linux/huge_mm.h
> > >> +++ b/include/linux/huge_mm.h
> > >> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
> > >>  {
> > >>         return split_huge_page_to_list_to_order(page, NULL, 0);
> > >>  }
> > >> -void deferred_split_folio(struct folio *folio);
> > >> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
> > >>
> > >>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> > >>                 unsigned long address, bool freeze, struct folio *folio);
> > >> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
> > >>  {
> > >>         return 0;
> > >>  }
> > >> -static inline void deferred_split_folio(struct folio *folio) {}
> > >> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> > >>  #define split_huge_pmd(__vma, __pmd, __address)        \
> > >>         do { } while (0)
> > >>
> > >> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> > >> index a0a29bd092f8..cecc1bad7910 100644
> > >> --- a/include/linux/page-flags.h
> > >> +++ b/include/linux/page-flags.h
> > >> @@ -182,6 +182,7 @@ enum pageflags {
> > >>         /* At least one page in this folio has the hwpoison flag set */
> > >>         PG_has_hwpoisoned = PG_active,
> > >>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
> > >> +       PG_partially_mapped, /* was identified to be partially mapped */
> > >>  };
> > >>
> > >>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
> > >> @@ -861,8 +862,10 @@ static inline void ClearPageCompound(struct page *page)
> > >>         ClearPageHead(page);
> > >>  }
> > >>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> > >> +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> > >>  #else
> > >>  FOLIO_FLAG_FALSE(large_rmappable)
> > >> +FOLIO_FLAG_FALSE(partially_mapped)
> > >>  #endif
> > >>
> > >>  #define PG_head_mask ((1UL << PG_head))
> > >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > >> index 6df0e9f4f56c..c024ab0f745c 100644
> > >> --- a/mm/huge_memory.c
> > >> +++ b/mm/huge_memory.c
> > >> @@ -3397,6 +3397,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> > >>                          * page_deferred_list.
> > >>                          */
> > >>                         list_del_init(&folio->_deferred_list);
> > >> +                       folio_clear_partially_mapped(folio);
> > >>                 }
> > >>                 spin_unlock(&ds_queue->split_queue_lock);
> > >>                 if (mapping) {
> > >> @@ -3453,11 +3454,12 @@ void __folio_undo_large_rmappable(struct folio *folio)
> > >>         if (!list_empty(&folio->_deferred_list)) {
> > >>                 ds_queue->split_queue_len--;
> > >>                 list_del_init(&folio->_deferred_list);
> > >> +               folio_clear_partially_mapped(folio);
> > >>         }
> > >>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> > >>  }
> > >>
> > >> -void deferred_split_folio(struct folio *folio)
> > >> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
> > >>  {
> > >>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
> > >>  #ifdef CONFIG_MEMCG
> > >> @@ -3485,14 +3487,17 @@ void deferred_split_folio(struct folio *folio)
> > >>         if (folio_test_swapcache(folio))
> > >>                 return;
> > >>
> > >> -       if (!list_empty(&folio->_deferred_list))
> > >> -               return;
> > >> -
> > >>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> > >> +       if (partially_mapped)
> > >> +               folio_set_partially_mapped(folio);
> > >> +       else
> > >> +               folio_clear_partially_mapped(folio);
> > >
> > > Hi Usama,
> > >
> > > Do we need this? When can a partially_mapped folio on deferred_list become
> > > non-partially-mapped and need a clear? I understand transferring from
> > > entirely_map
> > > to partially_mapped is a one way process? partially_mapped folios can't go back
> > > to entirely_mapped?
> > >
> > Hi Barry,
> >
> > deferred_split_folio function is called in 3 places after this series, at fault, collapse and partial mapping. partial mapping can only happen after fault/collapse, and we have FOLIO_FLAG_FALSE(partially_mapped), i.e. flag initialized to false, so technically its not needed. A partially_mapped folio on deferred list wont become non-partially mapped.
> >
> > I just did it as a precaution if someone ever changes the kernel and calls deferred_split_folio with partially_mapped set to false after it had been true. The function arguments of deferred_split_folio make it seem that passing partially_mapped=false as an argument would clear it, which is why I cleared it as well. I could change the patch to something like below if it makes things better? i.e. add a comment at the top of the function:
> >
>
> to me, it seems a BUG to call with false after a folio has been
> partially_mapped. So I'd rather
> VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
>
> The below should also fix the MTHP_STAT_SPLIT_DEFERRED
> counter this patch is breaking?
>
> @@ -3515,16 +3522,18 @@ void deferred_split_folio(struct folio *folio,
> bool partially_mapped)
>                 return;
>
>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> -       if (partially_mapped)
> -               folio_set_partially_mapped(folio);
> -       else
> -               folio_clear_partially_mapped(folio);
> -       if (list_empty(&folio->_deferred_list)) {
> -               if (partially_mapped) {
> +       if (partially_mapped) {
> +               if (!folio_test_set_partially_mapped(folio)) {
> +                       mod_mthp_stat(folio_order(folio),
> +                               MTHP_STAT_NR_SPLIT_DEFERRED, 1);
>                         if (folio_test_pmd_mappable(folio))
>                                 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>                         count_mthp_stat(folio_order(folio),
> MTHP_STAT_SPLIT_DEFERRED);
>                 }
> +       }
> +       VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);

sorry, I mean:
VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio) &&
!partially_mapped, folio);

> +
> +       if (list_empty(&folio->_deferred_list)) {
>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>                 ds_queue->split_queue_len++;
>  #ifdef CONFIG_MEMCG
>
>
> >
> > -void deferred_split_folio(struct folio *folio)
> > +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
> > +void deferred_split_folio(struct folio *folio, bool partially_mapped)
> >  {
> >         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
> >  #ifdef CONFIG_MEMCG
> > @@ -3485,14 +3488,15 @@ void deferred_split_folio(struct folio *folio)
> >         if (folio_test_swapcache(folio))
> >                 return;
> >
> > -       if (!list_empty(&folio->_deferred_list))
> > -               return;
> > -
> >         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> > +       if (partially_mapped)
> > +               folio_set_partially_mapped(folio);
> >         if (list_empty(&folio->_deferred_list)) {
> > -               if (folio_test_pmd_mappable(folio))
> > -                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> > -               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> > +               if (partially_mapped) {
> > +                       if (folio_test_pmd_mappable(folio))
> > +                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> > +                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> > +               }
> >                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> >                 ds_queue->split_queue_len++;
> >  #ifdef CONFIG_MEMCG
> >
> >
> > > I am trying to rebase my NR_SPLIT_DEFERRED counter on top of your
> > > work, but this "clear" makes that job quite tricky. as I am not sure
> > > if this patch
> > > is going to clear the partially_mapped flag for folios on deferred_list.
> > >
> > > Otherwise, I can simply do the below whenever folio is leaving deferred_list
> > >
> > >         ds_queue->split_queue_len--;
> > >         list_del_init(&folio->_deferred_list);
> > >         if (folio_test_clear_partially_mapped(folio))
> > >                 mod_mthp_stat(folio_order(folio),
> > > MTHP_STAT_NR_SPLIT_DEFERRED, -1);
> > >
> >
>
> Thanks
> Barry
Usama Arif Aug. 14, 2024, 11:30 a.m. UTC | #11
On 14/08/2024 12:20, Barry Song wrote:
> On Wed, Aug 14, 2024 at 11:11 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>
>>
>>
>> On 14/08/2024 11:44, Barry Song wrote:
>>> On Wed, Aug 14, 2024 at 12:03 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>
>>>> Currently folio->_deferred_list is used to keep track of
>>>> partially_mapped folios that are going to be split under memory
>>>> pressure. In the next patch, all THPs that are faulted in and collapsed
>>>> by khugepaged are also going to be tracked using _deferred_list.
>>>>
>>>> This patch introduces a pageflag to be able to distinguish between
>>>> partially mapped folios and others in the deferred_list at split time in
>>>> deferred_split_scan. Its needed as __folio_remove_rmap decrements
>>>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
>>>> possible to distinguish between partially mapped folios and others in
>>>> deferred_split_scan.
>>>>
>>>> Eventhough it introduces an extra flag to track if the folio is
>>>> partially mapped, there is no functional change intended with this
>>>> patch and the flag is not useful in this patch itself, it will
>>>> become useful in the next patch when _deferred_list has non partially
>>>> mapped folios.
>>>>
>>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>>>> ---
>>>>  include/linux/huge_mm.h    |  4 ++--
>>>>  include/linux/page-flags.h |  3 +++
>>>>  mm/huge_memory.c           | 21 +++++++++++++--------
>>>>  mm/hugetlb.c               |  1 +
>>>>  mm/internal.h              |  4 +++-
>>>>  mm/memcontrol.c            |  3 ++-
>>>>  mm/migrate.c               |  3 ++-
>>>>  mm/page_alloc.c            |  5 +++--
>>>>  mm/rmap.c                  |  3 ++-
>>>>  mm/vmscan.c                |  3 ++-
>>>>  10 files changed, 33 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index 4c32058cacfe..969f11f360d2 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
>>>>  {
>>>>         return split_huge_page_to_list_to_order(page, NULL, 0);
>>>>  }
>>>> -void deferred_split_folio(struct folio *folio);
>>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
>>>>
>>>>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>>>>                 unsigned long address, bool freeze, struct folio *folio);
>>>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
>>>>  {
>>>>         return 0;
>>>>  }
>>>> -static inline void deferred_split_folio(struct folio *folio) {}
>>>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
>>>>  #define split_huge_pmd(__vma, __pmd, __address)        \
>>>>         do { } while (0)
>>>>
>>>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
>>>> index a0a29bd092f8..cecc1bad7910 100644
>>>> --- a/include/linux/page-flags.h
>>>> +++ b/include/linux/page-flags.h
>>>> @@ -182,6 +182,7 @@ enum pageflags {
>>>>         /* At least one page in this folio has the hwpoison flag set */
>>>>         PG_has_hwpoisoned = PG_active,
>>>>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
>>>> +       PG_partially_mapped, /* was identified to be partially mapped */
>>>>  };
>>>>
>>>>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
>>>> @@ -861,8 +862,10 @@ static inline void ClearPageCompound(struct page *page)
>>>>         ClearPageHead(page);
>>>>  }
>>>>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
>>>> +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>>>  #else
>>>>  FOLIO_FLAG_FALSE(large_rmappable)
>>>> +FOLIO_FLAG_FALSE(partially_mapped)
>>>>  #endif
>>>>
>>>>  #define PG_head_mask ((1UL << PG_head))
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 6df0e9f4f56c..c024ab0f745c 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -3397,6 +3397,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>>                          * page_deferred_list.
>>>>                          */
>>>>                         list_del_init(&folio->_deferred_list);
>>>> +                       folio_clear_partially_mapped(folio);
>>>>                 }
>>>>                 spin_unlock(&ds_queue->split_queue_lock);
>>>>                 if (mapping) {
>>>> @@ -3453,11 +3454,12 @@ void __folio_undo_large_rmappable(struct folio *folio)
>>>>         if (!list_empty(&folio->_deferred_list)) {
>>>>                 ds_queue->split_queue_len--;
>>>>                 list_del_init(&folio->_deferred_list);
>>>> +               folio_clear_partially_mapped(folio);
>>>>         }
>>>>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>>>>  }
>>>>
>>>> -void deferred_split_folio(struct folio *folio)
>>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>>>>  {
>>>>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>>>>  #ifdef CONFIG_MEMCG
>>>> @@ -3485,14 +3487,17 @@ void deferred_split_folio(struct folio *folio)
>>>>         if (folio_test_swapcache(folio))
>>>>                 return;
>>>>
>>>> -       if (!list_empty(&folio->_deferred_list))
>>>> -               return;
>>>> -
>>>>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>>>> +       if (partially_mapped)
>>>> +               folio_set_partially_mapped(folio);
>>>> +       else
>>>> +               folio_clear_partially_mapped(folio);
>>>
>>> Hi Usama,
>>>
>>> Do we need this? When can a partially_mapped folio on deferred_list become
>>> non-partially-mapped and need a clear? I understand transferring from
>>> entirely_map
>>> to partially_mapped is a one way process? partially_mapped folios can't go back
>>> to entirely_mapped?
>>>
>> Hi Barry,
>>
>> deferred_split_folio function is called in 3 places after this series, at fault, collapse and partial mapping. partial mapping can only happen after fault/collapse, and we have FOLIO_FLAG_FALSE(partially_mapped), i.e. flag initialized to false, so technically its not needed. A partially_mapped folio on deferred list wont become non-partially mapped.
>>
>> I just did it as a precaution if someone ever changes the kernel and calls deferred_split_folio with partially_mapped set to false after it had been true. The function arguments of deferred_split_folio make it seem that passing partially_mapped=false as an argument would clear it, which is why I cleared it as well. I could change the patch to something like below if it makes things better? i.e. add a comment at the top of the function:
>>
> 
> to me, it seems a BUG to call with false after a folio has been
> partially_mapped. So I'd rather
> VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
> 
> The below should also fix the MTHP_STAT_SPLIT_DEFERRED
> counter this patch is breaking?
> 
> @@ -3515,16 +3522,18 @@ void deferred_split_folio(struct folio *folio,
> bool partially_mapped)
>                 return;
> 
>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> -       if (partially_mapped)
> -               folio_set_partially_mapped(folio);
> -       else
> -               folio_clear_partially_mapped(folio);
> -       if (list_empty(&folio->_deferred_list)) {
> -               if (partially_mapped) {
> +       if (partially_mapped) {
> +               if (!folio_test_set_partially_mapped(folio)) {
> +                       mod_mthp_stat(folio_order(folio),
> +                               MTHP_STAT_NR_SPLIT_DEFERRED, 1);
>                         if (folio_test_pmd_mappable(folio))
>                                 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>                         count_mthp_stat(folio_order(folio),
> MTHP_STAT_SPLIT_DEFERRED);
>                 }
> +       }
> +       VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
> +
> +       if (list_empty(&folio->_deferred_list)) {
>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>                 ds_queue->split_queue_len++;
>  #ifdef CONFIG_MEMCG
> 
> 
I had sent the below without the VM_WARN_ON_FOLIO as a reply to the other email; below is the version with the VM_WARN.


-void deferred_split_folio(struct folio *folio)
+/* partially_mapped=false won't clear PG_partially_mapped folio flag */
+void deferred_split_folio(struct folio *folio, bool partially_mapped)
 {
        struct deferred_split *ds_queue = get_deferred_split_queue(folio);
 #ifdef CONFIG_MEMCG
@@ -3485,14 +3488,17 @@ void deferred_split_folio(struct folio *folio)
        if (folio_test_swapcache(folio))
                return;
 
-       if (!list_empty(&folio->_deferred_list))
-               return;
-
        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
-       if (list_empty(&folio->_deferred_list)) {
+       if (partially_mapped) {
+               folio_set_partially_mapped(folio);
                if (folio_test_pmd_mappable(folio))
                        count_vm_event(THP_DEFERRED_SPLIT_PAGE);
                count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
+       } else {
+               /* partially mapped folios cannot become non-partially mapped */
+               VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
+       }
+       if (list_empty(&folio->_deferred_list)) {
                list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
                ds_queue->split_queue_len++;
 #ifdef CONFIG_MEMCG

 Thanks
Usama Arif Aug. 14, 2024, 12:36 p.m. UTC | #12
On 14/08/2024 12:23, Barry Song wrote:
> On Wed, Aug 14, 2024 at 11:20 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>
>>
>>
>> On 14/08/2024 12:10, Barry Song wrote:
>>> On Wed, Aug 14, 2024 at 12:03 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>
>>>> Currently folio->_deferred_list is used to keep track of
>>>> partially_mapped folios that are going to be split under memory
>>>> pressure. In the next patch, all THPs that are faulted in and collapsed
>>>> by khugepaged are also going to be tracked using _deferred_list.
>>>>
>>>> This patch introduces a pageflag to be able to distinguish between
>>>> partially mapped folios and others in the deferred_list at split time in
>>>> deferred_split_scan. Its needed as __folio_remove_rmap decrements
>>>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
>>>> possible to distinguish between partially mapped folios and others in
>>>> deferred_split_scan.
>>>>
>>>> Eventhough it introduces an extra flag to track if the folio is
>>>> partially mapped, there is no functional change intended with this
>>>> patch and the flag is not useful in this patch itself, it will
>>>> become useful in the next patch when _deferred_list has non partially
>>>> mapped folios.
>>>>
>>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>>>> ---
>>>>  include/linux/huge_mm.h    |  4 ++--
>>>>  include/linux/page-flags.h |  3 +++
>>>>  mm/huge_memory.c           | 21 +++++++++++++--------
>>>>  mm/hugetlb.c               |  1 +
>>>>  mm/internal.h              |  4 +++-
>>>>  mm/memcontrol.c            |  3 ++-
>>>>  mm/migrate.c               |  3 ++-
>>>>  mm/page_alloc.c            |  5 +++--
>>>>  mm/rmap.c                  |  3 ++-
>>>>  mm/vmscan.c                |  3 ++-
>>>>  10 files changed, 33 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index 4c32058cacfe..969f11f360d2 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
>>>>  {
>>>>         return split_huge_page_to_list_to_order(page, NULL, 0);
>>>>  }
>>>> -void deferred_split_folio(struct folio *folio);
>>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
>>>>
>>>>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>>>>                 unsigned long address, bool freeze, struct folio *folio);
>>>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
>>>>  {
>>>>         return 0;
>>>>  }
>>>> -static inline void deferred_split_folio(struct folio *folio) {}
>>>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
>>>>  #define split_huge_pmd(__vma, __pmd, __address)        \
>>>>         do { } while (0)
>>>>
>>>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
>>>> index a0a29bd092f8..cecc1bad7910 100644
>>>> --- a/include/linux/page-flags.h
>>>> +++ b/include/linux/page-flags.h
>>>> @@ -182,6 +182,7 @@ enum pageflags {
>>>>         /* At least one page in this folio has the hwpoison flag set */
>>>>         PG_has_hwpoisoned = PG_active,
>>>>         PG_large_rmappable = PG_workingset, /* anon or file-backed */
>>>> +       PG_partially_mapped, /* was identified to be partially mapped */
>>>>  };
>>>>
>>>>  #define PAGEFLAGS_MASK         ((1UL << NR_PAGEFLAGS) - 1)
>>>> @@ -861,8 +862,10 @@ static inline void ClearPageCompound(struct page *page)
>>>>         ClearPageHead(page);
>>>>  }
>>>>  FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
>>>> +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>>>  #else
>>>>  FOLIO_FLAG_FALSE(large_rmappable)
>>>> +FOLIO_FLAG_FALSE(partially_mapped)
>>>>  #endif
>>>>
>>>>  #define PG_head_mask ((1UL << PG_head))
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 6df0e9f4f56c..c024ab0f745c 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -3397,6 +3397,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>>                          * page_deferred_list.
>>>>                          */
>>>>                         list_del_init(&folio->_deferred_list);
>>>> +                       folio_clear_partially_mapped(folio);
>>>>                 }
>>>>                 spin_unlock(&ds_queue->split_queue_lock);
>>>>                 if (mapping) {
>>>> @@ -3453,11 +3454,12 @@ void __folio_undo_large_rmappable(struct folio *folio)
>>>>         if (!list_empty(&folio->_deferred_list)) {
>>>>                 ds_queue->split_queue_len--;
>>>>                 list_del_init(&folio->_deferred_list);
>>>> +               folio_clear_partially_mapped(folio);
>>>>         }
>>>>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>>>>  }
>>>>
>>>> -void deferred_split_folio(struct folio *folio)
>>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>>>>  {
>>>>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>>>>  #ifdef CONFIG_MEMCG
>>>> @@ -3485,14 +3487,17 @@ void deferred_split_folio(struct folio *folio)
>>>>         if (folio_test_swapcache(folio))
>>>>                 return;
>>>>
>>>> -       if (!list_empty(&folio->_deferred_list))
>>>> -               return;
>>>> -
>>>>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>>>> +       if (partially_mapped)
>>>> +               folio_set_partially_mapped(folio);
>>>> +       else
>>>> +               folio_clear_partially_mapped(folio);
>>>>         if (list_empty(&folio->_deferred_list)) {
>>>> -               if (folio_test_pmd_mappable(folio))
>>>> -                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>>>> -               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>>>> +               if (partially_mapped) {
>>>> +                       if (folio_test_pmd_mappable(folio))
>>>> +                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>>>> +                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>>>
>>> This code completely broke MTHP_STAT_SPLIT_DEFERRED for PMD_ORDER. It
>>> added the folio to the deferred_list as entirely_mapped
>>> (partially_mapped == false).
>>> However, when partially_mapped becomes true, there's no opportunity to
>>> add it again
>>> as it has been there on the list. Are you consistently seeing the counter for
>>> PMD_ORDER as 0?
>>>
>>
>> Ah I see it, this should fix it?
>>
>> -void deferred_split_folio(struct folio *folio)
>> +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
>> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>>  {
>>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>>  #ifdef CONFIG_MEMCG
>> @@ -3485,14 +3488,14 @@ void deferred_split_folio(struct folio *folio)
>>         if (folio_test_swapcache(folio))
>>                 return;
>>
>> -       if (!list_empty(&folio->_deferred_list))
>> -               return;
>> -
>>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>> -       if (list_empty(&folio->_deferred_list)) {
>> +       if (partially_mapped) {
>> +               folio_set_partially_mapped(folio);
>>                 if (folio_test_pmd_mappable(folio))
>>                         count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>>                 count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>> +       }
>> +       if (list_empty(&folio->_deferred_list)) {
>>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>>                 ds_queue->split_queue_len++;
>>  #ifdef CONFIG_MEMCG
>>
> 
> not enough. as deferred_split_folio(true) won't be called if folio has been
> deferred_list in __folio_remove_rmap():
> 
>         if (partially_mapped && folio_test_anon(folio) &&
>             list_empty(&folio->_deferred_list))
>                 deferred_split_folio(folio, true);
> 
> so you will still see 0.
> 

ah yes, Thanks.

So the below diff on top of the current v3 series should work for all cases:


diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b4d72479330d..482e3ab60911 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3483,6 +3483,7 @@ void __folio_undo_large_rmappable(struct folio *folio)
        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 }
 
+/* partially_mapped=false won't clear PG_partially_mapped folio flag */
 void deferred_split_folio(struct folio *folio, bool partially_mapped)
 {
        struct deferred_split *ds_queue = get_deferred_split_queue(folio);
@@ -3515,16 +3516,16 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
                return;
 
        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
-       if (partially_mapped)
+       if (partially_mapped) {
                folio_set_partially_mapped(folio);
-       else
-               folio_clear_partially_mapped(folio);
+               if (folio_test_pmd_mappable(folio))
+                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
+               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
+       } else {
+               /* partially mapped folios cannot become non-partially mapped */
+               VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
+       }
        if (list_empty(&folio->_deferred_list)) {
-               if (partially_mapped) {
-                       if (folio_test_pmd_mappable(folio))
-                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
-                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
-               }
                list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
                ds_queue->split_queue_len++;
 #ifdef CONFIG_MEMCG
diff --git a/mm/rmap.c b/mm/rmap.c
index 9ad558c2bad0..4c330635aa4e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1578,7 +1578,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
         * Check partially_mapped first to ensure it is a large folio.
         */
        if (partially_mapped && folio_test_anon(folio) &&
-           list_empty(&folio->_deferred_list))
+           !folio_test_partially_mapped(folio))
                deferred_split_folio(folio, true);
 
        __folio_mod_stat(folio, -nr, -nr_pmdmapped);
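
(Reading the two hunks together against the sequence discussed above, not as a test result: fault/collapse still call deferred_split_folio(folio, false), which only queues the folio; the first partial unmap now passes the !folio_test_partially_mapped() check in __folio_remove_rmap(), so the flag gets set and THP_DEFERRED_SPLIT_PAGE / MTHP_STAT_SPLIT_DEFERRED are bumped exactly once; any later partial unmap is filtered by the flag, so nothing is double counted.)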
Barry Song Aug. 14, 2024, 11:05 p.m. UTC | #13
On Thu, Aug 15, 2024 at 12:37 AM Usama Arif <usamaarif642@gmail.com> wrote:
[snip]
> >>>>
> >>>> -void deferred_split_folio(struct folio *folio)
> >>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
> >>>>  {
> >>>>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
> >>>>  #ifdef CONFIG_MEMCG
> >>>> @@ -3485,14 +3487,17 @@ void deferred_split_folio(struct folio *folio)
> >>>>         if (folio_test_swapcache(folio))
> >>>>                 return;
> >>>>
> >>>> -       if (!list_empty(&folio->_deferred_list))
> >>>> -               return;
> >>>> -
> >>>>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> >>>> +       if (partially_mapped)
> >>>> +               folio_set_partially_mapped(folio);
> >>>> +       else
> >>>> +               folio_clear_partially_mapped(folio);
> >>>>         if (list_empty(&folio->_deferred_list)) {
> >>>> -               if (folio_test_pmd_mappable(folio))
> >>>> -                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> >>>> -               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> >>>> +               if (partially_mapped) {
> >>>> +                       if (folio_test_pmd_mappable(folio))
> >>>> +                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> >>>> +                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> >>>
> >>> This code completely broke MTHP_STAT_SPLIT_DEFERRED for PMD_ORDER. It
> >>> added the folio to the deferred_list as entirely_mapped
> >>> (partially_mapped == false).
> >>> However, when partially_mapped becomes true, there's no opportunity to
> >>> add it again
> >>> as it has been there on the list. Are you consistently seeing the counter for
> >>> PMD_ORDER as 0?
> >>>
> >>
> >> Ah I see it, this should fix it?
> >>
> >> -void deferred_split_folio(struct folio *folio)
> >> +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
> >> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
> >>  {
> >>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
> >>  #ifdef CONFIG_MEMCG
> >> @@ -3485,14 +3488,14 @@ void deferred_split_folio(struct folio *folio)
> >>         if (folio_test_swapcache(folio))
> >>                 return;
> >>
> >> -       if (!list_empty(&folio->_deferred_list))
> >> -               return;
> >> -
> >>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> >> -       if (list_empty(&folio->_deferred_list)) {
> >> +       if (partially_mapped) {
> >> +               folio_set_partially_mapped(folio);
> >>                 if (folio_test_pmd_mappable(folio))
> >>                         count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> >>                 count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> >> +       }
> >> +       if (list_empty(&folio->_deferred_list)) {
> >>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> >>                 ds_queue->split_queue_len++;
> >>  #ifdef CONFIG_MEMCG
> >>
> >
> > not enough. as deferred_split_folio(true) won't be called if folio has been
> > deferred_list in __folio_remove_rmap():
> >
> >         if (partially_mapped && folio_test_anon(folio) &&
> >             list_empty(&folio->_deferred_list))
> >                 deferred_split_folio(folio, true);
> >
> > so you will still see 0.
> >
>
> ah yes, Thanks.
>
> So below diff over the current v3 series should work for all cases:
>
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index b4d72479330d..482e3ab60911 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3483,6 +3483,7 @@ void __folio_undo_large_rmappable(struct folio *folio)
>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>  }
>
> +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
>  void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  {
>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
> @@ -3515,16 +3516,16 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
>                 return;
>
>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> -       if (partially_mapped)
> +       if (partially_mapped) {
>                 folio_set_partially_mapped(folio);
> -       else
> -               folio_clear_partially_mapped(folio);
> +               if (folio_test_pmd_mappable(folio))
> +                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> +               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> +       } else {
> +               /* partially mapped folios cannot become partially unmapped */
> +               VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
> +       }
>         if (list_empty(&folio->_deferred_list)) {
> -               if (partially_mapped) {
> -                       if (folio_test_pmd_mappable(folio))
> -                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> -                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> -               }
>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>                 ds_queue->split_queue_len++;
>  #ifdef CONFIG_MEMCG
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 9ad558c2bad0..4c330635aa4e 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1578,7 +1578,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>          * Check partially_mapped first to ensure it is a large folio.
>          */
>         if (partially_mapped && folio_test_anon(folio) &&
> -           list_empty(&folio->_deferred_list))
> +           !folio_test_partially_mapped(folio))
>                 deferred_split_folio(folio, true);
>
>         __folio_mod_stat(folio, -nr, -nr_pmdmapped);
>

This is an improvement, but there's still an issue. Two or more threads can
execute deferred_split_folio() simultaneously, which could lead to the
DEFERRED_SPLIT stats being counted multiple times for the same folio. We
should re-check the partially_mapped status after acquiring the spinlock.

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f401ceded697..3d247826fb95 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3607,10 +3607,12 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
 
 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
 	if (partially_mapped) {
-		folio_set_partially_mapped(folio);
-		if (folio_test_pmd_mappable(folio))
-			count_vm_event(THP_DEFERRED_SPLIT_PAGE);
-		count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
+		if (!folio_test_partially_mapped(folio)) {
+			folio_set_partially_mapped(folio);
+			if (folio_test_pmd_mappable(folio))
+				count_vm_event(THP_DEFERRED_SPLIT_PAGE);
+			count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
+		}
 	} else {
 		/* partially mapped folios cannot become partially unmapped */
 		VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
Usama Arif Aug. 15, 2024, 3:25 p.m. UTC | #14
On 15/08/2024 00:05, Barry Song wrote:
> 
> On Thu, Aug 15, 2024 at 12:37 AM Usama Arif <usamaarif642@gmail.com> wrote:
> [snip]
>>>>>>
>>>>>> -void deferred_split_folio(struct folio *folio)
>>>>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>>>>>>  {
>>>>>>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>>>>>>  #ifdef CONFIG_MEMCG
>>>>>> @@ -3485,14 +3487,17 @@ void deferred_split_folio(struct folio *folio)
>>>>>>         if (folio_test_swapcache(folio))
>>>>>>                 return;
>>>>>>
>>>>>> -       if (!list_empty(&folio->_deferred_list))
>>>>>> -               return;
>>>>>> -
>>>>>>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>>>>>> +       if (partially_mapped)
>>>>>> +               folio_set_partially_mapped(folio);
>>>>>> +       else
>>>>>> +               folio_clear_partially_mapped(folio);
>>>>>>         if (list_empty(&folio->_deferred_list)) {
>>>>>> -               if (folio_test_pmd_mappable(folio))
>>>>>> -                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>>>>>> -               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>>>>>> +               if (partially_mapped) {
>>>>>> +                       if (folio_test_pmd_mappable(folio))
>>>>>> +                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>>>>>> +                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>>>>>
>>>>> This code completely broke MTHP_STAT_SPLIT_DEFERRED for PMD_ORDER. It
>>>>> added the folio to the deferred_list as entirely_mapped
>>>>> (partially_mapped == false).
>>>>> However, when partially_mapped becomes true, there's no opportunity to
>>>>> add it again
>>>>> as it has been there on the list. Are you consistently seeing the counter for
>>>>> PMD_ORDER as 0?
>>>>>
>>>>
>>>> Ah I see it, this should fix it?
>>>>
>>>> -void deferred_split_folio(struct folio *folio)
>>>> +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
>>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>>>>  {
>>>>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>>>>  #ifdef CONFIG_MEMCG
>>>> @@ -3485,14 +3488,14 @@ void deferred_split_folio(struct folio *folio)
>>>>         if (folio_test_swapcache(folio))
>>>>                 return;
>>>>
>>>> -       if (!list_empty(&folio->_deferred_list))
>>>> -               return;
>>>> -
>>>>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>>>> -       if (list_empty(&folio->_deferred_list)) {
>>>> +       if (partially_mapped) {
>>>> +               folio_set_partially_mapped(folio);
>>>>                 if (folio_test_pmd_mappable(folio))
>>>>                         count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>>>>                 count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>>>> +       }
>>>> +       if (list_empty(&folio->_deferred_list)) {
>>>>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>>>>                 ds_queue->split_queue_len++;
>>>>  #ifdef CONFIG_MEMCG
>>>>
>>>
>>> not enough. as deferred_split_folio(true) won't be called if folio has been
>>> deferred_list in __folio_remove_rmap():
>>>
>>>         if (partially_mapped && folio_test_anon(folio) &&
>>>             list_empty(&folio->_deferred_list))
>>>                 deferred_split_folio(folio, true);
>>>
>>> so you will still see 0.
>>>
>>
>> ah yes, Thanks.
>>
>> So below diff over the current v3 series should work for all cases:
>>
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index b4d72479330d..482e3ab60911 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3483,6 +3483,7 @@ void __folio_undo_large_rmappable(struct folio *folio)
>>         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>>  }
>>
>> +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
>>  void deferred_split_folio(struct folio *folio, bool partially_mapped)
>>  {
>>         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>> @@ -3515,16 +3516,16 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
>>                 return;
>>
>>         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>> -       if (partially_mapped)
>> +       if (partially_mapped) {
>>                 folio_set_partially_mapped(folio);
>> -       else
>> -               folio_clear_partially_mapped(folio);
>> +               if (folio_test_pmd_mappable(folio))
>> +                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>> +               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>> +       } else {
>> +               /* partially mapped folios cannot become partially unmapped */
>> +               VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
>> +       }
>>         if (list_empty(&folio->_deferred_list)) {
>> -               if (partially_mapped) {
>> -                       if (folio_test_pmd_mappable(folio))
>> -                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>> -                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>> -               }
>>                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>>                 ds_queue->split_queue_len++;
>>  #ifdef CONFIG_MEMCG
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 9ad558c2bad0..4c330635aa4e 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1578,7 +1578,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>>          * Check partially_mapped first to ensure it is a large folio.
>>          */
>>         if (partially_mapped && folio_test_anon(folio) &&
>> -           list_empty(&folio->_deferred_list))
>> +           !folio_test_partially_mapped(folio))
>>                 deferred_split_folio(folio, true);
>>
>>         __folio_mod_stat(folio, -nr, -nr_pmdmapped);
>>
> 
> This is an improvement, but there's still an issue. Two or more threads can
> execute deferred_split_folio() simultaneously, which could lead to
> DEFERRED_SPLIT being added multiple times. We should double-check
> the status after acquiring the spinlock.
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index f401ceded697..3d247826fb95 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3607,10 +3607,12 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  
>  	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>  	if (partially_mapped) {
> -		folio_set_partially_mapped(folio);
> -		if (folio_test_pmd_mappable(folio))
> -			count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> -		count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> +		if (!folio_test_partially_mapped(folio)) {
> +			folio_set_partially_mapped(folio);
> +			if (folio_test_pmd_mappable(folio))
> +				count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> +			count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> +		}
>  	} else {
> 		/* partially mapped folios cannot become partially unmapped */
>  		VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);

Actually, the above is still not thread safe: multiple threads can test partially_mapped, all see it as false at the same time, and then all of them would increment the stats. I believe !folio_test_set_partially_mapped would be best. Hopefully the below diff over v3 covers all the fixes that have come up until now.

Independent of this series, I also think it's a good idea to add a selftest for this deferred_split counter. I will send a separate patch for it that just maps a THP, unmaps a small part of it and checks the counter. I think split_huge_page_test.c is probably the right place for it.
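
Roughly, the idea would be something like the sketch below (illustration only,
not the actual selftest; the 2M THP size, the mmap/madvise setup and reading
the thp_deferred_split_page line from /proc/vmstat are assumptions made just
for the example):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define THP_SIZE	(2UL * 1024 * 1024)

/* illustrative helper: grab the deferred split counter from /proc/vmstat */
static long read_deferred_split(void)
{
	char line[128];
	long val = -1;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "thp_deferred_split_page %ld", &val) == 1)
			break;
	fclose(f);
	return val;
}

int main(void)
{
	char *map, *buf;
	long before, after;

	/* over-allocate so the buffer can be aligned to a 2M boundary */
	map = mmap(NULL, 2 * THP_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (map == MAP_FAILED)
		return 1;
	buf = (char *)(((unsigned long)map + THP_SIZE - 1) & ~(THP_SIZE - 1));

	if (madvise(buf, THP_SIZE, MADV_HUGEPAGE))
		return 1;
	memset(buf, 1, THP_SIZE);	/* fault the THP in */

	before = read_deferred_split();
	/* unmap one base page so the folio becomes partially mapped */
	if (munmap(buf + THP_SIZE / 2, getpagesize()))
		return 1;
	after = read_deferred_split();

	printf("thp_deferred_split_page: %ld -> %ld\n", before, after);
	return after > before ? 0 : 1;
}

The real selftest would presumably use the existing kselftest helpers and could
also check the per-size split_deferred counter, but the flow above is the gist.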

If everyone is happy with it, Andrew could replace the original fix patch in [1] with below.

[1] https://lore.kernel.org/all/20240814200220.F1964C116B1@smtp.kernel.org/

commit c627655548fa09b59849e942da4decc84fa0b0f2
Author: Usama Arif <usamaarif642@gmail.com>
Date:   Thu Aug 15 16:07:20 2024 +0100

    mm: Introduce a pageflag for partially mapped folios fix
    
    Fixes the original commit by not clearing the partially mapped bit
    in hugeTLB folios and by fixing the deferred split THP stats.
    
    Signed-off-by: Usama Arif <usamaarif642@gmail.com>

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index cecc1bad7910..7bee743ede40 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -863,6 +863,7 @@ static inline void ClearPageCompound(struct page *page)
 }
 FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
 FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
+FOLIO_TEST_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
 #else
 FOLIO_FLAG_FALSE(large_rmappable)
 FOLIO_FLAG_FALSE(partially_mapped)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c024ab0f745c..24371e4ef19b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3459,6 +3459,7 @@ void __folio_undo_large_rmappable(struct folio *folio)
        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 }
 
+/* partially_mapped=false won't clear PG_partially_mapped folio flag */
 void deferred_split_folio(struct folio *folio, bool partially_mapped)
 {
        struct deferred_split *ds_queue = get_deferred_split_queue(folio);
@@ -3488,16 +3489,17 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
                return;
 
        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
-       if (partially_mapped)
-               folio_set_partially_mapped(folio);
-       else
-               folio_clear_partially_mapped(folio);
-       if (list_empty(&folio->_deferred_list)) {
-               if (partially_mapped) {
+       if (partially_mapped) {
+               if (!folio_test_set_partially_mapped(folio)) {
                        if (folio_test_pmd_mappable(folio))
                                count_vm_event(THP_DEFERRED_SPLIT_PAGE);
                        count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
                }
+       } else {
+               /* partially mapped folios cannot become non-partially mapped */
+               VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
+       }
+       if (list_empty(&folio->_deferred_list)) {
                list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
                ds_queue->split_queue_len++;
 #ifdef CONFIG_MEMCG
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2ae2d9a18e40..1fdd9eab240c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1758,7 +1758,6 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
                free_gigantic_folio(folio, huge_page_order(h));
        } else {
                INIT_LIST_HEAD(&folio->_deferred_list);
-               folio_clear_partially_mapped(folio);
                folio_put(folio);
        }
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 9ad558c2bad0..4c330635aa4e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1578,7 +1578,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
         * Check partially_mapped first to ensure it is a large folio.
         */
        if (partially_mapped && folio_test_anon(folio) &&
-           list_empty(&folio->_deferred_list))
+           !folio_test_partially_mapped(folio))
                deferred_split_folio(folio, true);
 
        __folio_mod_stat(folio, -nr, -nr_pmdmapped);
David Hildenbrand Aug. 15, 2024, 4:33 p.m. UTC | #15
On 13.08.24 14:02, Usama Arif wrote:
> Currently folio->_deferred_list is used to keep track of
> partially_mapped folios that are going to be split under memory
> pressure. In the next patch, all THPs that are faulted in and collapsed
> by khugepaged are also going to be tracked using _deferred_list.
> 
> This patch introduces a pageflag to be able to distinguish between
> partially mapped folios and others in the deferred_list at split time in
> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> possible to distinguish between partially mapped folios and others in
> deferred_split_scan.
> 
> Eventhough it introduces an extra flag to track if the folio is
> partially mapped, there is no functional change intended with this
> patch and the flag is not useful in this patch itself, it will
> become useful in the next patch when _deferred_list has non partially
> mapped folios.
> 
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> ---
>   include/linux/huge_mm.h    |  4 ++--
>   include/linux/page-flags.h |  3 +++
>   mm/huge_memory.c           | 21 +++++++++++++--------
>   mm/hugetlb.c               |  1 +
>   mm/internal.h              |  4 +++-
>   mm/memcontrol.c            |  3 ++-
>   mm/migrate.c               |  3 ++-
>   mm/page_alloc.c            |  5 +++--
>   mm/rmap.c                  |  3 ++-
>   mm/vmscan.c                |  3 ++-
>   10 files changed, 33 insertions(+), 17 deletions(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 4c32058cacfe..969f11f360d2 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
>   {
>   	return split_huge_page_to_list_to_order(page, NULL, 0);
>   }
> -void deferred_split_folio(struct folio *folio);
> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
>   
>   void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>   		unsigned long address, bool freeze, struct folio *folio);
> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
>   {
>   	return 0;
>   }
> -static inline void deferred_split_folio(struct folio *folio) {}
> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
>   #define split_huge_pmd(__vma, __pmd, __address)	\
>   	do { } while (0)
>   
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index a0a29bd092f8..cecc1bad7910 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -182,6 +182,7 @@ enum pageflags {
>   	/* At least one page in this folio has the hwpoison flag set */
>   	PG_has_hwpoisoned = PG_active,
>   	PG_large_rmappable = PG_workingset, /* anon or file-backed */
> +	PG_partially_mapped, /* was identified to be partially mapped */
>   };
>   
>   #define PAGEFLAGS_MASK		((1UL << NR_PAGEFLAGS) - 1)
> @@ -861,8 +862,10 @@ static inline void ClearPageCompound(struct page *page)
>   	ClearPageHead(page);
>   }
>   FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>   #else
>   FOLIO_FLAG_FALSE(large_rmappable)
> +FOLIO_FLAG_FALSE(partially_mapped)
>   #endif
>   
>   #define PG_head_mask ((1UL << PG_head))
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 6df0e9f4f56c..c024ab0f745c 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3397,6 +3397,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>   			 * page_deferred_list.
>   			 */
>   			list_del_init(&folio->_deferred_list);
> +			folio_clear_partially_mapped(folio);
>   		}
>   		spin_unlock(&ds_queue->split_queue_lock);
>   		if (mapping) {
> @@ -3453,11 +3454,12 @@ void __folio_undo_large_rmappable(struct folio *folio)
>   	if (!list_empty(&folio->_deferred_list)) {
>   		ds_queue->split_queue_len--;
>   		list_del_init(&folio->_deferred_list);
> +		folio_clear_partially_mapped(folio);
>   	}
>   	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>   }
>   
> -void deferred_split_folio(struct folio *folio)
> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>   {
	/* We lost race with folio_put() */>   		 
list_del_init(&folio->_deferred_list);
> +			folio_clear_partially_mapped(folio);
>   			ds_queue->split_queue_len--;
>   		}
>   		if (!--sc->nr_to_scan)
> @@ -3558,7 +3564,6 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>   next:
>   		folio_put(folio);
>   	}
> -
>   	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>   	list_splice_tail(&list, &ds_queue->split_queue);
>   	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 1fdd9eab240c..2ae2d9a18e40 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1758,6 +1758,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
>   		free_gigantic_folio(folio, huge_page_order(h));
>   	} else {
>   		INIT_LIST_HEAD(&folio->_deferred_list);
> +		folio_clear_partially_mapped(folio);
>   		folio_put(folio);
>   	}
>   }
> diff --git a/mm/internal.h b/mm/internal.h
> index 52f7fc4e8ac3..d64546b8d377 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -662,8 +662,10 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
>   	atomic_set(&folio->_entire_mapcount, -1);
>   	atomic_set(&folio->_nr_pages_mapped, 0);
>   	atomic_set(&folio->_pincount, 0);
> -	if (order > 1)
> +	if (order > 1) {
>   		INIT_LIST_HEAD(&folio->_deferred_list);
> +		folio_clear_partially_mapped(folio);

Can we use the non-atomic version here?
Usama Arif Aug. 15, 2024, 5:10 p.m. UTC | #16
On 15/08/2024 17:33, David Hildenbrand wrote:
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 6df0e9f4f56c..c024ab0f745c 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3397,6 +3397,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>                * page_deferred_list.
>>                */
>>               list_del_init(&folio->_deferred_list);
>> +            folio_clear_partially_mapped(folio);
>>           }
>>           spin_unlock(&ds_queue->split_queue_lock);
>>           if (mapping) {
>> @@ -3453,11 +3454,12 @@ void __folio_undo_large_rmappable(struct folio *folio)
>>       if (!list_empty(&folio->_deferred_list)) {
>>           ds_queue->split_queue_len--;
>>           list_del_init(&folio->_deferred_list);
>> +        folio_clear_partially_mapped(folio);
>>       }
>>       spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>>   }
>>   -void deferred_split_folio(struct folio *folio)
>> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>>   {
>     /* We lost race with folio_put() */>            list_del_init(&folio->_deferred_list);

Was there some comment here? I just see the ">" removed from the start of /* We lost race with folio_put() */

>> +            folio_clear_partially_mapped(folio);
>>               ds_queue->split_queue_len--;
>>           }
>>           if (!--sc->nr_to_scan)
>> @@ -3558,7 +3564,6 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>>   next:
>>           folio_put(folio);
>>       }
>> -
>>       spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>>       list_splice_tail(&list, &ds_queue->split_queue);
>>       spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 1fdd9eab240c..2ae2d9a18e40 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -1758,6 +1758,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
>>           free_gigantic_folio(folio, huge_page_order(h));
>>       } else {
>>           INIT_LIST_HEAD(&folio->_deferred_list);
>> +        folio_clear_partially_mapped(folio);
>>           folio_put(folio);
>>       }
>>   }
>> diff --git a/mm/internal.h b/mm/internal.h
>> index 52f7fc4e8ac3..d64546b8d377 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -662,8 +662,10 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
>>       atomic_set(&folio->_entire_mapcount, -1);
>>       atomic_set(&folio->_nr_pages_mapped, 0);
>>       atomic_set(&folio->_pincount, 0);
>> -    if (order > 1)
>> +    if (order > 1) {
>>           INIT_LIST_HEAD(&folio->_deferred_list);
>> +        folio_clear_partially_mapped(folio);
> 
> Can we use the non-atomic version here?
> 

I believe we can use the non-atomic version in all places where set/clear is done, as all set/clear calls are protected by ds_queue->split_queue_lock. So basically we could replace all folio_set/clear_partially_mapped calls with __folio_set/clear_partially_mapped.
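
For illustration, a minimal sketch of how the call sites could then look,
assuming the non-atomic __folio_set/__folio_clear helpers get declared in
page-flags.h next to the existing partially_mapped ones (not the final code):

	/*
	 * Sketch only: every set/clear below runs with
	 * ds_queue->split_queue_lock held, so the non-atomic bitops
	 * are sufficient.
	 */

	/* deferred_split_folio(), under split_queue_lock */
	if (partially_mapped) {
		if (!folio_test_partially_mapped(folio)) {
			__folio_set_partially_mapped(folio);
			if (folio_test_pmd_mappable(folio))
				count_vm_event(THP_DEFERRED_SPLIT_PAGE);
			count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
		}
	}

	/* removal paths (split / undo_large_rmappable), also under the lock */
	list_del_init(&folio->_deferred_list);
	__folio_clear_partially_mapped(folio);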

But I guess it's likely not going to make much difference? I will do it anyway in the next revision, rather than sending a fix patch. There haven't been any reviews for patch 5, so I will wait a few days for any comments on that.

Thanks
Barry Song Aug. 15, 2024, 9:06 p.m. UTC | #17
On Fri, Aug 16, 2024 at 5:10 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 15/08/2024 17:33, David Hildenbrand wrote:
> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >> index 6df0e9f4f56c..c024ab0f745c 100644
> >> --- a/mm/huge_memory.c
> >> +++ b/mm/huge_memory.c
> >> @@ -3397,6 +3397,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >>                * page_deferred_list.
> >>                */
> >>               list_del_init(&folio->_deferred_list);
> >> +            folio_clear_partially_mapped(folio);
> >>           }
> >>           spin_unlock(&ds_queue->split_queue_lock);
> >>           if (mapping) {
> >> @@ -3453,11 +3454,12 @@ void __folio_undo_large_rmappable(struct folio *folio)
> >>       if (!list_empty(&folio->_deferred_list)) {
> >>           ds_queue->split_queue_len--;
> >>           list_del_init(&folio->_deferred_list);
> >> +        folio_clear_partially_mapped(folio);
> >>       }
> >>       spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> >>   }
> >>   -void deferred_split_folio(struct folio *folio)
> >> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
> >>   {
> >     /* We lost race with folio_put() */>            list_del_init(&folio->_deferred_list);
>
> Was there some comment here? I just see ">" remove from the start of /* We lost race with folio_put() */
>
> >> +            folio_clear_partially_mapped(folio);
> >>               ds_queue->split_queue_len--;
> >>           }
> >>           if (!--sc->nr_to_scan)
> >> @@ -3558,7 +3564,6 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> >>   next:
> >>           folio_put(folio);
> >>       }
> >> -
> >>       spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> >>       list_splice_tail(&list, &ds_queue->split_queue);
> >>       spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> >> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> >> index 1fdd9eab240c..2ae2d9a18e40 100644
> >> --- a/mm/hugetlb.c
> >> +++ b/mm/hugetlb.c
> >> @@ -1758,6 +1758,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
> >>           free_gigantic_folio(folio, huge_page_order(h));
> >>       } else {
> >>           INIT_LIST_HEAD(&folio->_deferred_list);
> >> +        folio_clear_partially_mapped(folio);
> >>           folio_put(folio);
> >>       }
> >>   }
> >> diff --git a/mm/internal.h b/mm/internal.h
> >> index 52f7fc4e8ac3..d64546b8d377 100644
> >> --- a/mm/internal.h
> >> +++ b/mm/internal.h
> >> @@ -662,8 +662,10 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
> >>       atomic_set(&folio->_entire_mapcount, -1);
> >>       atomic_set(&folio->_nr_pages_mapped, 0);
> >>       atomic_set(&folio->_pincount, 0);
> >> -    if (order > 1)
> >> +    if (order > 1) {
> >>           INIT_LIST_HEAD(&folio->_deferred_list);
> >> +        folio_clear_partially_mapped(folio);
> >
> > Can we use the non-atomic version here?
> >
>
> I believe we can use the non-atomic version in all places where set/clear is done as all set/clear are protected by ds_queue->split_queue_lock. So basically could replace all folio_set/clear_partially_mapped with __folio_set/clear_partially_mapped.
>

Right. That is why I thought the below is actually safe,
but I appreciate a test_set of course (and non-atomic):

+               if (!folio_test_partially_mapped(folio)) {
+                       folio_set_partially_mapped(folio);
+                       if (folio_test_pmd_mappable(folio))
+                               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
+                       count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
+               }


> But I guess its likely not going to make much difference? I will do it anyways in the next revision, rather than sending a fix patch. There haven't been any reviews for patch 5 so will wait a few days for any comments on that.
>
> Thanks

Thanks
Barry
David Hildenbrand Aug. 15, 2024, 9:08 p.m. UTC | #18
> 
> Was there some comment here? I just see ">" remove from the start of /* We lost race with folio_put() */
> 

Likely I wanted to comment something but decided otherwise, sorry :)

>>> +            folio_clear_partially_mapped(folio);
>>>                ds_queue->split_queue_len--;
>>>            }
>>>            if (!--sc->nr_to_scan)
>>> @@ -3558,7 +3564,6 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>>>    next:
>>>            folio_put(folio);
>>>        }
>>> -
>>>        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>>>        list_splice_tail(&list, &ds_queue->split_queue);
>>>        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>> index 1fdd9eab240c..2ae2d9a18e40 100644
>>> --- a/mm/hugetlb.c
>>> +++ b/mm/hugetlb.c
>>> @@ -1758,6 +1758,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
>>>            free_gigantic_folio(folio, huge_page_order(h));
>>>        } else {
>>>            INIT_LIST_HEAD(&folio->_deferred_list);
>>> +        folio_clear_partially_mapped(folio);
>>>            folio_put(folio);
>>>        }
>>>    }
>>> diff --git a/mm/internal.h b/mm/internal.h
>>> index 52f7fc4e8ac3..d64546b8d377 100644
>>> --- a/mm/internal.h
>>> +++ b/mm/internal.h
>>> @@ -662,8 +662,10 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
>>>        atomic_set(&folio->_entire_mapcount, -1);
>>>        atomic_set(&folio->_nr_pages_mapped, 0);
>>>        atomic_set(&folio->_pincount, 0);
>>> -    if (order > 1)
>>> +    if (order > 1) {
>>>            INIT_LIST_HEAD(&folio->_deferred_list);
>>> +        folio_clear_partially_mapped(folio);
>>
>> Can we use the non-atomic version here?
>>
> 
> I believe we can use the non-atomic version in all places where set/clear is done as all set/clear are protected by ds_queue->split_queue_lock. So basically could replace all folio_set/clear_partially_mapped with __folio_set/clear_partially_mapped.
> 
> But I guess its likely not going to make much difference? I will do it anyways in the next revision, rather than sending a fix patch. There haven't been any reviews for patch 5 so will wait a few days for any comments on that.

If we can avoid atomics, please do! :)
Andrew Morton Aug. 15, 2024, 11:30 p.m. UTC | #19
On Thu, 15 Aug 2024 16:25:09 +0100 Usama Arif <usamaarif642@gmail.com> wrote:

> 
> 
> commit c627655548fa09b59849e942da4decc84fa0b0f2
> Author: Usama Arif <usamaarif642@gmail.com>
> Date:   Thu Aug 15 16:07:20 2024 +0100
> 
>     mm: Introduce a pageflag for partially mapped folios fix
>     
>     Fixes the original commit by not clearing partially mapped bit
>     in hugeTLB folios and fixing deferred split THP stats.
>
> ...
>

Life is getting complicated.

> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1758,7 +1758,6 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
>                 free_gigantic_folio(folio, huge_page_order(h));
>         } else {
>                 INIT_LIST_HEAD(&folio->_deferred_list);
> -               folio_clear_partially_mapped(folio);
>                 folio_put(folio);
>         }
>  }

Yu Zhao's "mm/hugetlb: use __GFP_COMP for gigantic folios" was
expecting that folio_clear_partially_mapped() call to be there.

I resolved this within
mm-hugetlb-use-__gfp_comp-for-gigantic-folios.patch thusly:

@@ -1748,18 +1704,8 @@ static void __update_and_free_hugetlb_fo
 
 	folio_ref_unfreeze(folio, 1);
 
-	/*
-	 * Non-gigantic pages demoted from CMA allocated gigantic pages
-	 * need to be given back to CMA in free_gigantic_folio.
-	 */
-	if (hstate_is_gigantic(h) ||
-	    hugetlb_cma_folio(folio, huge_page_order(h))) {
-		destroy_compound_gigantic_folio(folio, huge_page_order(h));
-		free_gigantic_folio(folio, huge_page_order(h));
-	} else {
-		INIT_LIST_HEAD(&folio->_deferred_list);
-		folio_put(folio);
-	}
+	INIT_LIST_HEAD(&folio->_deferred_list);
+	hugetlb_free_folio(folio);
 }
 
 /*

Please check.
Yu Zhao Aug. 16, 2024, 2:50 a.m. UTC | #20
On Thu, Aug 15, 2024 at 5:30 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Thu, 15 Aug 2024 16:25:09 +0100 Usama Arif <usamaarif642@gmail.com> wrote:
>
> >
> >
> > commit c627655548fa09b59849e942da4decc84fa0b0f2
> > Author: Usama Arif <usamaarif642@gmail.com>
> > Date:   Thu Aug 15 16:07:20 2024 +0100
> >
> >     mm: Introduce a pageflag for partially mapped folios fix
> >
> >     Fixes the original commit by not clearing partially mapped bit
> >     in hugeTLB folios and fixing deferred split THP stats.
> >
> > ...
> >
>
> Life is getting complicated.
>
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1758,7 +1758,6 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
> >                 free_gigantic_folio(folio, huge_page_order(h));
> >         } else {
> >                 INIT_LIST_HEAD(&folio->_deferred_list);
> > -               folio_clear_partially_mapped(folio);
> >                 folio_put(folio);
> >         }
> >  }
>
> Yu Zhao's "mm/hugetlb: use __GFP_COMP for gigantic folios" was
> expecting that folio_clear_partially_mapped() to be there.
>
> I resolved this within
> mm-hugetlb-use-__gfp_comp-for-gigantic-folios.patch thusly:
>
> @@ -1748,18 +1704,8 @@ static void __update_and_free_hugetlb_fo
>
>         folio_ref_unfreeze(folio, 1);
>
> -       /*
> -        * Non-gigantic pages demoted from CMA allocated gigantic pages
> -        * need to be given back to CMA in free_gigantic_folio.
> -        */
> -       if (hstate_is_gigantic(h) ||
> -           hugetlb_cma_folio(folio, huge_page_order(h))) {
> -               destroy_compound_gigantic_folio(folio, huge_page_order(h));
> -               free_gigantic_folio(folio, huge_page_order(h));
> -       } else {
> -               INIT_LIST_HEAD(&folio->_deferred_list);
> -               folio_put(folio);
> -       }
> +       INIT_LIST_HEAD(&folio->_deferred_list);
> +       hugetlb_free_folio(folio);
>  }
>
>  /*
>
> Please check.

Confirmed, thanks.
Matthew Wilcox (Oracle) Aug. 16, 2024, 3:44 p.m. UTC | #21
On Tue, Aug 13, 2024 at 01:02:47PM +0100, Usama Arif wrote:
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index a0a29bd092f8..cecc1bad7910 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -182,6 +182,7 @@ enum pageflags {
>  	/* At least one page in this folio has the hwpoison flag set */
>  	PG_has_hwpoisoned = PG_active,
>  	PG_large_rmappable = PG_workingset, /* anon or file-backed */
> +	PG_partially_mapped, /* was identified to be partially mapped */

No, you can't do this.  You have to be really careful when reusing page
flags, you can't just take the next one.  What made you think it would
be this easy?

I'd suggest using PG_reclaim.  You also need to add PG_partially_mapped
to PAGE_FLAGS_SECOND.  You might get away without that if you're
guaranteeing it'll always be clear when you free the folio; I don't
understand this series so I don't know if that's true or not.
Usama Arif Aug. 16, 2024, 4:08 p.m. UTC | #22
On 16/08/2024 16:44, Matthew Wilcox wrote:
> On Tue, Aug 13, 2024 at 01:02:47PM +0100, Usama Arif wrote:
>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
>> index a0a29bd092f8..cecc1bad7910 100644
>> --- a/include/linux/page-flags.h
>> +++ b/include/linux/page-flags.h
>> @@ -182,6 +182,7 @@ enum pageflags {
>>  	/* At least one page in this folio has the hwpoison flag set */
>>  	PG_has_hwpoisoned = PG_active,
>>  	PG_large_rmappable = PG_workingset, /* anon or file-backed */
>> +	PG_partially_mapped, /* was identified to be partially mapped */
> 
> No, you can't do this.  You have to be really careful when reusing page
> flags, you can't just take the next one.  What made you think it would
> be this easy?
> 
> I'd suggest using PG_reclaim.  You also need to add PG_partially_mapped
> to PAGE_FLAGS_SECOND.  You might get away without that if you're
> guaranteeing it'll always be clear when you free the folio; I don't
> understand this series so I don't know if that's true or not.

I am really not sure what the issue is over here.

From what I see, bits 0-7 of folio->_flags_1 are used for storing folio order, bit 8 for PG_has_hwpoisoned and bit 9 for PG_large_rmappable.
Bits 10 and above of folio->_flags_1 are not used anywhere in the kernel. I am not reusing a page flag of folio->_flags_1, just taking an unused one.

Please have a look at the next few lines of the patch. I have defined the functions as FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE). I believe that's what you are saying in your second paragraph?
I am not sure what you meant by using PG_reclaim.

I have added the next few lines of the patch below:

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a0a29bd092f8..cecc1bad7910 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -182,6 +182,7 @@ enum pageflags {
 	/* At least one page in this folio has the hwpoison flag set */
 	PG_has_hwpoisoned = PG_active,
 	PG_large_rmappable = PG_workingset, /* anon or file-backed */
+	PG_partially_mapped, /* was identified to be partially mapped */
 };
 
 #define PAGEFLAGS_MASK		((1UL << NR_PAGEFLAGS) - 1)
@@ -861,8 +862,10 @@ static inline void ClearPageCompound(struct page *page)
 	ClearPageHead(page);
 }
 FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
+FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
 #else
 FOLIO_FLAG_FALSE(large_rmappable)
Matthew Wilcox (Oracle) Aug. 16, 2024, 4:28 p.m. UTC | #23
On Fri, Aug 16, 2024 at 05:08:35PM +0100, Usama Arif wrote:
> On 16/08/2024 16:44, Matthew Wilcox wrote:
> > On Tue, Aug 13, 2024 at 01:02:47PM +0100, Usama Arif wrote:
> >> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> >> index a0a29bd092f8..cecc1bad7910 100644
> >> --- a/include/linux/page-flags.h
> >> +++ b/include/linux/page-flags.h
> >> @@ -182,6 +182,7 @@ enum pageflags {
> >>  	/* At least one page in this folio has the hwpoison flag set */
> >>  	PG_has_hwpoisoned = PG_active,
> >>  	PG_large_rmappable = PG_workingset, /* anon or file-backed */
> >> +	PG_partially_mapped, /* was identified to be partially mapped */
> > 
> > No, you can't do this.  You have to be really careful when reusing page
> > flags, you can't just take the next one.  What made you think it would
> > be this easy?
> > 
> > I'd suggest using PG_reclaim.  You also need to add PG_partially_mapped
> > to PAGE_FLAGS_SECOND.  You might get away without that if you're
> > guaranteeing it'll always be clear when you free the folio; I don't
> > understand this series so I don't know if that's true or not.
> 
> I am really not sure what the issue is over here.

You've made the code more fragile.  It might happen to work today, but
you've either done something which is subtly broken today, or might
break tomorrow when somebody else rearranges the flags without knowing
about your fragility.

> >From what I see, bits 0-7 of folio->_flags_1 are used for storing folio order, bit 8 for PG_has_hwpoisoned and bit 9 for PG_large_rmappable.
> Bits 10 and above of folio->_flags_1 are not used any anywhere in the kernel. I am not reusing a page flag of folio->_flags_1, just taking an unused one.

No, wrong.  PG_anon_exclusive is used on every page, including tail
pages, and that's above bit 10.

> Please have a look at the next few lines of the patch. I have defined the functions as FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE). I believe thats what you are saying in your second paragraph?
> I am not sure what you meant by using PG_reclaim.

I mean:

-	PG_usama_new_thing,
+	PG_usama_new_thing = PG_reclaim,
Usama Arif Aug. 16, 2024, 4:41 p.m. UTC | #24
On 16/08/2024 17:28, Matthew Wilcox wrote:
> On Fri, Aug 16, 2024 at 05:08:35PM +0100, Usama Arif wrote:
>> On 16/08/2024 16:44, Matthew Wilcox wrote:
>>> On Tue, Aug 13, 2024 at 01:02:47PM +0100, Usama Arif wrote:
>>>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
>>>> index a0a29bd092f8..cecc1bad7910 100644
>>>> --- a/include/linux/page-flags.h
>>>> +++ b/include/linux/page-flags.h
>>>> @@ -182,6 +182,7 @@ enum pageflags {
>>>>  	/* At least one page in this folio has the hwpoison flag set */
>>>>  	PG_has_hwpoisoned = PG_active,
>>>>  	PG_large_rmappable = PG_workingset, /* anon or file-backed */
>>>> +	PG_partially_mapped, /* was identified to be partially mapped */
>>>
>>> No, you can't do this.  You have to be really careful when reusing page
>>> flags, you can't just take the next one.  What made you think it would
>>> be this easy?
>>>
>>> I'd suggest using PG_reclaim.  You also need to add PG_partially_mapped
>>> to PAGE_FLAGS_SECOND.  You might get away without that if you're
>>> guaranteeing it'll always be clear when you free the folio; I don't
>>> understand this series so I don't know if that's true or not.
>>
>> I am really not sure what the issue is over here.
> 
> You've made the code more fragile.  It might happen to work today, but
> you've either done something which is subtly broken today, or might
> break tomorrow when somebody else rearranges the flags without knowing
> about your fragility.
> 
>> >From what I see, bits 0-7 of folio->_flags_1 are used for storing folio order, bit 8 for PG_has_hwpoisoned and bit 9 for PG_large_rmappable.
>> Bits 10 and above of folio->_flags_1 are not used any anywhere in the kernel. I am not reusing a page flag of folio->_flags_1, just taking an unused one.
> 
> No, wrong.  PG_anon_exclusive is used on every page, including tail
> pages, and that's above bit 10.
> 
>> Please have a look at the next few lines of the patch. I have defined the functions as FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE). I believe thats what you are saying in your second paragraph?
>> I am not sure what you meant by using PG_reclaim.
> 
> I mean:
> 
> -	PG_usama_new_thing,
> +	PG_usama_new_thing = PG_reclaim,
> 

Ah ok, thanks. The flags below PG_reclaim are either marked as PF_ANY or are arch dependent, so even though they might not be used currently for _flags_1, they could be in the future.

I will use PG_partially_mapped = PG_reclaim in the next revision.
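
i.e. roughly this in the pageflags enum (sketch of the planned change, aliasing
an existing bit rather than taking a new one):

	PG_large_rmappable = PG_workingset, /* anon or file-backed */
	PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */

And per your PAGE_FLAGS_SECOND remark, PG_partially_mapped would presumably also
need adding there unless it is guaranteed to be clear by free time.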

Thanks
diff mbox series

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 4c32058cacfe..969f11f360d2 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -321,7 +321,7 @@  static inline int split_huge_page(struct page *page)
 {
 	return split_huge_page_to_list_to_order(page, NULL, 0);
 }
-void deferred_split_folio(struct folio *folio);
+void deferred_split_folio(struct folio *folio, bool partially_mapped);
 
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long address, bool freeze, struct folio *folio);
@@ -495,7 +495,7 @@  static inline int split_huge_page(struct page *page)
 {
 	return 0;
 }
-static inline void deferred_split_folio(struct folio *folio) {}
+static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
 #define split_huge_pmd(__vma, __pmd, __address)	\
 	do { } while (0)
 
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a0a29bd092f8..cecc1bad7910 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -182,6 +182,7 @@  enum pageflags {
 	/* At least one page in this folio has the hwpoison flag set */
 	PG_has_hwpoisoned = PG_active,
 	PG_large_rmappable = PG_workingset, /* anon or file-backed */
+	PG_partially_mapped, /* was identified to be partially mapped */
 };
 
 #define PAGEFLAGS_MASK		((1UL << NR_PAGEFLAGS) - 1)
@@ -861,8 +862,10 @@  static inline void ClearPageCompound(struct page *page)
 	ClearPageHead(page);
 }
 FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
+FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
 #else
 FOLIO_FLAG_FALSE(large_rmappable)
+FOLIO_FLAG_FALSE(partially_mapped)
 #endif
 
 #define PG_head_mask ((1UL << PG_head))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6df0e9f4f56c..c024ab0f745c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3397,6 +3397,7 @@  int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 			 * page_deferred_list.
 			 */
 			list_del_init(&folio->_deferred_list);
+			folio_clear_partially_mapped(folio);
 		}
 		spin_unlock(&ds_queue->split_queue_lock);
 		if (mapping) {
@@ -3453,11 +3454,12 @@  void __folio_undo_large_rmappable(struct folio *folio)
 	if (!list_empty(&folio->_deferred_list)) {
 		ds_queue->split_queue_len--;
 		list_del_init(&folio->_deferred_list);
+		folio_clear_partially_mapped(folio);
 	}
 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 }
 
-void deferred_split_folio(struct folio *folio)
+void deferred_split_folio(struct folio *folio, bool partially_mapped)
 {
 	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
 #ifdef CONFIG_MEMCG
@@ -3485,14 +3487,17 @@  void deferred_split_folio(struct folio *folio)
 	if (folio_test_swapcache(folio))
 		return;
 
-	if (!list_empty(&folio->_deferred_list))
-		return;
-
 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+	if (partially_mapped)
+		folio_set_partially_mapped(folio);
+	else
+		folio_clear_partially_mapped(folio);
 	if (list_empty(&folio->_deferred_list)) {
-		if (folio_test_pmd_mappable(folio))
-			count_vm_event(THP_DEFERRED_SPLIT_PAGE);
-		count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
+		if (partially_mapped) {
+			if (folio_test_pmd_mappable(folio))
+				count_vm_event(THP_DEFERRED_SPLIT_PAGE);
+			count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
+		}
 		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
 		ds_queue->split_queue_len++;
 #ifdef CONFIG_MEMCG
@@ -3541,6 +3546,7 @@  static unsigned long deferred_split_scan(struct shrinker *shrink,
 		} else {
 			/* We lost race with folio_put() */
 			list_del_init(&folio->_deferred_list);
+			folio_clear_partially_mapped(folio);
 			ds_queue->split_queue_len--;
 		}
 		if (!--sc->nr_to_scan)
@@ -3558,7 +3564,6 @@  static unsigned long deferred_split_scan(struct shrinker *shrink,
 next:
 		folio_put(folio);
 	}
-
 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
 	list_splice_tail(&list, &ds_queue->split_queue);
 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1fdd9eab240c..2ae2d9a18e40 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1758,6 +1758,7 @@  static void __update_and_free_hugetlb_folio(struct hstate *h,
 		free_gigantic_folio(folio, huge_page_order(h));
 	} else {
 		INIT_LIST_HEAD(&folio->_deferred_list);
+		folio_clear_partially_mapped(folio);
 		folio_put(folio);
 	}
 }
diff --git a/mm/internal.h b/mm/internal.h
index 52f7fc4e8ac3..d64546b8d377 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -662,8 +662,10 @@  static inline void prep_compound_head(struct page *page, unsigned int order)
 	atomic_set(&folio->_entire_mapcount, -1);
 	atomic_set(&folio->_nr_pages_mapped, 0);
 	atomic_set(&folio->_pincount, 0);
-	if (order > 1)
+	if (order > 1) {
 		INIT_LIST_HEAD(&folio->_deferred_list);
+		folio_clear_partially_mapped(folio);
+	}
 }
 
 static inline void prep_compound_tail(struct page *head, int tail_idx)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e1ffd2950393..0fd95daecf9a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4669,7 +4669,8 @@  static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
 	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 	VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
 			!folio_test_hugetlb(folio) &&
-			!list_empty(&folio->_deferred_list), folio);
+			!list_empty(&folio->_deferred_list) &&
+			folio_test_partially_mapped(folio), folio);
 
 	/*
 	 * Nobody should be changing or seriously looking at
diff --git a/mm/migrate.c b/mm/migrate.c
index 3288ac041d03..6e32098ac2dc 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1734,7 +1734,8 @@  static int migrate_pages_batch(struct list_head *from,
 			 * use _deferred_list.
 			 */
 			if (nr_pages > 2 &&
-			   !list_empty(&folio->_deferred_list)) {
+			   !list_empty(&folio->_deferred_list) &&
+			   folio_test_partially_mapped(folio)) {
 				if (!try_split_folio(folio, split_folios, mode)) {
 					nr_failed++;
 					stats->nr_thp_failed += is_thp;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 408ef3d25cf5..a145c550dd2a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -957,8 +957,9 @@  static int free_tail_page_prepare(struct page *head_page, struct page *page)
 		break;
 	case 2:
 		/* the second tail page: deferred_list overlaps ->mapping */
-		if (unlikely(!list_empty(&folio->_deferred_list))) {
-			bad_page(page, "on deferred list");
+		if (unlikely(!list_empty(&folio->_deferred_list) &&
+		    folio_test_partially_mapped(folio))) {
+			bad_page(page, "partially mapped folio on deferred list");
 			goto out;
 		}
 		break;
diff --git a/mm/rmap.c b/mm/rmap.c
index a6b9cd0b2b18..9ad558c2bad0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1579,7 +1579,8 @@  static __always_inline void __folio_remove_rmap(struct folio *folio,
 	 */
 	if (partially_mapped && folio_test_anon(folio) &&
 	    list_empty(&folio->_deferred_list))
-		deferred_split_folio(folio);
+		deferred_split_folio(folio, true);
+
 	__folio_mod_stat(folio, -nr, -nr_pmdmapped);
 
 	/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 25e43bb3b574..25f4e8403f41 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1233,7 +1233,8 @@  static unsigned int shrink_folio_list(struct list_head *folio_list,
 					 * Split partially mapped folios right away.
 					 * We can free the unmapped pages without IO.
 					 */
-					if (data_race(!list_empty(&folio->_deferred_list)) &&
+					if (data_race(!list_empty(&folio->_deferred_list) &&
+					    folio_test_partially_mapped(folio)) &&
 					    split_folio_to_list(folio, folio_list))
 						goto activate_locked;
 				}