diff mbox series

[v6,08/15] mm/khugepaged: add flag to ignore THP sysfs enabled

Message ID 20220604004004.954674-9-zokeefe@google.com (mailing list archive)
State New
Headers show
Series mm: userspace hugepage collapse | expand

Commit Message

Zach O'Keefe June 4, 2022, 12:39 a.m. UTC
Add enforce_thp_enabled flag to struct collapse_control that allows context
to ignore constraints imposed by /sys/kernel/mm/transparent_hugepage/enabled.

This flag is set in khugepaged collapse context to preserve existing
khugepaged behavior.

This flag will be used (unset) when introducing madvise collapse
context since the desired THP semantics of MADV_COLLAPSE aren't coupled
to sysfs THP settings.  Most notably, for the purpose of eventual
madvise_collapse(2) support, this allows userspace to trigger THP collapse
on behalf of another process, without adding support to meddle with
the VMA flags of said process, or change sysfs THP settings.

For now, limit this flag to /sys/kernel/mm/transparent_hugepage/enabled,
but it can be expanded to include
/sys/kernel/mm/transparent_hugepage/shmem_enabled later.

Link: https://lore.kernel.org/linux-mm/CAAa6QmQxay1_=Pmt8oCX2-Va18t44FV-Vs-WsQt_6+qBks4nZA@mail.gmail.com/

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
 mm/khugepaged.c | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

Comments

Yang Shi June 6, 2022, 11:02 p.m. UTC | #1
On Fri, Jun 3, 2022 at 5:40 PM Zach O'Keefe <zokeefe@google.com> wrote:
>
> Add enforce_thp_enabled flag to struct collapse_control that allows context
> to ignore constraints imposed by /sys/kernel/transparent_hugepage/enabled.
>
> This flag is set in khugepaged collapse context to preserve existing
> khugepaged behavior.
>
> This flag will be used (unset) when introducing madvise collapse
> context since the desired THP semantics of MADV_COLLAPSE aren't coupled
> to sysfs THP settings.  Most notably, for the purpose of eventual
> madvise_collapse(2) support, this allows userspace to trigger THP collapse
> on behalf of another processes, without adding support to meddle with
> the VMA flags of said process, or change sysfs THP settings.
>
> For now, limit this flag to /sys/kernel/transparent_hugepage/enabled,
> but it can be expanded to include
> /sys/kernel/transparent_hugepage/shmem_enabled later.
>
> Link: https://lore.kernel.org/linux-mm/CAAa6QmQxay1_=Pmt8oCX2-Va18t44FV-Vs-WsQt_6+qBks4nZA@mail.gmail.com/
>
> Signed-off-by: Zach O'Keefe <zokeefe@google.com>

Looks good to me. Reviewed-by: Yang Shi <shy828301@gmail.com>

Just a reminder, I just posted series
https://lore.kernel.org/linux-mm/20220606214414.736109-1-shy828301@gmail.com/T/#m5dae2dfa4b247f3b3903951dd3a1f0978a927e16,
it changed some logic in hugepage_vma_check(). If your series gets in
after it, you may need some additional tweaks to disregard the sysfs THP
setting.

> ---
>  mm/khugepaged.c | 34 +++++++++++++++++++++++++++-------
>  1 file changed, 27 insertions(+), 7 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index c3589b3e238d..4ad04f552347 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -94,6 +94,11 @@ struct collapse_control {
>          */
>         bool enforce_page_heuristics;
>
> +       /* Enforce constraints of
> +        * /sys/kernel/mm/transparent_hugepage/enabled
> +        */
> +       bool enforce_thp_enabled;
> +
>         /* Num pages scanned per node */
>         int node_load[MAX_NUMNODES];
>
> @@ -893,10 +898,12 @@ static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
>   */
>
>  static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> -               struct vm_area_struct **vmap)
> +                                  struct vm_area_struct **vmap,
> +                                  struct collapse_control *cc)
>  {
>         struct vm_area_struct *vma;
>         unsigned long hstart, hend;
> +       unsigned long vma_flags;
>
>         if (unlikely(khugepaged_test_exit(mm)))
>                 return SCAN_ANY_PROCESS;
> @@ -909,7 +916,18 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
>         hend = vma->vm_end & HPAGE_PMD_MASK;
>         if (address < hstart || address + HPAGE_PMD_SIZE > hend)
>                 return SCAN_ADDRESS_RANGE;
> -       if (!hugepage_vma_check(vma, vma->vm_flags))
> +
> +       /*
> +        * If !cc->enforce_thp_enabled, set VM_HUGEPAGE so that
> +        * hugepage_vma_check() can pass even if
> +        * TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG is set (i.e. "madvise" mode).
> +        * Note that hugepage_vma_check() doesn't enforce that
> +        * TRANSPARENT_HUGEPAGE_FLAG or TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG
> +        * must be set (i.e. "never" mode).
> +        */
> +       vma_flags = cc->enforce_thp_enabled ?  vma->vm_flags
> +                       : vma->vm_flags | VM_HUGEPAGE;
> +       if (!hugepage_vma_check(vma, vma_flags))
>                 return SCAN_VMA_CHECK;
>         /* Anon VMA expected */
>         if (!vma->anon_vma || !vma_is_anonymous(vma))
> @@ -953,7 +971,8 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
>  static bool __collapse_huge_page_swapin(struct mm_struct *mm,
>                                         struct vm_area_struct *vma,
>                                         unsigned long haddr, pmd_t *pmd,
> -                                       int referenced)
> +                                       int referenced,
> +                                       struct collapse_control *cc)
>  {
>         int swapped_in = 0;
>         vm_fault_t ret = 0;
> @@ -980,7 +999,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
>                 /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
>                 if (ret & VM_FAULT_RETRY) {
>                         mmap_read_lock(mm);
> -                       if (hugepage_vma_revalidate(mm, haddr, &vma)) {
> +                       if (hugepage_vma_revalidate(mm, haddr, &vma, cc)) {
>                                 /* vma is no longer available, don't continue to swapin */
>                                 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
>                                 return false;
> @@ -1047,7 +1066,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>                 goto out_nolock;
>
>         mmap_read_lock(mm);
> -       result = hugepage_vma_revalidate(mm, address, &vma);
> +       result = hugepage_vma_revalidate(mm, address, &vma, cc);
>         if (result) {
>                 mmap_read_unlock(mm);
>                 goto out_nolock;
> @@ -1066,7 +1085,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>          * Continuing to collapse causes inconsistency.
>          */
>         if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
> -                                                    pmd, referenced)) {
> +                                                    pmd, referenced, cc)) {
>                 mmap_read_unlock(mm);
>                 goto out_nolock;
>         }
> @@ -1078,7 +1097,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>          * handled by the anon_vma lock + PG_lock.
>          */
>         mmap_write_lock(mm);
> -       result = hugepage_vma_revalidate(mm, address, &vma);
> +       result = hugepage_vma_revalidate(mm, address, &vma, cc);
>         if (result)
>                 goto out_up_write;
>         /* check if the pmd is still valid */
> @@ -2277,6 +2296,7 @@ static int khugepaged(void *none)
>         struct mm_slot *mm_slot;
>         struct collapse_control cc = {
>                 .enforce_page_heuristics = true,
> +               .enforce_thp_enabled = true,
>                 .last_target_node = NUMA_NO_NODE,
>                 /* .gfp set later  */
>         };
> --
> 2.36.1.255.ge46751e96f-goog
>
Peter Xu June 30, 2022, 2:32 a.m. UTC | #2
On Wed, Jun 29, 2022 at 06:42:25PM -0700, Zach O'Keefe wrote:
> On Jun 29 19:21, Peter Xu wrote:
> > On Fri, Jun 03, 2022 at 05:39:57PM -0700, Zach O'Keefe wrote:
> > > Add enforce_thp_enabled flag to struct collapse_control that allows context
> > > to ignore constraints imposed by /sys/kernel/transparent_hugepage/enabled.
> > >
> > > This flag is set in khugepaged collapse context to preserve existing
> > > khugepaged behavior.
> > >
> > > This flag will be used (unset) when introducing madvise collapse
> > > context since the desired THP semantics of MADV_COLLAPSE aren't coupled
> > > to sysfs THP settings.  Most notably, for the purpose of eventual
> > > madvise_collapse(2) support, this allows userspace to trigger THP collapse
> > > on behalf of another processes, without adding support to meddle with
> > > the VMA flags of said process, or change sysfs THP settings.
> > >
> > > For now, limit this flag to /sys/kernel/transparent_hugepage/enabled,
> > > but it can be expanded to include
> > > /sys/kernel/transparent_hugepage/shmem_enabled later.
> > >
> > > Link: https://lore.kernel.org/linux-mm/CAAa6QmQxay1_=Pmt8oCX2-Va18t44FV-Vs-WsQt_6+qBks4nZA@mail.gmail.com/
> > >
> > > Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> > > ---
> > >  mm/khugepaged.c | 34 +++++++++++++++++++++++++++-------
> > >  1 file changed, 27 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > index c3589b3e238d..4ad04f552347 100644
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -94,6 +94,11 @@ struct collapse_control {
> > >      */
> > >     bool enforce_page_heuristics;
> > >
> > > +   /* Enforce constraints of
> > > +    * /sys/kernel/mm/transparent_hugepage/enabled
> > > +    */
> > > +   bool enforce_thp_enabled;
> >
> > Small nitpick that we could have merged the two booleans if they always
> > match, but no strong opinions if you think these two are clearer.  Or maybe
> > there's other plan of using them?
> >
> > > +
> > >     /* Num pages scanned per node */
> > >     int node_load[MAX_NUMNODES];
> > >
> > > @@ -893,10 +898,12 @@ static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> > >   */
> > >
> > >  static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> > > -           struct vm_area_struct **vmap)
> > > +                              struct vm_area_struct **vmap,
> > > +                              struct collapse_control *cc)
> > >  {
> > >     struct vm_area_struct *vma;
> > >     unsigned long hstart, hend;
> > > +   unsigned long vma_flags;
> > >
> > >     if (unlikely(khugepaged_test_exit(mm)))
> > >             return SCAN_ANY_PROCESS;
> > > @@ -909,7 +916,18 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> > >     hend = vma->vm_end & HPAGE_PMD_MASK;
> > >     if (address < hstart || address + HPAGE_PMD_SIZE > hend)
> > >             return SCAN_ADDRESS_RANGE;
> > > -   if (!hugepage_vma_check(vma, vma->vm_flags))
> > > +
> > > +   /*
> > > +    * If !cc->enforce_thp_enabled, set VM_HUGEPAGE so that
> > > +    * hugepage_vma_check() can pass even if
> > > +    * TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG is set (i.e. "madvise" mode).
> > > +    * Note that hugepage_vma_check() doesn't enforce that
> > > +    * TRANSPARENT_HUGEPAGE_FLAG or TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG
> > > +    * must be set (i.e. "never" mode).
> > > +    */
> > > +   vma_flags = cc->enforce_thp_enabled ?  vma->vm_flags
> > > +                   : vma->vm_flags | VM_HUGEPAGE;
> >
> > Another nitpick..
> >
> > We could get a weird vm_flags when VM_NOHUGEPAGE is set.  I don't think
> > it'll go wrong since hugepage_vma_check() checks NOHUGEPAGE first, but IMHO
> > we shouldn't rely on that as it seems error prone (e.g. when accidentally
> > moved things around).
> >
> > So maybe nicer to only apply VM_HUGEPAGE if !VM_NOHUGEPAGE?  Or pass over
> > "enforce_thp_enabled" into hugepage_vma_check() should work too, iiuc.
> > Passing in the boolean has one benefit that we don't really need the
> > complicated comment above since the code should be able to explain itself.
> 
> Hey Peter, thanks again for taking the time to review.
> 
> Answering both of the above at the same time:
> 
> As in this series so far, I've tried to keep context functionally-declarative -
> specifying the intended behavior (e.g. "enforce_page_heuristics") rather than
> adding "if (khugepaged) { .. } else if (madv_collapse) { .. } else if { .. }"
> around the code which, IMO, makes it difficult to follow. Unfortunately, I've
> run into the 2 problems you've stated here:
> 
> 1) *Right now* all the behavior knobs are either off/on at the same time
> 2) For hugepage_vma_check() (now in mm/huge_memory.c and acting as the central
>    authority on THP eligibility), things are complicated enough that I
>    couldn't find a clean way to describe the parameters of the context without
>    explicitly mentioning the caller.
> 
> For (2), instead of adding another arg to specify MADV_COLLAPSE's behavior,
> I think we need to package these contexts into a single set of flags:
> 
> enum thp_ctx_flags {
>         THP_CTX_ANON_FAULT              = 1 << 1,
>         THP_CTX_KHUGEPAGED              = 1 << 2,
>         THP_CTX_SMAPS                   = 1 << 3,
>         THP_CTX_MADVISE_COLLAPSE        = 1 << 4,
> };
> 
> That will avoid hacking vma flags passed to hugepage_vma_check().
> 
> And, if we have these anyways, I might as well do away with some of the
> (semantically meaningful but functionally redundant) flags in
> struct collapse_control and just specify a single .thp_ctx_flags member. I'm
> not entirely happy with it - but that's what I'm planning.
> 
> WDYT?

Firstly I think I wrongly sent previous email privately.. :( Let me try to
add the list back..

IMHO we don't need to worry too much on the "if... else if... else",
because they shouldn't be more complicated than when you spread the
meanings into multiple flags, or how could it be? :) IMHO it should
literally be as simple as applying:

  s/enforce_{A|B|C|...}/khugepaged_initiated/g

Throughout the patches, then we squash the patches introducing enforce_X.

If you worry it's not clear on "what does khugepaged_initiated mean", we
could add whatever comment above the variable explaining A/B/C/D will be
covered when this is set, and we could postpone to do the flag split only
until there're real user.

Adding these flags could add unnecessary bit-and instructions into the code
generated at last, and if it's only about readability issue that's really
what comment is for?

Thanks,
Zach O'Keefe June 30, 2022, 2:17 p.m. UTC | #3
On Jun 29 22:32, Peter Xu wrote:
> On Wed, Jun 29, 2022 at 06:42:25PM -0700, Zach O'Keefe wrote:
> > On Jun 29 19:21, Peter Xu wrote:
> > > On Fri, Jun 03, 2022 at 05:39:57PM -0700, Zach O'Keefe wrote:
> > > > Add enforce_thp_enabled flag to struct collapse_control that allows context
> > > > to ignore constraints imposed by /sys/kernel/transparent_hugepage/enabled.
> > > >
> > > > This flag is set in khugepaged collapse context to preserve existing
> > > > khugepaged behavior.
> > > >
> > > > This flag will be used (unset) when introducing madvise collapse
> > > > context since the desired THP semantics of MADV_COLLAPSE aren't coupled
> > > > to sysfs THP settings.  Most notably, for the purpose of eventual
> > > > madvise_collapse(2) support, this allows userspace to trigger THP collapse
> > > > on behalf of another processes, without adding support to meddle with
> > > > the VMA flags of said process, or change sysfs THP settings.
> > > >
> > > > For now, limit this flag to /sys/kernel/transparent_hugepage/enabled,
> > > > but it can be expanded to include
> > > > /sys/kernel/transparent_hugepage/shmem_enabled later.
> > > >
> > > > Link: https://lore.kernel.org/linux-mm/CAAa6QmQxay1_=Pmt8oCX2-Va18t44FV-Vs-WsQt_6+qBks4nZA@mail.gmail.com/
> > > >
> > > > Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> > > > ---
> > > >  mm/khugepaged.c | 34 +++++++++++++++++++++++++++-------
> > > >  1 file changed, 27 insertions(+), 7 deletions(-)
> > > >
> > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > index c3589b3e238d..4ad04f552347 100644
> > > > --- a/mm/khugepaged.c
> > > > +++ b/mm/khugepaged.c
> > > > @@ -94,6 +94,11 @@ struct collapse_control {
> > > >      */
> > > >     bool enforce_page_heuristics;
> > > >
> > > > +   /* Enforce constraints of
> > > > +    * /sys/kernel/mm/transparent_hugepage/enabled
> > > > +    */
> > > > +   bool enforce_thp_enabled;
> > >
> > > Small nitpick that we could have merged the two booleans if they always
> > > match, but no strong opinions if you think these two are clearer.  Or maybe
> > > there's other plan of using them?
> > >
> > > > +
> > > >     /* Num pages scanned per node */
> > > >     int node_load[MAX_NUMNODES];
> > > >
> > > > @@ -893,10 +898,12 @@ static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> > > >   */
> > > >
> > > >  static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> > > > -           struct vm_area_struct **vmap)
> > > > +                              struct vm_area_struct **vmap,
> > > > +                              struct collapse_control *cc)
> > > >  {
> > > >     struct vm_area_struct *vma;
> > > >     unsigned long hstart, hend;
> > > > +   unsigned long vma_flags;
> > > >
> > > >     if (unlikely(khugepaged_test_exit(mm)))
> > > >             return SCAN_ANY_PROCESS;
> > > > @@ -909,7 +916,18 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> > > >     hend = vma->vm_end & HPAGE_PMD_MASK;
> > > >     if (address < hstart || address + HPAGE_PMD_SIZE > hend)
> > > >             return SCAN_ADDRESS_RANGE;
> > > > -   if (!hugepage_vma_check(vma, vma->vm_flags))
> > > > +
> > > > +   /*
> > > > +    * If !cc->enforce_thp_enabled, set VM_HUGEPAGE so that
> > > > +    * hugepage_vma_check() can pass even if
> > > > +    * TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG is set (i.e. "madvise" mode).
> > > > +    * Note that hugepage_vma_check() doesn't enforce that
> > > > +    * TRANSPARENT_HUGEPAGE_FLAG or TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG
> > > > +    * must be set (i.e. "never" mode).
> > > > +    */
> > > > +   vma_flags = cc->enforce_thp_enabled ?  vma->vm_flags
> > > > +                   : vma->vm_flags | VM_HUGEPAGE;
> > >
> > > Another nitpick..
> > >
> > > We could get a weird vm_flags when VM_NOHUGEPAGE is set.  I don't think
> > > it'll go wrong since hugepage_vma_check() checks NOHUGEPAGE first, but IMHO
> > > we shouldn't rely on that as it seems error prone (e.g. when accidentally
> > > moved things around).
> > >
> > > So maybe nicer to only apply VM_HUGEPAGE if !VM_NOHUGEPAGE?  Or pass over
> > > "enforce_thp_enabled" into hugepage_vma_check() should work too, iiuc.
> > > Passing in the boolean has one benefit that we don't really need the
> > > complicated comment above since the code should be able to explain itself.
> > 
> > Hey Peter, thanks again for taking the time to review.
> > 
> > Answering both of the above at the time:
> > 
> > As in this series so far, I've tried to keep context functionally-declarative -
> > specifying the intended behavior (e.g. "enforce_page_heuristics") rather than
> > adding "if (khugepaged) { .. } else if (madv_collapse) { .. } else if { .. }"
> > around the code which, IMO, makes it difficult to follow. Unfortunately, I've
> > ran into the 2 problems you've stated here:
> > 
> > 1) *Right now* all the behavior knobs are either off/on at the same time
> > 2) For hugepage_vma_check() (now in mm/huge_memory.c and acting as the central
> >    authority on THP eligibility), things are complicated enough that I
> >    couldn't find a clean way to describe the parameters of the context without
> >    explicitly mentioning the caller.
> > 
> > For (2), instead of adding another arg to specify MADV_COLLAPSE's behavior,
> > I think we need to package these contexts into a single set of flags:
> > 
> > enum thp_ctx_flags {
> >         THP_CTX_ANON_FAULT              = 1 << 1,
> >         THP_CTX_KHUGEPAGED              = 1 << 2,
> >         THP_CTX_SMAPS                   = 1 << 3,
> >         THP_CTX_MADVISE_COLLAPSE        = 1 << 4,
> > };
> > 
> > That will avoid hacking vma flags passed to hugepage_vma_check().
> > 
> > And, if we have these anyways, I might as well do away with some of the
> > (semantically meaningful but functionally redundant) flags in
> > struct collapse_control and just specify a single .thp_ctx_flags member. I'm
> > not entirely happy with it - but that's what I'm planning.
> > 
> > WDYT?
> 
> Firstly I think I wrongly sent previous email privately.. :( Let me try to
> add the list back..
> 
> IMHO we don't need to worry too much on the "if... else if... else",
> because they shouldn't be more complicated than when you spread the
> meanings into multiple flags, or how could it be? :) IMHO it should
> literally be as simple as applying:
> 
>   s/enforce_{A|B|C|...}/khugepaged_initiated/g
> 
> Throughout the patches, then we squash the patches introducing enforce_X.

Right, the code today will be virtually identical. The attempt to describe
contexts in terms of behaviors is based on an unfounded assumption that
successive contexts could reuse said behaviors - but there are currently no
plans for other collapsing contexts.

> If you worry it's not clear on "what does khugepaged_initiated mean", we
> could add whatever comment above the variable explaining A/B/C/D will be
> covered when this is set, and we could postpone to do the flag split only
> until there're real user.
> 
> Adding these flags could add unnecessary bit-and instructions into the code
> generated at last, and if it's only about readability issue that's really
> what comment is for?

Ya maybe I'm overthinking it and will just do the most straightforward thing for
now.

As always, thanks for taking the time to review / offer suggestions!

Best,
Zach

> Thanks,
> 
> -- 
> Peter Xu
>
diff mbox series

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c3589b3e238d..4ad04f552347 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -94,6 +94,11 @@  struct collapse_control {
 	 */
 	bool enforce_page_heuristics;
 
+	/* Enforce constraints of
+	 * /sys/kernel/mm/transparent_hugepage/enabled
+	 */
+	bool enforce_thp_enabled;
+
 	/* Num pages scanned per node */
 	int node_load[MAX_NUMNODES];
 
@@ -893,10 +898,12 @@  static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
  */
 
 static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
-		struct vm_area_struct **vmap)
+				   struct vm_area_struct **vmap,
+				   struct collapse_control *cc)
 {
 	struct vm_area_struct *vma;
 	unsigned long hstart, hend;
+	unsigned long vma_flags;
 
 	if (unlikely(khugepaged_test_exit(mm)))
 		return SCAN_ANY_PROCESS;
@@ -909,7 +916,18 @@  static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 	hend = vma->vm_end & HPAGE_PMD_MASK;
 	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
 		return SCAN_ADDRESS_RANGE;
-	if (!hugepage_vma_check(vma, vma->vm_flags))
+
+	/*
+	 * If !cc->enforce_thp_enabled, set VM_HUGEPAGE so that
+	 * hugepage_vma_check() can pass even if
+	 * TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG is set (i.e. "madvise" mode).
+	 * Note that hugepage_vma_check() doesn't enforce that
+	 * TRANSPARENT_HUGEPAGE_FLAG or TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG
+	 * must be set (i.e. "never" mode).
+	 */
+	vma_flags = cc->enforce_thp_enabled ?  vma->vm_flags
+			: vma->vm_flags | VM_HUGEPAGE;
+	if (!hugepage_vma_check(vma, vma_flags))
 		return SCAN_VMA_CHECK;
 	/* Anon VMA expected */
 	if (!vma->anon_vma || !vma_is_anonymous(vma))
@@ -953,7 +971,8 @@  static int find_pmd_or_thp_or_none(struct mm_struct *mm,
 static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long haddr, pmd_t *pmd,
-					int referenced)
+					int referenced,
+					struct collapse_control *cc)
 {
 	int swapped_in = 0;
 	vm_fault_t ret = 0;
@@ -980,7 +999,7 @@  static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 		/* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
 		if (ret & VM_FAULT_RETRY) {
 			mmap_read_lock(mm);
-			if (hugepage_vma_revalidate(mm, haddr, &vma)) {
+			if (hugepage_vma_revalidate(mm, haddr, &vma, cc)) {
 				/* vma is no longer available, don't continue to swapin */
 				trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
 				return false;
@@ -1047,7 +1066,7 @@  static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 		goto out_nolock;
 
 	mmap_read_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, &vma);
+	result = hugepage_vma_revalidate(mm, address, &vma, cc);
 	if (result) {
 		mmap_read_unlock(mm);
 		goto out_nolock;
@@ -1066,7 +1085,7 @@  static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 * Continuing to collapse causes inconsistency.
 	 */
 	if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
-						     pmd, referenced)) {
+						     pmd, referenced, cc)) {
 		mmap_read_unlock(mm);
 		goto out_nolock;
 	}
@@ -1078,7 +1097,7 @@  static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 * handled by the anon_vma lock + PG_lock.
 	 */
 	mmap_write_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, &vma);
+	result = hugepage_vma_revalidate(mm, address, &vma, cc);
 	if (result)
 		goto out_up_write;
 	/* check if the pmd is still valid */
@@ -2277,6 +2296,7 @@  static int khugepaged(void *none)
 	struct mm_slot *mm_slot;
 	struct collapse_control cc = {
 		.enforce_page_heuristics = true,
+		.enforce_thp_enabled = true,
 		.last_target_node = NUMA_NO_NODE,
 		/* .gfp set later  */
 	};