
[RFC,12/14] mm/madvise: introduce batched madvise(MADV_COLLAPSE) collapse

Message ID 20220308213417.1407042-13-zokeefe@google.com (mailing list archive)
State New
Series mm: userspace hugepage collapse

Commit Message

Zach O'Keefe March 8, 2022, 9:34 p.m. UTC
Introduce the main batched madvise collapse logic, including the overall
locking strategy.  Individual batched actions, such as scanning pmds in
batch, are stubbed out here and will be added later in the series.

Note that the main benefit of doing all this work in a batched manner is
that __madvise_collapse_pmd_batch() (stubbed out) can be called for the
whole batch under a single mmap_lock write lock.
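
Distilled from the loop added to _madvise_collapse() below, the per-batch
lock sequence is roughly the following (arguments and error handling are
omitted, so this is a sketch rather than the literal code):

        mmap_read_lock(mm);             /* revalidate vma, scan pmds */
        __madvise_collapse_scan_pmd_batch(...);
        mmap_read_unlock(mm);

        /* allocate and charge hugepages with no mmap_lock held */
        __madvise_collapse_prealloc_hpages_batch(...);

        mmap_read_lock(mm);             /* swap in all ranges in the batch */
        __madvise_collapse_swapin_pmd_batch(...);
        mmap_read_unlock(mm);

        mmap_write_lock(mm);            /* collapse the whole batch */
        __madvise_collapse_pmd_batch(...);
        mmap_write_unlock(mm);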

Per-batch data is stored in a struct madvise_collapse_data array, with
an entry for each pmd to collapse, and is shared between the various
*_batch actions.  This allows for partial success of collapsing a range
of pmds - we continue as long as some pmds can be successfully
collapsed.

A "success" here, is where all pmds can be (or already are) collapsed.
On failure, the caller will need to verify what, if any, partial
successes occurred via smaps or otherwise.
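
For illustration only, a minimal userspace sketch of the intended calling
convention (MADV_COLLAPSE's value comes from the series' uapi headers; the
fallback define below is just a placeholder so the sketch stands alone):

        #include <stdio.h>
        #include <string.h>
        #include <sys/mman.h>

        #ifndef MADV_COLLAPSE
        #define MADV_COLLAPSE 25        /* placeholder, not authoritative */
        #endif

        int main(void)
        {
                size_t len = 4UL << 20; /* two pmds on x86-64 */
                char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

                if (p == MAP_FAILED)
                        return 1;
                memset(p, 1, len);      /* fault the range in first */

                if (madvise(p, len, MADV_COLLAPSE)) {
                        /*
                         * Some pmds may still have been collapsed; check
                         * AnonHugePages in /proc/self/smaps to find out.
                         */
                        perror("madvise(MADV_COLLAPSE)");
                        return 2;
                }
                return 0;
        }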

Also note that, where possible, if collapse fails for a particular pmd
after a hugepage has already been allocated, said hugepage is kept on a
per-node free list for the purpose of backing subsequent pmd collapses.
All unused hugepages are returned before _madvise_collapse() returns.

Note that a bisect landing on this patch won't break anything;
madvise(MADV_COLLAPSE) will always return -1.

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
 mm/khugepaged.c | 279 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 273 insertions(+), 6 deletions(-)

Comments

Yang Shi March 10, 2022, 12:06 a.m. UTC | #1
On Tue, Mar 8, 2022 at 1:35 PM Zach O'Keefe <zokeefe@google.com> wrote:
>
> Introduce the main madvise collapse batched logic, including the overall
> locking strategy.  Stubs for individual batched actions, such as
> scanning pmds in batch, have been stubbed out, and will be added later
> in the series.
>
> Note the main benefit from doing all this work in a batched manner is
> that __madvise__collapse_pmd_batch() (stubbed out) can be called inside
> a single mmap_lock write.

I don't get why this is preferred. Isn't it preferable to minimize the
scope of the write mmap_lock? If you batch a large number of PMDs,
MADV_COLLAPSE may hold the write mmap_lock for a long time, and it doesn't
seem like that could scale.

>
> Per-batch data is stored in a struct madvise_collapse_data array, with
> an entry for each pmd to collapse, and is shared between the various
> *_batch actions.  This allows for partial success of collapsing a range
> of pmds - we continue as long as some pmds can be successfully
> collapsed.
>
> A "success" here, is where all pmds can be (or already are) collapsed.
> On failure, the caller will need to verify what, if any, partial
> successes occurred via smaps or otherwise.

And the further question is why you have to batch it at all. My first
guess was that you want to achieve a binary result: either all valid PMDs
get collapsed or no PMD gets collapsed. But it seems partial collapse is
fine, so I don't see why batching is needed.

One side effect, off the top of my head, is that you may preallocate a lot
of huge pages while the later collapse is blocked, for example because it
can't get the mmap_lock or ptl. If the system is under memory pressure,
those preallocated huge pages can't be reclaimed at all.

Could you please elaborate why you didn't go with the non-batch approach?

David Rientjes March 10, 2022, 7:26 p.m. UTC | #2
On Wed, 9 Mar 2022, Yang Shi wrote:

> > Introduce the main madvise collapse batched logic, including the overall
> > locking strategy.  Stubs for individual batched actions, such as
> > scanning pmds in batch, have been stubbed out, and will be added later
> > in the series.
> >
> > Note the main benefit from doing all this work in a batched manner is
> > that __madvise__collapse_pmd_batch() (stubbed out) can be called inside
> > a single mmap_lock write.
> 
> I don't get why this is preferred? Isn't it more preferred to minimize
> the scope of write mmap_lock? Assuming you batch large number of PMDs,
> MADV_COLLAPSE may hold write mmap_lock for a long time, it doesn't
> seem it could scale.
> 

One concern might be the queueing of read locks needed for page faults 
behind a collapser of a long range of memory that is otherwise looping 
and repeatedly taking the write lock.

I think we should be optimizing for minimal impact on concurrent page 
faults, but I don't know the answer without data.  Do you have any ideas, 
as a general rule of thumb, for what would be optimal here: collapsing one 
page at a time, or handling multiple collapses per mmap_lock write so that 
readers aren't constantly getting queued?

The easiest answer would be to not do batching at all and leave the impact 
on readers up to the userspace doing the MADV_COLLAPSE :)  I was wondering, 
however, whether there is a better default behavior we could implement in 
the kernel.
Matthew Wilcox March 10, 2022, 8:16 p.m. UTC | #3
On Thu, Mar 10, 2022 at 11:26:15AM -0800, David Rientjes wrote:
> One concern might be the queueing of read locks needed for page faults 
> behind a collapser of a long range of memory that is otherwise looping 
> and repeatedly taking the write lock.

I would have thought that _not_ batching would improve this situation.
Unless our implementation of rwsems has changed since the last time I
looked, dropping-and-reacquiring a rwsem while there are pending readers
means you go to the end of the line and they all get to handle their
page faults.
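
Illustrating the trade-off under discussion with pseudo-kernel code (not
from the patch; collapse_one_pmd() is just a stand-in for the per-pmd
work):

        /* Batched: pending readers wait until the whole batch is done. */
        mmap_write_lock(mm);
        for (i = 0; i < batch_size; i++)
                collapse_one_pmd(mm, addr + i * HPAGE_PMD_SIZE);
        mmap_write_unlock(mm);

        /* Unbatched: with a fair rwsem, queued readers run every time the
         * lock is dropped, and the collapser requeues behind them. */
        for (i = 0; i < batch_size; i++) {
                mmap_write_lock(mm);
                collapse_one_pmd(mm, addr + i * HPAGE_PMD_SIZE);
                mmap_write_unlock(mm);
        }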
Zach O'Keefe March 11, 2022, 12:06 a.m. UTC | #4
On Thu, Mar 10, 2022 at 12:17 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Thu, Mar 10, 2022 at 11:26:15AM -0800, David Rientjes wrote:
> > One concern might be the queueing of read locks needed for page faults
> > behind a collapser of a long range of memory that is otherwise looping
> > and repeatedly taking the write lock.
>
> I would have thought that _not_ batching would improve this situation.
> Unless our implementation of rwsems has changed since the last time I
> looked, dropping-and-reacquiring a rwsem while there are pending readers
> means you go to the end of the line and they all get to handle their
> page faults.
>

Hey Matthew, thanks for the review / feedback.

I don't have great intuition here, so I'll try to put together a
simple synthetic test to get some data. Though the code would be
different, I can functionally approximate a non-batched approach with
a batch size of 1, and compare that against N.

My file-backed patches likewise weren't able to take advantage of
batching outside mmap lock contention, so the data should equally
apply there.
Zach O'Keefe March 25, 2022, 4:51 p.m. UTC | #5
Hey All,

Sorry for the delay. So, I ran some synthetic tests on a dual socket
Skylake with configured batch sizes of 1, 8, 32, and 64. Basic setup
was: 1 thread continuously madvise(MADV_COLLAPSE)'ing memory, 20
threads continuously faulting-in pages, and some basic synchronization
so that all threads follow an "only do work when all other threads have
work to do" model (i.e. so we don't measure faults in the absence of
simultaneous collapses, or vice versa). I used bpftrace attached to
tracepoint:mmap_lock to measure r/w mmap_lock contention over 20
minutes.
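
(The exact harness isn't included here; roughly, something along the lines
below could reproduce the setup. The 1G region size, the MADV_DONTNEED
recycling in the fault threads, and dropping the "only do work when
everyone has work" synchronization are all simplifications/assumptions on
top of what's described above, and MADV_COLLAPSE's value is a placeholder.
The mmap_lock measurements themselves came from bpftrace, attached
externally to this process.)

        #include <pthread.h>
        #include <string.h>
        #include <sys/mman.h>
        #include <unistd.h>

        #ifndef MADV_COLLAPSE
        #define MADV_COLLAPSE 25                /* placeholder */
        #endif

        #define NR_FAULT_THREADS 20
        #define REGION_SIZE (1UL << 30)

        static char *region;

        static void *fault_thread(void *arg)
        {
                for (;;) {
                        /* touch every page, then drop it so the next pass
                         * faults again and takes mmap_lock in read */
                        for (size_t off = 0; off < REGION_SIZE; off += 4096)
                                region[off] = 1;
                        madvise(region, REGION_SIZE, MADV_DONTNEED);
                }
                return NULL;
        }

        static void *collapse_thread(void *arg)
        {
                for (;;)
                        madvise(region, REGION_SIZE, MADV_COLLAPSE);
                return NULL;
        }

        int main(void)
        {
                pthread_t tid;
                int i;

                region = mmap(NULL, REGION_SIZE, PROT_READ | PROT_WRITE,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                if (region == MAP_FAILED)
                        return 1;

                pthread_create(&tid, NULL, collapse_thread, NULL);
                for (i = 0; i < NR_FAULT_THREADS; i++)
                        pthread_create(&tid, NULL, fault_thread, NULL);

                sleep(20 * 60);         /* bpftrace runs alongside */
                return 0;
        }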

Assuming we want to optimize for fault-path readers, the results are
pretty clear: BATCH-1 outperforms BATCH-8, BATCH-32, and BATCH-64 by
254%, 381%, and 425% respectively, in terms of mean time for
fault-threads to acquire mmap_lock in read, while also having less
tail latency (didn't calculate, just looked at bpftrace histograms).
If we cared at all about madvise(MADV_COLLAPSE) performance, then
BATCH-1 is 83-86% as fast as the others and holds mmap_lock in write
for about the same amount of time in aggregate (~0 +/- 2%).
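
(For reference, those percentages follow directly from the average
acquisition times in the histograms below: 1156055 / 326480 ≈ 3.54, i.e.
the BATCH-8 mean is ~254% higher than BATCH-1's; likewise 1571585 / 326480
≈ 4.81 gives ~381%, and 1715797 / 326480 ≈ 5.26 gives ~425%.)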

I've included the bpftrace histograms for fault-threads acquiring
mmap_lock in read at the end for posterity, and can provide more data
/ info if folks are interested.

In light of these results, I'll rework the code to iteratively operate
on single hugepages, which should have the added benefit of
considerably simplifying the code for an imminent V1 series.

Thanks,
Zach

bpftrace data:

/*****************************************************************************/
batch size: 1

@mmap_lock_r_acquire[fault-thread]:
[128, 256)          1254 |                                                    |
[256, 512)       2691261 |@@@@@@@@@@@@@@@@@                                   |
[512, 1K)        2969500 |@@@@@@@@@@@@@@@@@@@                                 |
[1K, 2K)         1794738 |@@@@@@@@@@@                                         |
[2K, 4K)         1590984 |@@@@@@@@@@                                          |
[4K, 8K)         3273349 |@@@@@@@@@@@@@@@@@@@@@                               |
[8K, 16K)         851467 |@@@@@                                               |
[16K, 32K)        460653 |@@                                                  |
[32K, 64K)          7274 |                                                    |
[64K, 128K)           25 |                                                    |
[128K, 256K)           0 |                                                    |
[256K, 512K)           0 |                                                    |
[512K, 1M)       8085437 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1M, 2M)          381735 |@@                                                  |
[2M, 4M)              28 |                                                    |

@mmap_lock_r_acquire_stat[fault-thread]: count 22107705, average
326480, total 7217743234867

/*****************************************************************************/
batch size: 8

@mmap_lock_r_acquire[fault-thread]:
[128, 256)            55 |                                                    |
[256, 512)        247028 |@@@@@@                                              |
[512, 1K)         239083 |@@@@@@                                              |
[1K, 2K)          142296 |@@@                                                 |
[2K, 4K)          153149 |@@@@                                                |
[4K, 8K)         1899396 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[8K, 16K)        1780734 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@    |
[16K, 32K)         95645 |@@                                                  |
[32K, 64K)          1933 |                                                    |
[64K, 128K)            3 |                                                    |
[128K, 256K)           0 |                                                    |
[256K, 512K)           0 |                                                    |
[512K, 1M)             0 |                                                    |
[1M, 2M)               0 |                                                    |
[2M, 4M)               0 |                                                    |
[4M, 8M)         1132899 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                     |
[8M, 16M)           3953 |                                                    |

@mmap_lock_r_acquire_stat[fault-thread]: count 5696174, average
1156055, total 6585091744973

/*****************************************************************************/
batch size: 32

@mmap_lock_r_acquire[fault-thread]:
[128, 256)            35 |                                                    |
[256, 512)         63413 |@                                                   |
[512, 1K)          78130 |@                                                   |
[1K, 2K)           39548 |                                                    |
[2K, 4K)           44331 |                                                    |
[4K, 8K)         2398751 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[8K, 16K)        1316932 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@                        |
[16K, 32K)         54798 |@                                                   |
[32K, 64K)           771 |                                                    |
[64K, 128K)            2 |                                                    |
[128K, 256K)           0 |                                                    |
[256K, 512K)           0 |                                                    |
[512K, 1M)             0 |                                                    |
[1M, 2M)               0 |                                                    |
[2M, 4M)               0 |                                                    |
[4M, 8M)               0 |                                                    |
[8M, 16M)              0 |                                                    |
[16M, 32M)        280791 |@@@@@@                                              |
[32M, 64M)           809 |                                                    |

@mmap_lock_r_acquire_stat[fault-thread]: count 4278311, average
1571585, total 6723733081824

/*****************************************************************************/
batch size: 64

@mmap_lock_r_acquire[fault-thread]:
[256, 512)         30303 |                                                    |
[512, 1K)          42366 |@                                                   |
[1K, 2K)           23679 |                                                    |
[2K, 4K)           22781 |                                                    |
[4K, 8K)         1637566 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@         |
[8K, 16K)        1955773 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[16K, 32K)         41832 |@                                                   |
[32K, 64K)           563 |                                                    |
[64K, 128K)            0 |                                                    |
[128K, 256K)           0 |                                                    |
[256K, 512K)           0 |                                                    |
[512K, 1M)             0 |                                                    |
[1M, 2M)               0 |                                                    |
[2M, 4M)               0 |                                                    |
[4M, 8M)               0 |                                                    |
[8M, 16M)              0 |                                                    |
[16M, 32M)             0 |                                                    |
[32M, 64M)        140723 |@@@                                                 |
[64M, 128M)           77 |                                                    |

@mmap_lock_r_acquire_stat[fault-thread]: count 3895663, average
1715797, total 6684170171691

Yang Shi March 25, 2022, 7:54 p.m. UTC | #6
On Fri, Mar 25, 2022 at 9:51 AM Zach O'Keefe <zokeefe@google.com> wrote:
>
> Hey All,
>
> Sorry for the delay. So, I ran some synthetic tests on a dual socket
> Skylake with configured batch sizes of 1, 8, 32, and 64. Basic setup
> was: 1 thread continuously madvise(MADV_COLLAPSE)'ing memory, 20
> threads continuously faulting-in pages, and some basic synchronization
> so that all threads follow a "only do work when all other threads have
> work to do" model (i.e. so we don't measure faults in the absence of
> simultaneous collapses, or vice versa). I used bpftrace attached to
> tracepoint:mmap_lock to measure r/w mmap_lock contention over 20
> minutes.
>
> Assuming we want to optimize for fault-path readers, the results are
> pretty clear: BATCH-1 outperforms BATCH-8, BATCH-32, and BATCH-64 by
> 254%, 381%, and 425% respectively, in terms of mean time for
> fault-threads to acquire mmap_lock in read, while also having less
> tail latency (didn't calculate, just looked at bpftrace histograms).
> If we cared at all about madvise(MADV_COLLAPSE) performance, then
> BATCH-1 is 83-86% as fast as the others and holds mmap_lock in write
> for about the same amount of time in aggregate (~0 +/- 2%).
>
> I've included the bpftrace histograms for fault-threads acquiring
> mmap_lock in read at the end for posterity, and can provide more data
> / info if folks are interested.
>
> In light of these results, I'll rework the code to iteratively operate
> on single hugepages, which should have the added benefit of
> considerably simplifying the code for an eminent V1 series.

Thanks for the data. Yeah, I agree this is the best tradeoff.


Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ca1e523086ed..ea53c706602e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -86,6 +86,9 @@  static struct kmem_cache *mm_slot_cache __read_mostly;
 #define MAX_PTE_MAPPED_THP 8
 
 struct collapse_control {
+	/* Used by MADV_COLLAPSE batch collapse */
+	struct list_head free_hpages[MAX_NUMNODES];
+
 	/* Respect khugepaged_max_ptes_[none|swap|shared] */
 	bool enforce_pte_scan_limits;
 
@@ -99,8 +102,13 @@  struct collapse_control {
 static void collapse_control_init(struct collapse_control *cc,
 				  bool enforce_pte_scan_limits)
 {
+	int i;
+
 	cc->enforce_pte_scan_limits = enforce_pte_scan_limits;
 	cc->last_target_node = NUMA_NO_NODE;
+
+	for (i = 0; i < MAX_NUMNODES; ++i)
+		INIT_LIST_HEAD(cc->free_hpages + i);
 }
 
 /**
@@ -1033,7 +1041,7 @@  static pmd_t *find_pmd_or_thp_or_none(struct mm_struct *mm,
 	/* See comments in pmd_none_or_trans_huge_or_clear_bad() */
 	barrier();
 #endif
-	if (!pmd_present(pmde) || !pmd_none(pmde)) {
+	if (!pmd_present(pmde) || pmd_none(pmde)) {
 		*result = SCAN_PMD_NULL;
 		return NULL;
 	} else if (pmd_trans_huge(pmde)) {
@@ -1054,12 +1062,16 @@  static pmd_t *find_pmd_or_thp_or_none(struct mm_struct *mm,
 static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long haddr, pmd_t *pmd,
-					int referenced)
+					int referenced,
+					unsigned long vm_flags_ignored,
+					bool *mmap_lock_dropped)
 {
 	int swapped_in = 0;
 	vm_fault_t ret = 0;
 	unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
 
+	if (mmap_lock_dropped)
+		*mmap_lock_dropped = false;
 	for (address = haddr; address < end; address += PAGE_SIZE) {
 		struct vm_fault vmf = {
 			.vma = vma,
@@ -1080,8 +1092,10 @@  static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 
 		/* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
 		if (ret & VM_FAULT_RETRY) {
+			if (mmap_lock_dropped)
+				*mmap_lock_dropped = true;
 			mmap_read_lock(mm);
-			if (hugepage_vma_revalidate(mm, haddr, VM_NONE, &vma)) {
+			if (hugepage_vma_revalidate(mm, haddr, vm_flags_ignored, &vma)) {
 				/* vma is no longer available, don't continue to swapin */
 				trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
 				return false;
@@ -1256,7 +1270,8 @@  static void khugepaged_collapse_huge_page(struct mm_struct *mm,
 	 * Continuing to collapse causes inconsistency.
 	 */
 	if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
-						     pmd, referenced)) {
+						     pmd, referenced, VM_NONE,
+						     NULL)) {
 		mmap_read_unlock(mm);
 		goto out_nolock;
 	}
@@ -2520,6 +2535,128 @@  void khugepaged_min_free_kbytes_update(void)
 	mutex_unlock(&khugepaged_mutex);
 }
 
+struct madvise_collapse_data {
+	struct page *hpage; /* Preallocated THP */
+	bool continue_collapse;  /* Should we attempt / continue collapse? */
+
+	struct scan_pmd_result scan_result;
+	pmd_t *pmd;
+};
+
+static int
+madvise_collapse_vma_revalidate_pmd_count(struct mm_struct *mm,
+					  unsigned long address, int nr,
+					  struct vm_area_struct **vmap)
+{
+	/* madvise_collapse() ignores MADV_NOHUGEPAGE */
+	return hugepage_vma_revalidate_pmd_count(mm, address, nr, VM_NOHUGEPAGE,
+			vmap);
+}
+
+/*
+ * Scan pmd to see which we can collapse, and to determine node to allocate on.
+ *
+ * Must be called with mmap_lock in read, and returns with the lock held in
+ * read. Does not drop the lock.
+ *
+ * Set batch_data[i]->continue_collapse to false for any pmd that can't be
+ * collapsed.
+ *
+ * Return the number of existing THPs in batch.
+ */
+static int
+__madvise_collapse_scan_pmd_batch(struct mm_struct *mm,
+				  struct vm_area_struct *vma,
+				  unsigned long batch_start,
+				  struct madvise_collapse_data *batch_data,
+				  int batch_size,
+				  struct collapse_control *cc)
+{
+	/* Implemented in later patch */
+	return 0;
+}
+
+/*
+ * Preallocate and charge huge page for each pmd in the batch, store the
+ * new page in batch_data[i]->hpage.
+ *
+ * Return the number of huge pages allocated.
+ */
+static int
+__madvise_collapse_prealloc_hpages_batch(struct mm_struct *mm,
+					 gfp_t gfp,
+					 int node,
+					 struct madvise_collapse_data *batch_data,
+					 int batch_size,
+					 struct collapse_control *cc)
+{
+	/* Implemented in later patch */
+	return 0;
+}
+
+/*
+ * Do swapin for all ranges in batch, returns true iff successful.
+ *
+ * Called with mmap_lock held in read, and returns with it held in read.
+ * Might drop the lock.
+ *
+ * Set batch_data[i]->continue_collapse to false for any pmd that can't be
+ * collapsed. Else, set batch_data[i]->pmd to the found pmd.
+ */
+static bool
+__madvise_collapse_swapin_pmd_batch(struct mm_struct *mm,
+				    int node,
+				    unsigned long batch_start,
+				    struct madvise_collapse_data *batch_data,
+				    int batch_size,
+				    struct collapse_control *cc)
+
+{
+	/* Implemented in later patch */
+	return true;
+}
+
+/*
+ * Do the collapse operation. Return number of THPs collapsed successfully.
+ *
+ * Called with mmap_lock held in write, and returns with it held. Does not
+ * drop the lock.
+ */
+static int
+__madvise_collapse_pmd_batch(struct mm_struct *mm,
+			     unsigned long batch_start,
+			     int batch_size,
+			     struct madvise_collapse_data *batch_data,
+			     int node,
+			     struct collapse_control *cc)
+{
+	/* Implemented in later patch */
+	return 0;
+}
+
+static bool continue_collapse(struct madvise_collapse_data *batch_data,
+			      int batch_size)
+{
+	int i;
+
+	for (i = 0; i < batch_size; ++i)
+		if (batch_data[i].continue_collapse)
+			return true;
+	return false;
+}
+
+static bool madvise_transparent_hugepage_enabled(struct vm_area_struct *vma)
+{
+	if (vma_is_anonymous(vma))
+		/* madvise_collapse() ignores MADV_NOHUGEPAGE */
+		return __transparent_hugepage_enabled(vma, vma->vm_flags &
+						      ~VM_NOHUGEPAGE);
+	/* TODO: Support file-backed memory */
+	return false;
+}
+
+#define MADVISE_COLLAPSE_BATCH_SIZE 8
+
 /*
  * Returns 0 if successfully able to collapse range into THPs (or range already
  * backed by THPs). Due to implementation detail, THPs collapsed here may be
@@ -2532,8 +2669,138 @@  static int _madvise_collapse(struct mm_struct *mm,
 			     unsigned long end, gfp_t gfp,
 			     struct collapse_control *cc)
 {
-	/* Implemented in later patch */
-	return -ENOSYS;
+	unsigned long hstart, hend, batch_addr;
+	int ret = -EINVAL, collapsed = 0, nr_hpages = 0, i;
+	struct madvise_collapse_data batch_data[MADVISE_COLLAPSE_BATCH_SIZE];
+
+	mmap_assert_locked(mm);
+	BUG_ON(vma->vm_start > start);
+	BUG_ON(vma->vm_end < end);
+	VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
+
+	hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+	hend = end & HPAGE_PMD_MASK;
+	nr_hpages = (hend - hstart) >> HPAGE_PMD_SHIFT;
+	if (hstart >= hend)
+		goto out;
+
+	if (!madvise_transparent_hugepage_enabled(vma))
+		goto out;
+
+	/*
+	 * Request might cover multiple hugepages. Strategy is to batch
+	 * allocation and collapse operations so that we do more work while
+	 * mmap_lock is held exclusively.
+	 *
+	 * While processing batch, mmap_lock is locked/unlocked many times for
+	 * the supplied VMA. It's possible that the original VMA is split while
+	 * lock was dropped. If in the context of the (possibly new) VMA, THP
+	 * collapse is possible, we continue.
+	 */
+	for (batch_addr = hstart;
+	     batch_addr < hend;
+	     batch_addr += HPAGE_PMD_SIZE * MADVISE_COLLAPSE_BATCH_SIZE) {
+		int node, batch_size;
+		int thps; /* Number of existing THPs in range */
+
+		batch_size = (hend - batch_addr) >> HPAGE_PMD_SHIFT;
+		batch_size = min_t(int, batch_size,
+				   MADVISE_COLLAPSE_BATCH_SIZE);
+
+		BUG_ON(batch_size <= 0);
+		memset(batch_data, 0, sizeof(batch_data));
+		cond_resched();
+		VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
+
+		/*
+		 * If first batch, we still hold mmap_lock from madvise
+		 * call and haven't dropped it since checking the VMA. Else,
+		 * we've dropped the lock and we need to revalidate.
+		 */
+		if (batch_addr != hstart) {
+			mmap_read_lock(mm);
+			if (madvise_collapse_vma_revalidate_pmd_count(mm,
+								      batch_addr,
+								      batch_size,
+								      &vma))
+				goto loop_unlock_break;
+		}
+
+		mmap_assert_locked(mm);
+
+		thps = __madvise_collapse_scan_pmd_batch(mm, vma, batch_addr,
+							 batch_data, batch_size,
+							 cc);
+		mmap_read_unlock(mm);
+
+		/* Count existing THPs as-if we collapsed them */
+		collapsed += thps;
+		if (thps == batch_size || !continue_collapse(batch_data,
+							     batch_size))
+			continue;
+
+		node = find_target_node(cc);
+		if (!__madvise_collapse_prealloc_hpages_batch(mm, gfp, node,
+							      batch_data,
+							      batch_size, cc)) {
+			/* No more THPs available - so give up */
+			ret = -ENOMEM;
+			break;
+		}
+
+		mmap_read_lock(mm);
+		if (!__madvise_collapse_swapin_pmd_batch(mm, node, batch_addr,
+							 batch_data, batch_size,
+							 cc))
+			goto loop_unlock_break;
+		mmap_read_unlock(mm);
+		mmap_write_lock(mm);
+		collapsed += __madvise_collapse_pmd_batch(mm,
+				batch_addr, batch_size, batch_data,
+				node, cc);
+		mmap_write_unlock(mm);
+
+		for (i = 0; i < batch_size; ++i) {
+			struct page *page = batch_data[i].hpage;
+
+			if (page && !IS_ERR(page)) {
+				list_add_tail(&page->lru,
+					      &cc->free_hpages[node]);
+				batch_data[i].hpage = NULL;
+			}
+		}
+		/* mmap_lock is unlocked here */
+		continue;
+loop_unlock_break:
+		mmap_read_unlock(mm);
+		break;
+	}
+	/* mmap_lock is unlocked here */
+
+	for (i = 0; i < MADVISE_COLLAPSE_BATCH_SIZE; ++i) {
+		struct page *page = batch_data[i].hpage;
+
+		if (page && !IS_ERR(page)) {
+			mem_cgroup_uncharge(page_folio(page));
+			put_page(page);
+		}
+	}
+	for (i = 0; i < MAX_NUMNODES; ++i) {
+		struct page *page, *tmp;
+
+		list_for_each_entry_safe(page, tmp, cc->free_hpages + i, lru) {
+			list_del(&page->lru);
+			mem_cgroup_uncharge(page_folio(page));
+			put_page(page);
+		}
+	}
+	ret = collapsed == nr_hpages ? 0 : -1;
+	vma = NULL;		/* tell sys_madvise we dropped mmap_lock */
+	mmap_read_lock(mm);	/* sys_madvise expects us to have mmap_lock */
+out:
+	*prev = vma;		/* we didn't drop mmap_lock, so this holds */
+
+	return ret;
 }
 
 int madvise_collapse(struct vm_area_struct *vma,