Message ID | 20240102175338.62012-9-ryncsn@gmail.com (mailing list archive) |
---|---|
State | New |
Series | swapin refactor for optimization and unified readahead |
Kairui Song <ryncsn@gmail.com> writes:

> From: Kairui Song <kasong@tencent.com>
>
> There are two places where swapin is not caused by a direct anon page fault:
> - shmem swapin, invoked indirectly through the shmem mapping
> - swapoff
>
> They used to construct a pseudo vmfault struct for the swapin function.
> Shmem dropped the pseudo vmfault recently in commit ddc1a5cbc05d
> ("mempolicy: alloc_pages_mpol() for NUMA policy without vma"). The swapoff
> path is still using one.
>
> Introduce a helper for them both; this helps save stack usage on the
> swapoff path, and helps apply a unified swapin cache and readahead policy
> check.
>
> Due to the missing vmfault info, the caller has to pass in the mempolicy
> explicitly, which makes it different from swapin_entry, so name it
> swapin_entry_mpol.
>
> This commit converts swapoff to use this helper; follow-up commits will
> convert shmem to use it too.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>  mm/swap.h       |  9 +++++++++
>  mm/swap_state.c | 40 ++++++++++++++++++++++++++++++++--------
>  mm/swapfile.c   | 15 ++++++---------
>  3 files changed, 47 insertions(+), 17 deletions(-)
>
> diff --git a/mm/swap.h b/mm/swap.h
> index 9180411afcfe..8f790a67b948 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -73,6 +73,9 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
>  			struct mempolicy *mpol, pgoff_t ilx);
>  struct folio *swapin_entry(swp_entry_t entry, gfp_t flag,
>  			struct vm_fault *vmf, enum swap_cache_result *result);
> +struct folio *swapin_entry_mpol(swp_entry_t entry, gfp_t gfp_mask,
> +			struct mempolicy *mpol, pgoff_t ilx,
> +			enum swap_cache_result *result);
>
>  static inline unsigned int folio_swap_flags(struct folio *folio)
>  {
> @@ -109,6 +112,12 @@ static inline struct folio *swapin_entry(swp_entry_t swp, gfp_t gfp_mask,
>  	return NULL;
>  }
>
> +static inline struct page *swapin_entry_mpol(swp_entry_t entry, gfp_t gfp_mask,
> +		struct mempolicy *mpol, pgoff_t ilx, enum swap_cache_result *result)
> +{
> +	return NULL;
> +}
> +
>  static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
>  {
>  	return 0;
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 21badd4f0fc7..3edf4b63158d 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -880,14 +880,13 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
>   * in.
>   */
>  static struct folio *swapin_direct(swp_entry_t entry, gfp_t gfp_mask,
> -			struct vm_fault *vmf, void *shadow)
> +			struct mempolicy *mpol, pgoff_t ilx,
> +			void *shadow)
>  {
> -	struct vm_area_struct *vma = vmf->vma;
>  	struct folio *folio;
>
> -	/* skip swapcache */
> -	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
> -				vma, vmf->address, false);
> +	folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0,
> +				mpol, ilx, numa_node_id());
>  	if (folio) {
>  		if (mem_cgroup_swapin_charge_folio(folio, NULL,
>  						   GFP_KERNEL, entry)) {
> @@ -943,18 +942,18 @@ struct folio *swapin_entry(swp_entry_t entry, gfp_t gfp_mask,
>  		goto done;
>  	}
>
> +	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
>  	if (swap_use_no_readahead(swp_swap_info(entry), entry)) {
> -		folio = swapin_direct(entry, gfp_mask, vmf, shadow);
> +		folio = swapin_direct(entry, gfp_mask, mpol, ilx, shadow);
>  		cache_result = SWAP_CACHE_BYPASS;
>  	} else {
> -		mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
>  		if (swap_use_vma_readahead())
>  			folio = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
>  		else
>  			folio = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
> -		mpol_cond_put(mpol);
>  		cache_result = SWAP_CACHE_MISS;
>  	}
> +	mpol_cond_put(mpol);
>  done:
>  	if (result)
>  		*result = cache_result;
> @@ -962,6 +961,31 @@ struct folio *swapin_entry(swp_entry_t entry, gfp_t gfp_mask,
>  	return folio;
>  }
>
> +struct folio *swapin_entry_mpol(swp_entry_t entry, gfp_t gfp_mask,
> +		struct mempolicy *mpol, pgoff_t ilx,
> +		enum swap_cache_result *result)
> +{
> +	enum swap_cache_result cache_result;
> +	void *shadow = NULL;
> +	struct folio *folio;
> +
> +	folio = swap_cache_get_folio(entry, NULL, 0, &shadow);
> +	if (folio) {
> +		cache_result = SWAP_CACHE_HIT;
> +	} else if (swap_use_no_readahead(swp_swap_info(entry), entry)) {
> +		folio = swapin_direct(entry, gfp_mask, mpol, ilx, shadow);
> +		cache_result = SWAP_CACHE_BYPASS;
> +	} else {
> +		folio = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
> +		cache_result = SWAP_CACHE_MISS;
> +	}
> +
> +	if (result)
> +		*result = cache_result;
> +
> +	return folio;
> +}
> +
>  #ifdef CONFIG_SYSFS
>  static ssize_t vma_ra_enabled_show(struct kobject *kobj,
>  				   struct kobj_attribute *attr, char *buf)
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 5aa44de11edc..2f77bf143af8 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1840,18 +1840,13 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>  	do {
>  		struct folio *folio;
>  		unsigned long offset;
> +		struct mempolicy *mpol;
>  		unsigned char swp_count;
>  		swp_entry_t entry;
> +		pgoff_t ilx;
>  		int ret;
>  		pte_t ptent;
>
> -		struct vm_fault vmf = {
> -			.vma = vma,
> -			.address = addr,
> -			.real_address = addr,
> -			.pmd = pmd,
> -		};
> -
>  		if (!pte++) {
>  			pte = pte_offset_map(pmd, addr);
>  			if (!pte)
> @@ -1871,8 +1866,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>  		pte_unmap(pte);
>  		pte = NULL;
>
> -		folio = swapin_entry(entry, GFP_HIGHUSER_MOVABLE,
> -					&vmf, NULL);
> +		mpol = get_vma_policy(vma, addr, 0, &ilx);
> +		folio = swapin_entry_mpol(entry, GFP_HIGHUSER_MOVABLE,
> +					  mpol, ilx, NULL);
> +		mpol_cond_put(mpol);
>  		if (!folio) {
>  			/*
>  			 * The entry could have been freed, and will not

IIUC, after the change, we will always use cluster readahead for
swapoff. This may be OK. But, at least we need some test results which
show that this will not cause any issue for this behavior change. And
the behavior change should be described explicitly in the patch
description.

And I don't think it's a good abstraction to make swapin_entry_mpol()
always use cluster swapin, while swapin_entry() will try to use vma
swapin. I think we can add "struct mempolicy *mpol" and "pgoff_t ilx"
to swapin_entry() as parameters, and use them if vmf == NULL. If we
want to enforce cluster swapin in the swapoff path, it will be better
to add some comments to describe why.

--
Best Regards,
Huang, Ying
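For illustration, here is a rough sketch of the alternative interface
suggested above. It is not the posted patch: it reuses the helpers
visible in the diff (swapin_direct, swap_use_no_readahead,
swap_vma_readahead, swap_cluster_readahead), and how the swap cache
lookup treats the vmf == NULL case is an assumption.

struct folio *swapin_entry(swp_entry_t entry, gfp_t gfp_mask,
			   struct mempolicy *mpol, pgoff_t ilx,
			   struct vm_fault *vmf,
			   enum swap_cache_result *result)
{
	enum swap_cache_result cache_result;
	void *shadow = NULL;
	struct folio *folio;

	/* Look up the swap cache first; vmf is optional (NULL for swapoff). */
	folio = swap_cache_get_folio(entry, vmf ? vmf->vma : NULL,
				     vmf ? vmf->address : 0, &shadow);
	if (folio) {
		cache_result = SWAP_CACHE_HIT;
	} else if (swap_use_no_readahead(swp_swap_info(entry), entry)) {
		/* Synchronous IO device: bypass the swap cache and readahead. */
		folio = swapin_direct(entry, gfp_mask, mpol, ilx, shadow);
		cache_result = SWAP_CACHE_BYPASS;
	} else if (vmf && swap_use_vma_readahead()) {
		/* Fault path with VMA readahead enabled. */
		folio = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
		cache_result = SWAP_CACHE_MISS;
	} else {
		/* No vmf (swapoff, shmem) or VMA readahead disabled. */
		folio = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
		cache_result = SWAP_CACHE_MISS;
	}

	if (result)
		*result = cache_result;

	return folio;
}

With this shape, the swapoff path could call swapin_entry(entry,
GFP_HIGHUSER_MOVABLE, mpol, ilx, NULL, NULL) and drop the pseudo
vm_fault struct without a separate _mpol helper.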
On Tue, Jan 9, 2024 at 09:11, Huang, Ying <ying.huang@intel.com> wrote:
>
> IIUC, after the change, we will always use cluster readahead for
> swapoff. This may be OK. But, at least we need some test results which
> show that this will not cause any issue for this behavior change. And
> the behavior change should be described explicitly in the patch
> description.

Hi Ying,

Actually there is a swap_use_no_readahead check in swapin_entry_mpol,
so when readahead is not needed (SYNC_IO), it's just skipped.

And I think VMA readahead is not helpful for swapoff: swapoff is
already walking the VMA, mostly uninterrupted, in kernel space. With or
without VMA readahead, it will issue IO page by page.
The benchmark result I posted before is actually VMA readahead vs
no-readahead for ZRAM, sorry I didn't make that clear. It's obvious
no-readahead is faster.

For an actual block device, cluster readahead might be a good choice
for swapoff, since all pages will be read for swapoff anyway; there has
to be enough memory for all swapcached pages to stay in memory or
swapoff will fail regardless, and cluster reads are faster on block
devices.

> And I don't think it's a good abstraction to make swapin_entry_mpol()
> always use cluster swapin, while swapin_entry() will try to use vma
> swapin. I think we can add "struct mempolicy *mpol" and "pgoff_t ilx"
> to swapin_entry() as parameters, and use them if vmf == NULL. If we
> want to enforce cluster swapin in the swapoff path, it will be better
> to add some comments to describe why.

Good suggestion. I thought extending swapin_entry might make its
argument list too long, but it seems mpol and ilx are the only things
needed here. I'll update it.
Kairui Song <ryncsn@gmail.com> writes:

> Hi Ying,
>
> Actually there is a swap_use_no_readahead check in swapin_entry_mpol,
> so when readahead is not needed (SYNC_IO), it's just skipped.
>
> And I think VMA readahead is not helpful for swapoff: swapoff is
> already walking the VMA, mostly uninterrupted, in kernel space. With or
> without VMA readahead, it will issue IO page by page.
> The benchmark result I posted before is actually VMA readahead vs
> no-readahead for ZRAM, sorry I didn't make that clear. It's obvious
> no-readahead is faster.
>
> For an actual block device, cluster readahead might be a good choice
> for swapoff, since all pages will be read for swapoff anyway; there has
> to be enough memory for all swapcached pages to stay in memory or
> swapoff will fail regardless, and cluster reads are faster on block
> devices.

It's possible. But please run the tests on some actual block devices
and show your results. A random memory access pattern should be tested,
and the swap space usage should be > 50% to show a not-so-friendly
situation.

>> And I don't think it's a good abstraction to make swapin_entry_mpol()
>> always use cluster swapin, while swapin_entry() will try to use vma
>> swapin. I think we can add "struct mempolicy *mpol" and "pgoff_t ilx"
>> to swapin_entry() as parameters, and use them if vmf == NULL. If we
>> want to enforce cluster swapin in the swapoff path, it will be better
>> to add some comments to describe why.
>
> Good suggestion. I thought extending swapin_entry might make its
> argument list too long, but it seems mpol and ilx are the only things
> needed here. I'll update it.

Thanks!

--
Best Regards,
Huang, Ying
On Mon, Jan 15, 2024 at 9:54 AM Huang, Ying <ying.huang@intel.com> wrote:
>
> It's possible. But please run the tests on some actual block devices
> and show your results. A random memory access pattern should be
> tested, and the swap space usage should be > 50% to show a
> not-so-friendly situation.

Hi Ying,

I set up a test environment and ran the following test, and found that
cluster readahead for swapoff is actually much worse in the default
setup:

1. Set up a MySQL server in a 2G memcg, with a 28G buffer pool and 24G
   of NVMe swap.
2. Stress test with sysbench for 15 min.
3. Remove the 2G memcg limit and swapoff.

Before this patch, swapoff takes about 9m.
After this patch, swapoff takes about 30m.

After some analysis I found the reason is that cluster readahead is
almost disabled (window == 1 or 2) during swapoff, because it detects a
very low hit rate on the fragmented swap. But VMA readahead is much
more aggressive here, since swapoff is walking the VMA, with a very
high hit rate.

But if I force cluster readahead to use a large window for swapoff, the
swapoff performance improves by a lot, by adding the following change
in swap_cluster_readahead:

	if (unlikely(!(si->flags & SWP_WRITEOK)))
		mask = max_t(unsigned long, 1 << READ_ONCE(page_cluster),
			     PMD_SIZE / PAGE_SIZE) - 1;

Swapoff then takes only 40s to finish, more than 10x faster than the
VMA readahead path (9m), because VMA readahead is still doing 4K random
IO, just with a longer queue due to async readahead, while cluster
readahead is now doing 2M IO.

I think a PMD-sized window is good here, since it still keeps a balance
between good IO performance and keeping swapoff interruptible, so the
system stays responsive. And in most cases we expect swapoff to
succeed; if it fails, this window should still keep the side effect of
extra swapcache being generated acceptable.

But this showed a bad effect of ignoring the mempolicy. Actually this
is not a new issue: cluster readahead already violates the VMA's
mempolicy, since it does readahead based only on the entry value, not
the VMA, and the entry being swapped in is not aware of which VMA it
belongs to.

And I was thinking, maybe we can just drop the mpol all the way and use
the nid from the page shadow to allocate pages. That may save a lot of
effort, make cluster readahead more usable in general, and might also
simplify a lot of code. What do you think? If this is acceptable, I
think I can send out a new series first and then rework this one later.
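A minimal sketch of the shadow-based idea above, purely hypothetical
and not part of the posted series: the node id packed into a workingset
shadow entry is private to mm/workingset.c, so the shadow_to_nid()
helper below is an assumption, not an existing API; alloc_pages_node()
and numa_node_id() are existing kernel interfaces.

/*
 * Hypothetical sketch of "use the nid from the page shadow".
 * shadow_to_nid() is an assumed helper that would have to unpack the
 * node id stored by the workingset code; the rest of swapin_direct()
 * (memcg charge, swap read, LRU add) is unchanged and omitted here.
 */
static struct folio *swapin_direct_alloc(swp_entry_t entry, gfp_t gfp_mask,
					 void *shadow)
{
	/* Fall back to the local node when no shadow entry was left behind. */
	int nid = shadow ? shadow_to_nid(shadow) : numa_node_id();

	/* Allocate on the node the page was evicted from, ignoring mempolicy. */
	return (struct folio *)alloc_pages_node(nid, gfp_mask, 0);
}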
Kairui Song <ryncsn@gmail.com> writes:

> Hi Ying,
>
> I set up a test environment and ran the following test, and found that
> cluster readahead for swapoff is actually much worse in the default
> setup:
>
> 1. Set up a MySQL server in a 2G memcg, with a 28G buffer pool and 24G
>    of NVMe swap.
> 2. Stress test with sysbench for 15 min.
> 3. Remove the 2G memcg limit and swapoff.
>
> Before this patch, swapoff takes about 9m.
> After this patch, swapoff takes about 30m.

Thanks for the data!

> After some analysis I found the reason is that cluster readahead is
> almost disabled (window == 1 or 2) during swapoff, because it detects a
> very low hit rate on the fragmented swap. But VMA readahead is much
> more aggressive here, since swapoff is walking the VMA, with a very
> high hit rate.
>
> But if I force cluster readahead to use a large window for swapoff, the
> swapoff performance improves by a lot [...]
>
> I think a PMD-sized window is good here, since it still keeps a balance
> between good IO performance and keeping swapoff interruptible, so the
> system stays responsive. And in most cases we expect swapoff to
> succeed; if it fails, this window should still keep the side effect of
> extra swapcache being generated acceptable.

swapoff performance isn't very important, because swapoff is a very
rare operation. It's OK to optimize it if the change is simple and
doesn't compromise other stuff. But, as you said below, using a large
readahead window makes the mempolicy issue more serious. Why isn't the
original swapoff performance good enough for you?

> But this showed a bad effect of ignoring the mempolicy. Actually this
> is not a new issue: cluster readahead already violates the VMA's
> mempolicy, since it does readahead based only on the entry value, not
> the VMA, and the entry being swapped in is not aware of which VMA it
> belongs to.
>
> And I was thinking, maybe we can just drop the mpol all the way and use
> the nid from the page shadow to allocate pages. That may save a lot of
> effort, make cluster readahead more usable in general, and might also
> simplify a lot of code. What do you think? If this is acceptable, I
> think I can send out a new series first and then rework this one later.

The "shadow" node can be reclaimed, please take a look at
scan_shadow_nodes(). Although this hasn't been implemented, it may be
implemented someday. And, the shadow nid may not record the correct
memory policy for the memory allocation.

--
Best Regards,
Huang, Ying
On Mon, Jan 22, 2024 at 2:40 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> swapoff performance isn't very important, because swapoff is a very
> rare operation. It's OK to optimize it if the change is simple and
> doesn't compromise other stuff. But, as you said below, using a large
> readahead window makes the mempolicy issue more serious. Why isn't the
> original swapoff performance good enough for you?

Thanks for the reply.

I think I'll just keep the original VMA readahead policy here, then.
I also noticed that VMA readahead violates ranged memory policies
too... That's a different issue, though it looks trivial.

>> And I was thinking, maybe we can just drop the mpol all the way and
>> use the nid from the page shadow to allocate pages. That may save a
>> lot of effort, make cluster readahead more usable in general, and
>> might also simplify a lot of code.
>
> The "shadow" node can be reclaimed, please take a look at
> scan_shadow_nodes(). Although this hasn't been implemented, it may be
> implemented someday.

Right, I noticed upstream commit 5649d113ffce ("swap_state: update
shadow_nodes for anonymous page") started reclaiming anon page shadows
now, thanks for the info.
Kairui Song <ryncsn@gmail.com> writes:

> On Mon, Jan 22, 2024 at 2:40 PM Huang, Ying <ying.huang@intel.com> wrote:
- .pmd = pmd, >> >> >> > - }; >> >> >> > - >> >> >> > if (!pte++) { >> >> >> > pte = pte_offset_map(pmd, addr); >> >> >> > if (!pte) >> >> >> > @@ -1871,8 +1866,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, >> >> >> > pte_unmap(pte); >> >> >> > pte = NULL; >> >> >> > >> >> >> > - folio = swapin_entry(entry, GFP_HIGHUSER_MOVABLE, >> >> >> > - &vmf, NULL); >> >> >> > + mpol = get_vma_policy(vma, addr, 0, &ilx); >> >> >> > + folio = swapin_entry_mpol(entry, GFP_HIGHUSER_MOVABLE, >> >> >> > + mpol, ilx, NULL); >> >> >> > + mpol_cond_put(mpol); >> >> >> > if (!folio) { >> >> >> > /* >> >> >> > * The entry could have been freed, and will not >> >> >> >> >> >> IIUC, after the change, we will always use cluster readahead for >> >> >> swapoff. This may be OK. But, at least we need some test results which >> >> >> show that this will not cause any issue for this behavior change. And >> >> >> the behavior change should be described explicitly in patch description. >> >> > >> >> > Hi Ying >> >> > >> >> > Actually there is a swap_use_no_readahead check in swapin_entry_mpol, >> >> > so when readahaed is not needed (SYNC_IO), it's just skipped. >> >> > >> >> > And I think VMA readahead is not helpful swapoff, swapoff is already >> >> > walking the VMA, mostly uninterrupted in kernel space. With VMA >> >> > readahead or not, it will issue IO page by page. >> >> > The benchmark result I posted before is actually VMA readhead vs >> >> > no-readahead for ZRAM, sorry I didn't make it clear. It's obvious >> >> > no-readahead is faster. >> >> > >> >> > For actual block device, cluster readahead might be a good choice for >> >> > swapoff, since all pages will be read for swapoff, there has to be >> >> > enough memory for all swapcached page to stay in memory or swapoff >> >> > will fail anyway, and cluster read is faster for block devices. >> >> >> >> It's possible. But please run the tests on some actual block devices >> >> and show your results. Random memory accessing pattern should be >> >> tested, and the swap space usage should be > 50% to show some not so >> >> friendly situation. >> >> >> > >> > Hi Ying, >> > >> > I setup a test environment and did following test, and found that >> > cluster readahaed for swapoff is actually much worse in default setup: >> > >> > 1. Setup MySQL server using 2G memcg, with 28G buffer pool, and 24G NVME swap >> > 2. Stress test with sysbench for 15min. >> > 3. Remove the 2G memcg limit and swapoff. >> > >> > Before this patch, swapoff will take about 9m. >> > After this patch, swapoff will take about 30m. >> >> Thanks for data! >> >> > After some analysis I found the reason is that cluster readahead is >> > almost disabled (window == 1 or 2) during swapoff, because it will >> > detect a very low hit rate on fragmented swap. But VMA readhead is >> > much more aggressive here since swapoff is walking the VMA, with a >> > very high hit rate. >> > >> > But If I force cluster readahead to use a large window for swapoff, >> > the swapoff performance boost by a lot: >> > By adding following change in swap_cluster_readahead: >> > >> > if (unlikely(!(si->flags & SWP_WRITEOK))) >> > mask = max_t(unsigned long, 1 << READ_ONCE(page_cluster), PMD_SIZE >> > / PAGE_SIZE) - 1; >> > >> > The swapoff will take only 40s to finish, more than 10x faster than >> > the VMA readahead path (9m), because VMA readhead is still doing 4K >> > random IO just with a longer queue due to async readahead. But cluster >> > readhead will be doing 2M IO now. 
>> > I think a PMD-sized window is good here since it still keeps a balance
>> > between good IO performance and keeping the swapoff progress
>> > interruptible, so the system stays responsive. And in most cases we
>> > expect swapoff to succeed; if it fails, the RA window should still
>> > keep the side effect of extra swapcache being generated acceptable.
>>
>> swapoff performance isn't very important because swapoff is a very rare
>> operation. It's OK to optimize it if the change is simple and doesn't
>> compromise other stuff. But, as you said below, using a large readahead
>> window makes the mempolicy issue more serious. Why isn't the original
>> swapoff performance good enough for you?
>
> Thanks for the reply.
>
> I think I'll just keep the original VMA readahead policy here then.
> I just noticed that VMA readahead also violates ranged memory
> policy... That's a different issue though, and it looks trivial.

While reviewing your patch, I found that too. I think they can be
fixed, because we have enough information.

--
Best Regards,
Huang, Ying

>>
>> > But this showed a bad effect of ignoring mem policy. Actually this is
>> > not a new issue: cluster readahead already violates the VMA's mem
>> > policy, since it does readahead based only on the entry value, not the
>> > VMA; the entry being swapped in is not aware of which VMA it belongs to.
>> >
>> > And I was thinking, maybe we can just drop the mpol all the way and
>> > use the nid from the page shadow to alloc pages. That may save a lot of
>> > effort, make cluster readahead more usable in general, and also
>> > simplify a lot of code. What do you think? If this is acceptable, I
>> > think I can send out a new series first and then rework this one
>> > later.
>>
>> The "shadow" node can be reclaimed, please take a look at
>> scan_shadow_nodes(). Although this hasn't been implemented, it may be
>> implemented someday.
>
> Right, I noticed upstream commit 5649d113ffce ("swap_state: update
> shadow_nodes for anonymous page") started reclaiming anon page
> shadows now, thanks for the info.
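For reference, the experiment behind the 40-second swapoff number quoted above is roughly the fragment below, placed in swap_cluster_readahead() as Kairui describes. It forces at least a PMD-sized readahead window only while swapoff is running (swapoff clears SWP_WRITEOK early on); this was a measurement aid in the discussion, not part of the posted series.

	/*
	 * Measurement hack from the discussion above, inside
	 * swap_cluster_readahead(): swapoff clears SWP_WRITEOK, so use
	 * that to force at least a PMD-sized readahead window while the
	 * device is being swapped off.
	 */
	if (unlikely(!(si->flags & SWP_WRITEOK)))
		mask = max_t(unsigned long, 1 << READ_ONCE(page_cluster),
			     PMD_SIZE / PAGE_SIZE) - 1;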