Message ID | 20240408183946.2991168-6-ryan.roberts@arm.com (mailing list archive) |
---|---|
State | New |
Series | Swap-out mTHP without splitting |
On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote: > > Multi-size THP enables performance improvements by allocating large, > pte-mapped folios for anonymous memory. However I've observed that on an > arm64 system running a parallel workload (e.g. kernel compilation) > across many cores, under high memory pressure, the speed regresses. This > is due to bottlenecking on the increased number of TLBIs added due to > all the extra folio splitting when the large folios are swapped out. > > Therefore, solve this regression by adding support for swapping out mTHP > without needing to split the folio, just like is already done for > PMD-sized THP. This change only applies when CONFIG_THP_SWAP is enabled, > and when the swap backing store is a non-rotating block device. These > are the same constraints as for the existing PMD-sized THP swap-out > support. > > Note that no attempt is made to swap-in (m)THP here - this is still done > page-by-page, like for PMD-sized THP. But swapping-out mTHP is a > prerequisite for swapping-in mTHP. > > The main change here is to improve the swap entry allocator so that it > can allocate any power-of-2 number of contiguous entries between [1, (1 > << PMD_ORDER)]. This is done by allocating a cluster for each distinct > order and allocating sequentially from it until the cluster is full. > This ensures that we don't need to search the map and we get no > fragmentation due to alignment padding for different orders in the > cluster. If there is no current cluster for a given order, we attempt to > allocate a free cluster from the list. If there are no free clusters, we > fail the allocation and the caller can fall back to splitting the folio > and allocates individual entries (as per existing PMD-sized THP > fallback). > > The per-order current clusters are maintained per-cpu using the existing > infrastructure. This is done to avoid interleving pages from different > tasks, which would prevent IO being batched. This is already done for > the order-0 allocations so we follow the same pattern. > > As is done for order-0 per-cpu clusters, the scanner now can steal > order-0 entries from any per-cpu-per-order reserved cluster. This > ensures that when the swap file is getting full, space doesn't get tied > up in the per-cpu reserves. > > This change only modifies swap to be able to accept any order mTHP. It > doesn't change the callers to elide doing the actual split. That will be > done in separate changes. > > Reviewed-by: "Huang, Ying" <ying.huang@intel.com> > Signed-off-by: Ryan Roberts <ryan.roberts@arm.com> > --- > include/linux/swap.h | 8 ++- > mm/swapfile.c | 162 ++++++++++++++++++++++++------------------- > 2 files changed, 98 insertions(+), 72 deletions(-) > > diff --git a/include/linux/swap.h b/include/linux/swap.h > index b888e1080a94..11c53692f65f 100644 > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -268,13 +268,19 @@ struct swap_cluster_info { > */ > #define SWAP_NEXT_INVALID 0 > > +#ifdef CONFIG_THP_SWAP > +#define SWAP_NR_ORDERS (PMD_ORDER + 1) > +#else > +#define SWAP_NR_ORDERS 1 > +#endif > + > /* > * We assign a cluster to each CPU, so each CPU can allocate swap entry from > * its own cluster and swapout sequentially. The purpose is to optimize swapout > * throughput. 
> */ > struct percpu_cluster { > - unsigned int next; /* Likely next allocation offset */ > + unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ > }; > > struct swap_cluster_list { > diff --git a/mm/swapfile.c b/mm/swapfile.c > index d2e3d3cd439f..148ef08f19dd 100644 > --- a/mm/swapfile.c > +++ b/mm/swapfile.c > @@ -551,10 +551,12 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx) > > /* > * The cluster corresponding to page_nr will be used. The cluster will be > - * removed from free cluster list and its usage counter will be increased. > + * removed from free cluster list and its usage counter will be increased by > + * count. > */ > -static void inc_cluster_info_page(struct swap_info_struct *p, > - struct swap_cluster_info *cluster_info, unsigned long page_nr) > +static void add_cluster_info_page(struct swap_info_struct *p, > + struct swap_cluster_info *cluster_info, unsigned long page_nr, > + unsigned long count) > { > unsigned long idx = page_nr / SWAPFILE_CLUSTER; > > @@ -563,9 +565,19 @@ static void inc_cluster_info_page(struct swap_info_struct *p, > if (cluster_is_free(&cluster_info[idx])) > alloc_cluster(p, idx); > > - VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); > + VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); > cluster_set_count(&cluster_info[idx], > - cluster_count(&cluster_info[idx]) + 1); > + cluster_count(&cluster_info[idx]) + count); > +} > + > +/* > + * The cluster corresponding to page_nr will be used. The cluster will be > + * removed from free cluster list and its usage counter will be increased by 1. > + */ > +static void inc_cluster_info_page(struct swap_info_struct *p, > + struct swap_cluster_info *cluster_info, unsigned long page_nr) > +{ > + add_cluster_info_page(p, cluster_info, page_nr, 1); > } > > /* > @@ -595,7 +607,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p, > */ > static bool > scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, > - unsigned long offset) > + unsigned long offset, int order) > { > struct percpu_cluster *percpu_cluster; > bool conflict; > @@ -609,24 +621,39 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, > return false; > > percpu_cluster = this_cpu_ptr(si->percpu_cluster); > - percpu_cluster->next = SWAP_NEXT_INVALID; > + percpu_cluster->next[order] = SWAP_NEXT_INVALID; > + return true; > +} > + > +static inline bool swap_range_empty(char *swap_map, unsigned int start, > + unsigned int nr_pages) > +{ > + unsigned int i; > + > + for (i = 0; i < nr_pages; i++) { > + if (swap_map[start + i]) > + return false; > + } > + > return true; > } > > /* > - * Try to get a swap entry from current cpu's swap entry pool (a cluster). This > - * might involve allocating a new cluster for current CPU too. > + * Try to get swap entries with specified order from current cpu's swap entry > + * pool (a cluster). This might involve allocating a new cluster for current CPU > + * too. 
> */ > static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, > - unsigned long *offset, unsigned long *scan_base) > + unsigned long *offset, unsigned long *scan_base, int order) > { > + unsigned int nr_pages = 1 << order; > struct percpu_cluster *cluster; > struct swap_cluster_info *ci; > unsigned int tmp, max; > > new_cluster: > cluster = this_cpu_ptr(si->percpu_cluster); > - tmp = cluster->next; > + tmp = cluster->next[order]; > if (tmp == SWAP_NEXT_INVALID) { > if (!cluster_list_empty(&si->free_clusters)) { > tmp = cluster_next(&si->free_clusters.head) * > @@ -647,26 +674,27 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, > > /* > * Other CPUs can use our cluster if they can't find a free cluster, > - * check if there is still free entry in the cluster > + * check if there is still free entry in the cluster, maintaining > + * natural alignment. > */ > max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); > if (tmp < max) { > ci = lock_cluster(si, tmp); > while (tmp < max) { > - if (!si->swap_map[tmp]) > + if (swap_range_empty(si->swap_map, tmp, nr_pages)) > break; > - tmp++; > + tmp += nr_pages; > } > unlock_cluster(ci); > } > if (tmp >= max) { > - cluster->next = SWAP_NEXT_INVALID; > + cluster->next[order] = SWAP_NEXT_INVALID; > goto new_cluster; > } > *offset = tmp; > *scan_base = tmp; > - tmp += 1; > - cluster->next = tmp < max ? tmp : SWAP_NEXT_INVALID; > + tmp += nr_pages; > + cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID; > return true; > } > > @@ -796,13 +824,14 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si, > > static int scan_swap_map_slots(struct swap_info_struct *si, > unsigned char usage, int nr, > - swp_entry_t slots[]) > + swp_entry_t slots[], int order) > { > struct swap_cluster_info *ci; > unsigned long offset; > unsigned long scan_base; > unsigned long last_in_cluster = 0; > int latency_ration = LATENCY_LIMIT; > + unsigned int nr_pages = 1 << order; > int n_ret = 0; > bool scanned_many = false; > > @@ -817,6 +846,25 @@ static int scan_swap_map_slots(struct swap_info_struct *si, > * And we let swap pages go all over an SSD partition. Hugh > */ > > + if (order > 0) { > + /* > + * Should not even be attempting large allocations when huge > + * page swap is disabled. Warn and fail the allocation. > + */ > + if (!IS_ENABLED(CONFIG_THP_SWAP) || > + nr_pages > SWAPFILE_CLUSTER) { > + VM_WARN_ON_ONCE(1); > + return 0; > + } > + > + /* > + * Swapfile is not block device or not using clusters so unable > + * to allocate large entries. > + */ > + if (!(si->flags & SWP_BLKDEV) || !si->cluster_info) > + return 0; > + } > + > si->flags += SWP_SCANNING; > /* > * Use percpu scan base for SSD to reduce lock contention on > @@ -831,8 +879,11 @@ static int scan_swap_map_slots(struct swap_info_struct *si, > > /* SSD algorithm */ > if (si->cluster_info) { > - if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) > + if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { Hi Ryan, Sorry for bringing up an old thread. During the initial hour of utilizing an Android phone with 64KiB mTHP, we noticed that the anon_swpout_fallback rate was less than 10%. However, after several hours of phone usage, we observed a significant increase in the anon_swpout_fallback rate, reaching 100%. 
As I checked the code of scan_swap_map_try_ssd_cluster(),

static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
	unsigned long *offset, unsigned long *scan_base, int order)
{
	unsigned int nr_pages = 1 << order;
	struct percpu_cluster *cluster;
	struct swap_cluster_info *ci;
	unsigned int tmp, max;

new_cluster:
	cluster = this_cpu_ptr(si->percpu_cluster);
	tmp = cluster->next[order];
	if (tmp == SWAP_NEXT_INVALID) {
		if (!cluster_list_empty(&si->free_clusters)) {
			tmp = cluster_next(&si->free_clusters.head) *
					SWAPFILE_CLUSTER;
		} else if (!cluster_list_empty(&si->discard_clusters)) {
			/*
			 * we don't have free cluster but have some clusters in
			 * discarding, do discard now and reclaim them, then
			 * reread cluster_next_cpu since we dropped si->lock
			 */
			swap_do_scheduled_discard(si);
			*scan_base = this_cpu_read(*si->cluster_next_cpu);
			*offset = *scan_base;
			goto new_cluster;
		} else
			return false;
	}
	...
}

Considering the cluster_list_empty() checks, is it necessary to have
free_cluster to ensure a continuous allocation of swap slots for large folio
swap out? For instance, if numerous clusters still possess ample free swap
slots, could we potentially miss out on them due to a lack of execution of a
slow scan?

I'm not saying your patchset has problems, just that I have some questions.

Thanks
Barry
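To make the failure mode Barry describes concrete, here is a minimal userspace sketch. It is an illustration under stated assumptions, not kernel code: CLUSTER_SLOTS, NR_CLUSTERS and alloc_high_order() are made-up stand-ins for SWAPFILE_CLUSTER, the free-cluster list and the per-order allocation path, and the sizes are arbitrary. It models the policy in the patch (a high-order request only succeeds if a wholly free cluster exists) and shows that a single busy slot per cluster forces fallback even though almost every slot is free.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define CLUSTER_SLOTS	512	/* illustrative stand-in for SWAPFILE_CLUSTER */
#define NR_CLUSTERS	8
#define NR_SLOTS	(CLUSTER_SLOTS * NR_CLUSTERS)

static unsigned char swap_map[NR_SLOTS];	/* 0 == free, non-zero == in use */

static bool cluster_is_free(int idx)
{
	for (int i = 0; i < CLUSTER_SLOTS; i++)
		if (swap_map[idx * CLUSTER_SLOTS + i])
			return false;
	return true;
}

/* Mimics the patch's policy: a high-order request needs a wholly free cluster. */
static long alloc_high_order(int order)
{
	for (int idx = 0; idx < NR_CLUSTERS; idx++) {
		if (cluster_is_free(idx)) {
			memset(swap_map + idx * CLUSTER_SLOTS, 1, 1 << order);
			return (long)idx * CLUSTER_SLOTS;
		}
	}
	return -1;	/* caller would fall back to splitting + order-0 entries */
}

int main(void)
{
	long free_slots = 0;

	/* Leave one busy slot in every cluster: the free-cluster list is now empty. */
	for (int idx = 0; idx < NR_CLUSTERS; idx++)
		swap_map[idx * CLUSTER_SLOTS] = 1;

	for (int i = 0; i < NR_SLOTS; i++)
		if (!swap_map[i])
			free_slots++;

	printf("free slots: %ld of %d\n", free_slots, NR_SLOTS);
	printf("order-4 allocation offset: %ld\n", alloc_high_order(4));
	return 0;
}

Compiled with a plain C compiler this prints 4088 free slots out of 4096 yet an allocation result of -1, which is essentially the 100% anon_swpout_fallback behaviour reported above once every cluster is lightly dirtied.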
On 13/05/2024 08:30, Barry Song wrote:
> On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> Multi-size THP enables performance improvements by allocating large,
>> pte-mapped folios for anonymous memory. However I've observed that on an
>> arm64 system running a parallel workload (e.g. kernel compilation)
>> across many cores, under high memory pressure, the speed regresses. This
>> is due to bottlenecking on the increased number of TLBIs added due to
>> all the extra folio splitting when the large folios are swapped out.
>>
>> Therefore, solve this regression by adding support for swapping out mTHP
>> without needing to split the folio, just like is already done for
>> PMD-sized THP. This change only applies when CONFIG_THP_SWAP is enabled,
>> and when the swap backing store is a non-rotating block device. These
>> are the same constraints as for the existing PMD-sized THP swap-out
>> support.
>>
>> Note that no attempt is made to swap-in (m)THP here - this is still done
>> page-by-page, like for PMD-sized THP. But swapping-out mTHP is a
>> prerequisite for swapping-in mTHP.
>>
>> The main change here is to improve the swap entry allocator so that it
>> can allocate any power-of-2 number of contiguous entries between [1, (1
>> << PMD_ORDER)]. This is done by allocating a cluster for each distinct
>> order and allocating sequentially from it until the cluster is full.
>> This ensures that we don't need to search the map and we get no
>> fragmentation due to alignment padding for different orders in the
>> cluster. If there is no current cluster for a given order, we attempt to
>> allocate a free cluster from the list. If there are no free clusters, we
>> fail the allocation and the caller can fall back to splitting the folio
>> and allocates individual entries (as per existing PMD-sized THP
>> fallback).
>>
>> The per-order current clusters are maintained per-cpu using the existing
>> infrastructure. This is done to avoid interleving pages from different
>> tasks, which would prevent IO being batched. This is already done for
>> the order-0 allocations so we follow the same pattern.
>>
>> As is done for order-0 per-cpu clusters, the scanner now can steal
>> order-0 entries from any per-cpu-per-order reserved cluster. This
>> ensures that when the swap file is getting full, space doesn't get tied
>> up in the per-cpu reserves.
>>
>> This change only modifies swap to be able to accept any order mTHP. It
>> doesn't change the callers to elide doing the actual split. That will be
>> done in separate changes.
[...]
>
> Hi Ryan,
>
> Sorry for bringing up an old thread.

No problem - thanks for the report!

>
> During the initial hour of utilizing an Android phone with 64KiB mTHP,
> we noticed that the anon_swpout_fallback rate was less than 10%. However,
> after several hours of phone usage, we observed a significant increase in
> the anon_swpout_fallback rate, reaching 100%.

I suspect this is due to fragmentation of the clusters; if there is just one
page left in a cluster then the cluster can't be freed, and once the cluster
free list is empty a new cluster allocation will fail and this will cause
fallback to order-0.
> As I checked the code of scan_swap_map_try_ssd_cluster(),
>
> static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
> 	unsigned long *offset, unsigned long *scan_base, int order)
> {
> 	unsigned int nr_pages = 1 << order;
> 	struct percpu_cluster *cluster;
> 	struct swap_cluster_info *ci;
> 	unsigned int tmp, max;
>
> new_cluster:
> 	cluster = this_cpu_ptr(si->percpu_cluster);
> 	tmp = cluster->next[order];
> 	if (tmp == SWAP_NEXT_INVALID) {
> 		if (!cluster_list_empty(&si->free_clusters)) {
> 			tmp = cluster_next(&si->free_clusters.head) *
> 					SWAPFILE_CLUSTER;
> 		} else if (!cluster_list_empty(&si->discard_clusters)) {
> 			/*
> 			 * we don't have free cluster but have some clusters in
> 			 * discarding, do discard now and reclaim them, then
> 			 * reread cluster_next_cpu since we dropped si->lock
> 			 */
> 			swap_do_scheduled_discard(si);
> 			*scan_base = this_cpu_read(*si->cluster_next_cpu);
> 			*offset = *scan_base;
> 			goto new_cluster;
> 		} else
> 			return false;
> 	}
> 	...
> }
>
> Considering the cluster_list_empty() checks, is it necessary to have
> free_cluster to ensure a continuous allocation of swap slots for large
> folio swap out?

Yes, currently that is done by design; if we can't allocate a free cluster
then we only scan for free space in an already allocated cluster for order-0
allocations. I did this for a couple of reasons:

1: Simplicity.

2: Keep behavior the same as PMD-order allocations, which are never scanned
(although the cluster is the same size as the PMD so scanning would be
pointless there - so perhaps this is not a good argument for not scanning
smaller high orders).

3: If scanning for a high order fails then we would fall back to order-0 and
scan again, so I was trying to avoid the potential for 2 scans (although once
you split the page, you'll end up scanning per-page, so perhaps it's not a
real argument either).

> For instance, if numerous clusters still possess ample free swap slots,
> could we potentially miss out on them due to a lack of execution of a
> slow scan?

I think it would definitely be possible to add support for scanning high
orders and, from memory, I don't think it would be too difficult. Based on
your experience, it sounds like this would be valuable.

I'm going to be out on paternity leave for 3 weeks from the end of today, so
I won't personally be able to do this until I get back. I might find some
time to review if you were to post something though :)

> I'm not saying your patchset has problems, just that I have some questions.

Let's call it "opportunity for further improvement" rather than problems. :)

I suspect swap-in of large folios may help reduce the fragmentation a bit,
since we are less likely to keep parts of a previously swapped-out mTHP in
swap.

Also, I understand that Chris Li has been doing some thinking around an
indirection layer which would remove the requirement for pages of a large
folio to be stored contiguously in the swap file. I think he is planning to
talk about that at LSFMM? (which I sadly won't be attending).

Thanks,
Ryan

> Thanks
> Barry
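For illustration, a rough userspace sketch of the kind of aligned scan discussed above. This is not the kernel implementation; the helper names range_empty() and scan_for_order() are invented, and the byte-per-slot swap_map model plus the CLUSTER_SLOTS value are assumptions. It only shows walking the map in naturally aligned strides of 1 << order, the same stride scan_swap_map_try_ssd_cluster() already uses inside its current cluster, but applied across clusters that are already partially in use.

#include <stdbool.h>
#include <stdio.h>

#define CLUSTER_SLOTS	512	/* illustrative stand-in for SWAPFILE_CLUSTER */

static bool range_empty(const unsigned char *swap_map, long start, int nr)
{
	for (int i = 0; i < nr; i++)
		if (swap_map[start + i])
			return false;
	return true;
}

/*
 * Walk the map in naturally aligned steps of (1 << order) slots and return
 * the first fully free run, or -1 if none exists.
 */
static long scan_for_order(const unsigned char *swap_map, long nr_slots, int order)
{
	int nr = 1 << order;

	for (long off = 0; off + nr <= nr_slots; off += nr)
		if (range_empty(swap_map, off, nr))
			return off;
	return -1;
}

int main(void)
{
	unsigned char swap_map[4 * CLUSTER_SLOTS] = { 0 };

	/* Fragment the map: one busy slot at the start of every cluster. */
	for (int c = 0; c < 4; c++)
		swap_map[c * CLUSTER_SLOTS] = 1;

	printf("order-4 run found at offset %ld\n",
	       scan_for_order(swap_map, (long)sizeof(swap_map), 4));
	return 0;
}

In this toy map, one busy slot per cluster still leaves an aligned order-4 run at offset 16 - exactly the kind of space the current allocator cannot reach once the free-cluster list is empty.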
On Mon, May 13, 2024 at 8:43 PM Ryan Roberts <ryan.roberts@arm.com> wrote: > > On 13/05/2024 08:30, Barry Song wrote: > > On Tue, Apr 9, 2024 at 6:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote: > >> > >> Multi-size THP enables performance improvements by allocating large, > >> pte-mapped folios for anonymous memory. However I've observed that on an > >> arm64 system running a parallel workload (e.g. kernel compilation) > >> across many cores, under high memory pressure, the speed regresses. This > >> is due to bottlenecking on the increased number of TLBIs added due to > >> all the extra folio splitting when the large folios are swapped out. > >> > >> Therefore, solve this regression by adding support for swapping out mTHP > >> without needing to split the folio, just like is already done for > >> PMD-sized THP. This change only applies when CONFIG_THP_SWAP is enabled, > >> and when the swap backing store is a non-rotating block device. These > >> are the same constraints as for the existing PMD-sized THP swap-out > >> support. > >> > >> Note that no attempt is made to swap-in (m)THP here - this is still done > >> page-by-page, like for PMD-sized THP. But swapping-out mTHP is a > >> prerequisite for swapping-in mTHP. > >> > >> The main change here is to improve the swap entry allocator so that it > >> can allocate any power-of-2 number of contiguous entries between [1, (1 > >> << PMD_ORDER)]. This is done by allocating a cluster for each distinct > >> order and allocating sequentially from it until the cluster is full. > >> This ensures that we don't need to search the map and we get no > >> fragmentation due to alignment padding for different orders in the > >> cluster. If there is no current cluster for a given order, we attempt to > >> allocate a free cluster from the list. If there are no free clusters, we > >> fail the allocation and the caller can fall back to splitting the folio > >> and allocates individual entries (as per existing PMD-sized THP > >> fallback). > >> > >> The per-order current clusters are maintained per-cpu using the existing > >> infrastructure. This is done to avoid interleving pages from different > >> tasks, which would prevent IO being batched. This is already done for > >> the order-0 allocations so we follow the same pattern. > >> > >> As is done for order-0 per-cpu clusters, the scanner now can steal > >> order-0 entries from any per-cpu-per-order reserved cluster. This > >> ensures that when the swap file is getting full, space doesn't get tied > >> up in the per-cpu reserves. > >> > >> This change only modifies swap to be able to accept any order mTHP. It > >> doesn't change the callers to elide doing the actual split. That will be > >> done in separate changes. > > [...] > > > > > Hi Ryan, > > > > Sorry for bringing up an old thread. > > No problem - thanks for the report! > > > > > During the initial hour of utilizing an Android phone with 64KiB mTHP, > > we noticed that the > > anon_swpout_fallback rate was less than 10%. However, after several > > hours of phone > > usage, we observed a significant increase in the anon_swpout_fallback > > rate, reaching > > 100%. > > I suspect this is due to fragmentation of the clusters; If there is just one > page left in a cluster then the cluster can't be freed and once the cluster free > list is empty a new cluster allcoation will fail and this will cause fallback to > order-0. 
> > > > > As I checked the code of scan_swap_map_try_ssd_cluster(), > > > > static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, > > unsigned long *offset, unsigned long *scan_base, int order) > > { > > unsigned int nr_pages = 1 << order; > > struct percpu_cluster *cluster; > > struct swap_cluster_info *ci; > > unsigned int tmp, max; > > > > new_cluster: > > cluster = this_cpu_ptr(si->percpu_cluster); > > tmp = cluster->next[order]; > > if (tmp == SWAP_NEXT_INVALID) { > > if (!cluster_list_empty(&si->free_clusters)) { > > tmp = cluster_next(&si->free_clusters.head) * > > SWAPFILE_CLUSTER; > > } else if (!cluster_list_empty(&si->discard_clusters)) { > > /* > > * we don't have free cluster but have some clusters in > > * discarding, do discard now and reclaim them, then > > * reread cluster_next_cpu since we dropped si->lock > > */ > > swap_do_scheduled_discard(si); > > *scan_base = this_cpu_read(*si->cluster_next_cpu); > > *offset = *scan_base; > > goto new_cluster; > > } else > > return false; > > } > > ... > > > > } > > > > Considering the cluster_list_empty() checks, is it necessary to have > > free_cluster to > > ensure a continuous allocation of swap slots for large folio swap out? > > Yes, currently that is done by design; if we can't allocate a free cluster then > we only scan for free space in an already allocated cluster for order-0 > allocations. I did this for a couple of reasons; > > 1: Simplicity. > > 2: Keep behavior the same as PMD-order allocations, which are never scanned > (although the cluster is the same size as the PMD so scanning would be pointless > there - so perhaps this is not a good argument for not scanning smaller high > orders). > > 3: If scanning for a high order fails then we would fall back to order-0 and > scan again, so I was trying to avoid the potential for 2 scans (although once > you split the page, you'll end up scanning per-page, so perhaps its not a real > argument either). > > > For instance, > > if numerous clusters still possess ample free swap slots, could we > > potentially miss > > out on them due to a lack of execution of a slow scan? > > I think it would definitely be possible to add support for scanning high orders > and from memory, I don't think it would be too difficult. Based on your > experience, it sounds like this would be valuable. > > I'm going to be out on Paternity leave for 3 weeks from end of today, so I won't > personally be able to do this until I get back. I might find some time to review > if you were to post something though :) Congratulations on the arrival of your precious little one! Forget about the swap and mTHP, enjoy your time with the family :-) > > > > > I'm not saying your patchset has problems, just that I have some questions. > > Let's call it "opportunity for further improvement" rather than problems. :) > > I suspect swap-in of large folios may help reduce the fragmentation a bit since > we are less likely to keep parts of a previously swapped-out mTHP in swap. > > Also, I understand that Chris Li has been doing some thinking around an > indirection layer which would remove the requirement for pages of a large folio > to be stored contiguously in the swap file. I think he is planning to talk about > that at LSFMM? (which I sadly won't be attending). > > Thanks, > Ryan > > > Thanks Barry
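Regarding the indirection layer Ryan mentions: the sketch below is purely illustrative and is not Chris Li's design (which is not described in this thread). It only shows the general idea of handing a folio contiguous virtual swap slots while a translation table maps each one to an arbitrary physical slot, so the backing slots no longer need to be contiguous.

#include <stdio.h>

#define NR_VSLOTS 64

static long vslot_to_pslot[NR_VSLOTS];	/* virtual slot -> physical slot */
static int nr_vslots_used;

/* Back nr contiguous virtual slots with arbitrary (scattered) physical slots. */
static int map_folio(const long *phys, int nr)
{
	int start = nr_vslots_used;

	if (start + nr > NR_VSLOTS)
		return -1;
	for (int i = 0; i < nr; i++)
		vslot_to_pslot[nr_vslots_used++] = phys[i];
	return start;
}

int main(void)
{
	/* Scattered free physical slots, as left behind by heavy fragmentation. */
	const long phys[] = { 3, 17, 40, 99 };
	int start = map_folio(phys, 4);

	for (int i = 0; i < 4; i++)
		printf("virtual slot %d -> physical slot %ld\n",
		       start + i, vslot_to_pslot[start + i]);
	return 0;
}

The cost of any such scheme is the extra table and lookup on the swap-out/swap-in paths; the benefit is that physical fragmentation alone no longer forces a fallback to order-0.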
diff --git a/include/linux/swap.h b/include/linux/swap.h index b888e1080a94..11c53692f65f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -268,13 +268,19 @@ struct swap_cluster_info { */ #define SWAP_NEXT_INVALID 0 +#ifdef CONFIG_THP_SWAP +#define SWAP_NR_ORDERS (PMD_ORDER + 1) +#else +#define SWAP_NR_ORDERS 1 +#endif + /* * We assign a cluster to each CPU, so each CPU can allocate swap entry from * its own cluster and swapout sequentially. The purpose is to optimize swapout * throughput. */ struct percpu_cluster { - unsigned int next; /* Likely next allocation offset */ + unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; struct swap_cluster_list { diff --git a/mm/swapfile.c b/mm/swapfile.c index d2e3d3cd439f..148ef08f19dd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -551,10 +551,12 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx) /* * The cluster corresponding to page_nr will be used. The cluster will be - * removed from free cluster list and its usage counter will be increased. + * removed from free cluster list and its usage counter will be increased by + * count. */ -static void inc_cluster_info_page(struct swap_info_struct *p, - struct swap_cluster_info *cluster_info, unsigned long page_nr) +static void add_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr, + unsigned long count) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; @@ -563,9 +565,19 @@ static void inc_cluster_info_page(struct swap_info_struct *p, if (cluster_is_free(&cluster_info[idx])) alloc_cluster(p, idx); - VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); + VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); cluster_set_count(&cluster_info[idx], - cluster_count(&cluster_info[idx]) + 1); + cluster_count(&cluster_info[idx]) + count); +} + +/* + * The cluster corresponding to page_nr will be used. The cluster will be + * removed from free cluster list and its usage counter will be increased by 1. + */ +static void inc_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) +{ + add_cluster_info_page(p, cluster_info, page_nr, 1); } /* @@ -595,7 +607,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p, */ static bool scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, - unsigned long offset) + unsigned long offset, int order) { struct percpu_cluster *percpu_cluster; bool conflict; @@ -609,24 +621,39 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, return false; percpu_cluster = this_cpu_ptr(si->percpu_cluster); - percpu_cluster->next = SWAP_NEXT_INVALID; + percpu_cluster->next[order] = SWAP_NEXT_INVALID; + return true; +} + +static inline bool swap_range_empty(char *swap_map, unsigned int start, + unsigned int nr_pages) +{ + unsigned int i; + + for (i = 0; i < nr_pages; i++) { + if (swap_map[start + i]) + return false; + } + return true; } /* - * Try to get a swap entry from current cpu's swap entry pool (a cluster). This - * might involve allocating a new cluster for current CPU too. + * Try to get swap entries with specified order from current cpu's swap entry + * pool (a cluster). This might involve allocating a new cluster for current CPU + * too. 
*/ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, - unsigned long *offset, unsigned long *scan_base) + unsigned long *offset, unsigned long *scan_base, int order) { + unsigned int nr_pages = 1 << order; struct percpu_cluster *cluster; struct swap_cluster_info *ci; unsigned int tmp, max; new_cluster: cluster = this_cpu_ptr(si->percpu_cluster); - tmp = cluster->next; + tmp = cluster->next[order]; if (tmp == SWAP_NEXT_INVALID) { if (!cluster_list_empty(&si->free_clusters)) { tmp = cluster_next(&si->free_clusters.head) * @@ -647,26 +674,27 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, /* * Other CPUs can use our cluster if they can't find a free cluster, - * check if there is still free entry in the cluster + * check if there is still free entry in the cluster, maintaining + * natural alignment. */ max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); if (tmp < max) { ci = lock_cluster(si, tmp); while (tmp < max) { - if (!si->swap_map[tmp]) + if (swap_range_empty(si->swap_map, tmp, nr_pages)) break; - tmp++; + tmp += nr_pages; } unlock_cluster(ci); } if (tmp >= max) { - cluster->next = SWAP_NEXT_INVALID; + cluster->next[order] = SWAP_NEXT_INVALID; goto new_cluster; } *offset = tmp; *scan_base = tmp; - tmp += 1; - cluster->next = tmp < max ? tmp : SWAP_NEXT_INVALID; + tmp += nr_pages; + cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID; return true; } @@ -796,13 +824,14 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si, static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, - swp_entry_t slots[]) + swp_entry_t slots[], int order) { struct swap_cluster_info *ci; unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; + unsigned int nr_pages = 1 << order; int n_ret = 0; bool scanned_many = false; @@ -817,6 +846,25 @@ static int scan_swap_map_slots(struct swap_info_struct *si, * And we let swap pages go all over an SSD partition. Hugh */ + if (order > 0) { + /* + * Should not even be attempting large allocations when huge + * page swap is disabled. Warn and fail the allocation. + */ + if (!IS_ENABLED(CONFIG_THP_SWAP) || + nr_pages > SWAPFILE_CLUSTER) { + VM_WARN_ON_ONCE(1); + return 0; + } + + /* + * Swapfile is not block device or not using clusters so unable + * to allocate large entries. 
+ */ + if (!(si->flags & SWP_BLKDEV) || !si->cluster_info) + return 0; + } + si->flags += SWP_SCANNING; /* * Use percpu scan base for SSD to reduce lock contention on @@ -831,8 +879,11 @@ static int scan_swap_map_slots(struct swap_info_struct *si, /* SSD algorithm */ if (si->cluster_info) { - if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) + if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { + if (order > 0) + goto no_page; goto scan; + } } else if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; @@ -874,13 +925,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si, checks: if (si->cluster_info) { - while (scan_swap_map_ssd_cluster_conflict(si, offset)) { + while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) { /* take a break if we already got some slots */ if (n_ret) goto done; if (!scan_swap_map_try_ssd_cluster(si, &offset, - &scan_base)) + &scan_base, order)) { + if (order > 0) + goto no_page; goto scan; + } } } if (!(si->flags & SWP_WRITEOK)) @@ -911,11 +965,11 @@ static int scan_swap_map_slots(struct swap_info_struct *si, else goto done; } - WRITE_ONCE(si->swap_map[offset], usage); - inc_cluster_info_page(si, si->cluster_info, offset); + memset(si->swap_map + offset, usage, nr_pages); + add_cluster_info_page(si, si->cluster_info, offset, nr_pages); unlock_cluster(ci); - swap_range_alloc(si, offset, 1); + swap_range_alloc(si, offset, nr_pages); slots[n_ret++] = swp_entry(si->type, offset); /* got enough slots or reach max slots? */ @@ -936,8 +990,10 @@ static int scan_swap_map_slots(struct swap_info_struct *si, /* try to get more slots in cluster */ if (si->cluster_info) { - if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) + if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) goto checks; + if (order > 0) + goto done; } else if (si->cluster_nr && !si->swap_map[++offset]) { /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; @@ -964,11 +1020,13 @@ static int scan_swap_map_slots(struct swap_info_struct *si, } done: - set_cluster_next(si, offset + 1); + if (order == 0) + set_cluster_next(si, offset + 1); si->flags -= SWP_SCANNING; return n_ret; scan: + VM_WARN_ON(order > 0); spin_unlock(&si->lock); while (++offset <= READ_ONCE(si->highest_bit)) { if (unlikely(--latency_ration < 0)) { @@ -997,38 +1055,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, return n_ret; } -static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) -{ - unsigned long idx; - struct swap_cluster_info *ci; - unsigned long offset; - - /* - * Should not even be attempting cluster allocations when huge - * page swap is disabled. Warn and fail the allocation. 
- */ - if (!IS_ENABLED(CONFIG_THP_SWAP)) { - VM_WARN_ON_ONCE(1); - return 0; - } - - if (cluster_list_empty(&si->free_clusters)) - return 0; - - idx = cluster_list_first(&si->free_clusters); - offset = idx * SWAPFILE_CLUSTER; - ci = lock_cluster(si, offset); - alloc_cluster(si, idx); - cluster_set_count(ci, SWAPFILE_CLUSTER); - - memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER); - unlock_cluster(ci); - swap_range_alloc(si, offset, SWAPFILE_CLUSTER); - *slot = swp_entry(si->type, offset); - - return 1; -} - static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) { unsigned long offset = idx * SWAPFILE_CLUSTER; @@ -1051,9 +1077,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) int n_ret = 0; int node; - /* Only single cluster request supported */ - WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); - spin_lock(&swap_avail_lock); avail_pgs = atomic_long_read(&nr_swap_pages) / size; @@ -1089,14 +1112,10 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) spin_unlock(&si->lock); goto nextsi; } - if (size == SWAPFILE_CLUSTER) { - if (si->flags & SWP_BLKDEV) - n_ret = swap_alloc_cluster(si, swp_entries); - } else - n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, - n_goal, swp_entries); + n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, + n_goal, swp_entries, order); spin_unlock(&si->lock); - if (n_ret || size == SWAPFILE_CLUSTER) + if (n_ret || size > 1) goto check_out; cond_resched(); @@ -1673,7 +1692,7 @@ swp_entry_t get_swap_page_of_type(int type) /* This is called for allocating swap entry, not cache */ spin_lock(&si->lock); - if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry)) + if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0)) atomic_long_dec(&nr_swap_pages); spin_unlock(&si->lock); fail: @@ -3127,7 +3146,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->flags |= SWP_SYNCHRONOUS_IO; if (p->bdev && bdev_nonrot(p->bdev)) { - int cpu; + int cpu, i; unsigned long ci, nr_cluster; p->flags |= SWP_SOLIDSTATE; @@ -3165,7 +3184,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) struct percpu_cluster *cluster; cluster = per_cpu_ptr(p->percpu_cluster, cpu); - cluster->next = SWAP_NEXT_INVALID; + for (i = 0; i < SWAP_NR_ORDERS; i++) + cluster->next[i] = SWAP_NEXT_INVALID; } } else { atomic_inc(&nr_rotate_swap);
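As a footnote to the scan_swap_map_try_ssd_cluster() hunk above: the "natural alignment" comment relies on the cluster base being a multiple of SWAPFILE_CLUSTER and the cursor only advancing in steps of nr_pages = 1 << order. Below is a small standalone check of that arithmetic; it is a userspace illustration, not kernel code, and the SWAPFILE_CLUSTER value and ALIGN() macro are simplified stand-ins.

#include <assert.h>
#include <stdio.h>

#define SWAPFILE_CLUSTER 512UL	/* illustrative power-of-two value */
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	for (int order = 0; order <= 9; order++) {	/* 1 << 9 == SWAPFILE_CLUSTER here */
		unsigned long nr_pages = 1UL << order;
		unsigned long base = 7 * SWAPFILE_CLUSTER;	/* arbitrary cluster base */
		unsigned long max = ALIGN(base + 1, SWAPFILE_CLUSTER);

		/* Every candidate offset handed out for this order stays aligned. */
		for (unsigned long tmp = base; tmp < max; tmp += nr_pages)
			assert(tmp % nr_pages == 0);

		printf("order %d: offsets [%lu, %lu) all aligned to %lu\n",
		       order, base, max, nr_pages);
	}
	return 0;
}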