Message ID | 20240612124750.2220726-2-usamaarif642@gmail.com
---|---
State | New
Series | mm: store zero pages to be swapped out in a bitmap
On Wed, Jun 12, 2024 at 01:43:35PM +0100, Usama Arif wrote: [..] Hi Usama, A few more comments/questions, sorry for not looking closely earlier. > diff --git a/mm/swapfile.c b/mm/swapfile.c > index f1e559e216bd..48d8dca0b94b 100644 > --- a/mm/swapfile.c > +++ b/mm/swapfile.c > @@ -453,6 +453,8 @@ static unsigned int cluster_list_del_first(struct swap_cluster_list *list, > static void swap_cluster_schedule_discard(struct swap_info_struct *si, > unsigned int idx) > { > + unsigned int i; > + > /* > * If scan_swap_map_slots() can't find a free cluster, it will check > * si->swap_map directly. To make sure the discarding cluster isn't > @@ -461,6 +463,13 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, > */ > memset(si->swap_map + idx * SWAPFILE_CLUSTER, > SWAP_MAP_BAD, SWAPFILE_CLUSTER); > + /* > + * zeromap can see updates from concurrent swap_writepage() and swap_read_folio() > + * call on other slots, hence use atomic clear_bit for zeromap instead of the > + * non-atomic bitmap_clear. > + */ I don't think this is accurate. swap_read_folio() does not update the zeromap. I think the need for an atomic operation here is because we may be updating adjacent bits simultaneously, so we may cause lost updates otherwise (i.e. corrupting adjacent bits). > + for (i = 0; i < SWAPFILE_CLUSTER; i++) > + clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); Could you explain why we need to clear the zeromap here? swap_cluster_schedule_discard() is called from: - swap_free_cluster() -> free_cluster() This is already covered below. - swap_entry_free() -> dec_cluster_info_page() -> free_cluster() Each entry in the cluster should have its zeromap bit cleared in swap_entry_free() before the entire cluster is free and we call free_cluster(). Am I missing something? > > cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); > > @@ -482,7 +491,7 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) > static void swap_do_scheduled_discard(struct swap_info_struct *si) > { > struct swap_cluster_info *info, *ci; > - unsigned int idx; > + unsigned int idx, i; > > info = si->cluster_info; > > @@ -498,6 +507,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) > __free_cluster(si, idx); > memset(si->swap_map + idx * SWAPFILE_CLUSTER, > 0, SWAPFILE_CLUSTER); > + for (i = 0; i < SWAPFILE_CLUSTER; i++) > + clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); Same here. I didn't look into the specific code paths, but shouldn't the cluster be unused (and hence its zeromap bits already cleared?). > unlock_cluster(ci); > } > } > @@ -1059,9 +1070,12 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) > { > unsigned long offset = idx * SWAPFILE_CLUSTER; > struct swap_cluster_info *ci; > + unsigned int i; > > ci = lock_cluster(si, offset); > memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); > + for (i = 0; i < SWAPFILE_CLUSTER; i++) > + clear_bit(offset + i, si->zeromap); > cluster_set_count_flag(ci, 0, 0); > free_cluster(si, idx); > unlock_cluster(ci); > @@ -1336,6 +1350,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) > count = p->swap_map[offset]; > VM_BUG_ON(count != SWAP_HAS_CACHE); > p->swap_map[offset] = 0; > + clear_bit(offset, p->zeromap); I think instead of clearing the zeromap in swap_free_cluster() and here separately, we can just do it in swap_range_free(). I suspect this may be the only place we really need to clear the zeromap in the swapfile code.
> dec_cluster_info_page(p, p->cluster_info, offset); > unlock_cluster(ci); > > @@ -2597,6 +2612,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) > free_percpu(p->cluster_next_cpu); > p->cluster_next_cpu = NULL; > vfree(swap_map); > + bitmap_free(p->zeromap); > kvfree(cluster_info); > /* Destroy swap account information */ > swap_cgroup_swapoff(p->type); > @@ -3123,6 +3139,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) > goto bad_swap_unlock_inode; > } > > + p->zeromap = bitmap_zalloc(maxpages, GFP_KERNEL); > + if (!p->zeromap) { > + error = -ENOMEM; > + goto bad_swap_unlock_inode; > + } > + > if (p->bdev && bdev_stable_writes(p->bdev)) > p->flags |= SWP_STABLE_WRITES; > > -- > 2.43.0 >
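The lost-update problem mentioned in the review above (non-atomic read-modify-write on adjacent bits of the same word) is easy to reproduce outside the kernel. The following is a minimal userspace sketch, not kernel code: it emulates the kernel's atomic set_bit()/clear_bit() with C11 atomics and contrasts them with plain |=/&= read-modify-writes on adjacent bits of one word. All names, the iteration count and the deliberately racy non-atomic path are made up for the demo; build with "cc -O2 -pthread" and run it a few times.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define ITERS 1000000

/*
 * volatile only stops the compiler from optimising the loop away;
 * it does NOT make the read-modify-write atomic (that is the point).
 */
static volatile unsigned long word;
static _Atomic unsigned long aword;

struct arg { int bit; int use_atomic; };

static void *worker(void *p)
{
	struct arg *a = p;
	unsigned long mask = 1UL << a->bit;

	for (long i = 0; i < ITERS; i++) {
		if (a->use_atomic) {
			atomic_fetch_or(&aword, mask);   /* like set_bit()   */
			atomic_fetch_and(&aword, ~mask); /* like clear_bit() */
		} else {
			word |= mask;   /* non-atomic RMW, like bitmap_set()   */
			word &= ~mask;  /* non-atomic RMW, like bitmap_clear()  */
		}
	}
	return NULL;
}

int main(void)
{
	for (int use_atomic = 0; use_atomic <= 1; use_atomic++) {
		pthread_t t1, t2;
		struct arg a1 = { .bit = 0, .use_atomic = use_atomic };
		struct arg a2 = { .bit = 1, .use_atomic = use_atomic };

		word = 0;
		aword = 0;
		pthread_create(&t1, NULL, worker, &a1);
		pthread_create(&t2, NULL, worker, &a2);
		pthread_join(t1, NULL);
		pthread_join(t2, NULL);

		/*
		 * Every iteration ends by clearing the thread's own bit, so a
		 * correct run must finish with word == 0.  The non-atomic run
		 * (a deliberate data race) usually leaves a stale bit behind,
		 * i.e. a lost update corrupting the adjacent bit.
		 */
		printf("%s: final word = %#lx\n",
		       use_atomic ? "atomic" : "non-atomic",
		       use_atomic ? atomic_load(&aword) : word);
	}
	return 0;
}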
On 12/06/2024 21:13, Yosry Ahmed wrote: > On Wed, Jun 12, 2024 at 01:43:35PM +0100, Usama Arif wrote: > [..] > > Hi Usama, > > A few more comments/questions, sorry for not looking closely earlier. No worries, thanks for the reviews! >> diff --git a/mm/swapfile.c b/mm/swapfile.c >> index f1e559e216bd..48d8dca0b94b 100644 >> --- a/mm/swapfile.c >> +++ b/mm/swapfile.c >> @@ -453,6 +453,8 @@ static unsigned int cluster_list_del_first(struct swap_cluster_list *list, >> static void swap_cluster_schedule_discard(struct swap_info_struct *si, >> unsigned int idx) >> { >> + unsigned int i; >> + >> /* >> * If scan_swap_map_slots() can't find a free cluster, it will check >> * si->swap_map directly. To make sure the discarding cluster isn't >> @@ -461,6 +463,13 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, >> */ >> memset(si->swap_map + idx * SWAPFILE_CLUSTER, >> SWAP_MAP_BAD, SWAPFILE_CLUSTER); >> + /* >> + * zeromap can see updates from concurrent swap_writepage() and swap_read_folio() >> + * call on other slots, hence use atomic clear_bit for zeromap instead of the >> + * non-atomic bitmap_clear. >> + */ > I don't think this is accurate. swap_read_folio() does not update the > zeromap. I think the need for an atomic operation here is because we may > be updating adjacent bits simulatenously, so we may cause lost updates > otherwise (i.e. corrupting adjacent bits). Thanks, will change to "Use atomic clear_bit instead of non-atomic bitmap_clear to prevent adjacent bits corruption due to simultaneous writes." in the next revision. > >> + for (i = 0; i < SWAPFILE_CLUSTER; i++) >> + clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); > Could you explain why we need to clear the zeromap here? > > swap_cluster_schedule_discard() is called from: > - swap_free_cluster() -> free_cluster() > > This is already covered below. > > - swap_entry_free() -> dec_cluster_info_page() -> free_cluster() > > Each entry in the cluster should have its zeromap bit cleared in > swap_entry_free() before the entire cluster is free and we call > free_cluster(). > > Am I missing something? Yes, it looks like this one is not needed as swap_entry_free and swap_free_cluster would already have cleared the bit. Will remove it. I had initially started checking in which code paths the zeromap would need to be cleared. But then I thought I could do it wherever si->swap_map is cleared or set to SWAP_MAP_BAD, which is why I added it here. >> >> cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); >> >> @@ -482,7 +491,7 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) >> static void swap_do_scheduled_discard(struct swap_info_struct *si) >> { >> struct swap_cluster_info *info, *ci; >> - unsigned int idx; >> + unsigned int idx, i; >> >> info = si->cluster_info; >> >> @@ -498,6 +507,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) >> __free_cluster(si, idx); >> memset(si->swap_map + idx * SWAPFILE_CLUSTER, >> 0, SWAPFILE_CLUSTER); >> + for (i = 0; i < SWAPFILE_CLUSTER; i++) >> + clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); > Same here. I didn't look into the specific code paths, but shouldn't the > cluster be unused (and hence its zeromap bits already cleared?). > I think this one is needed (or at least very good to have). There are 2 paths: 1) swap_cluster_schedule_discard (clears zeromap) -> swap_discard_work -> swap_do_scheduled_discard (clears zeromap) Path 1 doesn't need it as swap_cluster_schedule_discard already clears it.
2) scan_swap_map_slots -> scan_swap_map_try_ssd_cluster -> swap_do_scheduled_discard (clears zeromap) Path 2 might need it as the zeromap isn't cleared earlier, I believe (even though I think it might already be 0). Even if it's cleared in path 2, I think it's good to keep this one, as the function is swap_do_scheduled_discard, i.e. in case it gets directly called or si->discard_work gets scheduled anywhere else in the future, it should do as the function name suggests, i.e. swap discard (clear zeromap). >> unlock_cluster(ci); >> } >> } >> @@ -1059,9 +1070,12 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) >> { >> unsigned long offset = idx * SWAPFILE_CLUSTER; >> struct swap_cluster_info *ci; >> + unsigned int i; >> >> ci = lock_cluster(si, offset); >> memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); >> + for (i = 0; i < SWAPFILE_CLUSTER; i++) >> + clear_bit(offset + i, si->zeromap); >> cluster_set_count_flag(ci, 0, 0); >> free_cluster(si, idx); >> unlock_cluster(ci); >> @@ -1336,6 +1350,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) >> count = p->swap_map[offset]; >> VM_BUG_ON(count != SWAP_HAS_CACHE); >> p->swap_map[offset] = 0; >> + clear_bit(offset, p->zeromap); > I think instead of clearing the zeromap in swap_free_cluster() and here > separately, we can just do it in swap_range_free(). I suspect this may > be the only place we really need to clear the zero in the swapfile code. Sure, we could move it to swap_range_free, but then also move the clearing of swap_map. When it comes to clearing the zeromap, I think it's just generally a good idea to clear it wherever swap_map is cleared. So the diff over v4 looks like the below (should address all comments but not remove it from swap_do_scheduled_discard, and move si->swap_map/zeromap clearing from swap_free_cluster/swap_entry_free to swap_range_free): diff --git a/mm/swapfile.c b/mm/swapfile.c index 48d8dca0b94b..39cad0d09525 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -463,13 +463,6 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, */ memset(si->swap_map + idx * SWAPFILE_CLUSTER, SWAP_MAP_BAD, SWAPFILE_CLUSTER); - /* - * zeromap can see updates from concurrent swap_writepage() and swap_read_folio() - * call on other slots, hence use atomic clear_bit for zeromap instead of the - * non-atomic bitmap_clear. - */ - for (i = 0; i < SWAPFILE_CLUSTER; i++) - clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); @@ -758,6 +751,15 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned long begin = offset; unsigned long end = offset + nr_entries - 1; void (*swap_slot_free_notify)(struct block_device *, unsigned long); + unsigned int i; + + memset(si->swap_map + offset, 0, nr_entries); + /* + * Use atomic clear_bit operations only on zeromap instead of non-atomic + * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes.
+ */ + for (i = 0; i < nr_entries; i++) + clear_bit(offset + i, si->zeromap); if (offset < si->lowest_bit) si->lowest_bit = offset; @@ -1070,12 +1072,8 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) { unsigned long offset = idx * SWAPFILE_CLUSTER; struct swap_cluster_info *ci; - unsigned int i; ci = lock_cluster(si, offset); - memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); - for (i = 0; i < SWAPFILE_CLUSTER; i++) - clear_bit(offset + i, si->zeromap); cluster_set_count_flag(ci, 0, 0); free_cluster(si, idx); unlock_cluster(ci); @@ -1349,8 +1347,6 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) ci = lock_cluster(p, offset); count = p->swap_map[offset]; VM_BUG_ON(count != SWAP_HAS_CACHE); - p->swap_map[offset] = 0; - clear_bit(offset, p->zeromap); dec_cluster_info_page(p, p->cluster_info, offset); unlock_cluster(ci);
[..] > > > > >> + for (i = 0; i < SWAPFILE_CLUSTER; i++) > >> + clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); > > Could you explain why we need to clear the zeromap here? > > > > swap_cluster_schedule_discard() is called from: > > - swap_free_cluster() -> free_cluster() > > > > This is already covered below. > > > > - swap_entry_free() -> dec_cluster_info_page() -> free_cluster() > > > > Each entry in the cluster should have its zeromap bit cleared in > > swap_entry_free() before the entire cluster is free and we call > > free_cluster(). > > > > Am I missing something? > > Yes, it looks like this one is not needed as swap_entry_free and > swap_free_cluster would already have cleared the bit. Will remove it. > > I had initially started checking what codepaths zeromap would need to be > cleared. But then thought I could do it wherever si->swap_map is cleared > or set to SWAP_MAP_BAD, which is why I added it here. > > >> > >> cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); > >> > >> @@ -482,7 +491,7 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) > >> static void swap_do_scheduled_discard(struct swap_info_struct *si) > >> { > >> struct swap_cluster_info *info, *ci; > >> - unsigned int idx; > >> + unsigned int idx, i; > >> > >> info = si->cluster_info; > >> > >> @@ -498,6 +507,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) > >> __free_cluster(si, idx); > >> memset(si->swap_map + idx * SWAPFILE_CLUSTER, > >> 0, SWAPFILE_CLUSTER); > >> + for (i = 0; i < SWAPFILE_CLUSTER; i++) > >> + clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); > > Same here. I didn't look into the specific code paths, but shouldn't the > > cluster be unused (and hence its zeromap bits already cleared?). > > > I think this one is needed (or atleast very good to have). There are 2 > paths: > > 1) swap_cluster_schedule_discard (clears zeromap) -> swap_discard_work > -> swap_do_scheduled_discard (clears zeromap) > > Path 1 doesnt need it as swap_cluster_schedule_discard already clears it. > > 2) scan_swap_map_slots -> scan_swap_map_try_ssd_cluster -> > swap_do_scheduled_discard (clears zeromap) > > Path 2 might need it as zeromap isnt cleared earlier I believe > (eventhough I think it might already be 0). Aren't the clusters in the discard list free by definition? It seems like we add a cluster there from swap_cluster_schedule_discard(), which we establish above that it gets called on a free cluster, right? > > Even if its cleared in path 2, I think its good to keep this one, as the > function is swap_do_scheduled_discard, i.e. incase it gets directly > called or si->discard_work gets scheduled anywhere else in the future, > it should do as the function name suggests, i.e. swap discard(clear > zeromap). I think we just set the swap map to SWAP_MAP_BAD in swap_cluster_schedule_discard() and then clear it in swap_do_scheduled_discard(), and the clusters are already freed at that point. Ying could set me straight if I am wrong here. It is confusing to me to keep an unnecessary call tbh, it makes sense to clear zeromap bits once, when the swap entry/cluster is not being used anymore and before it's reallocated. 
> > >> unlock_cluster(ci); > >> } > >> } > >> @@ -1059,9 +1070,12 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) > >> { > >> unsigned long offset = idx * SWAPFILE_CLUSTER; > >> struct swap_cluster_info *ci; > >> + unsigned int i; > >> > >> ci = lock_cluster(si, offset); > >> memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); > >> + for (i = 0; i < SWAPFILE_CLUSTER; i++) > >> + clear_bit(offset + i, si->zeromap); > >> cluster_set_count_flag(ci, 0, 0); > >> free_cluster(si, idx); > >> unlock_cluster(ci); > >> @@ -1336,6 +1350,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) > >> count = p->swap_map[offset]; > >> VM_BUG_ON(count != SWAP_HAS_CACHE); > >> p->swap_map[offset] = 0; > >> + clear_bit(offset, p->zeromap); > > I think instead of clearing the zeromap in swap_free_cluster() and here > > separately, we can just do it in swap_range_free(). I suspect this may > > be the only place we really need to clear the zero in the swapfile code. > > Sure, we could move it to swap_range_free, but then also move the > clearing of swap_map. > > When it comes to clearing zeromap, I think its just generally a good > idea to clear it wherever swap_map is cleared. I am not convinced about this argument. The swap_map is used for multiple reasons beyond just keeping track of whether a swap entry is in-use. The zeromap on the other hand is simpler and just needs to be cleared once when an entry is being freed. Unless others disagree, I prefer to only clear the zeromap once in swap_range_free() and keep the swap_map code as-is for now. If we think there is value in moving clearing the swap_map to swap_range_free(), it should at least be in a separate patch to be evaluated separately. Just my 2c.
On 13/06/2024 17:38, Yosry Ahmed wrote: > [..] >>>> + for (i = 0; i < SWAPFILE_CLUSTER; i++) >>>> + clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); >>> Could you explain why we need to clear the zeromap here? >>> >>> swap_cluster_schedule_discard() is called from: >>> - swap_free_cluster() -> free_cluster() >>> >>> This is already covered below. >>> >>> - swap_entry_free() -> dec_cluster_info_page() -> free_cluster() >>> >>> Each entry in the cluster should have its zeromap bit cleared in >>> swap_entry_free() before the entire cluster is free and we call >>> free_cluster(). >>> >>> Am I missing something? >> Yes, it looks like this one is not needed as swap_entry_free and >> swap_free_cluster would already have cleared the bit. Will remove it. >> >> I had initially started checking what codepaths zeromap would need to be >> cleared. But then thought I could do it wherever si->swap_map is cleared >> or set to SWAP_MAP_BAD, which is why I added it here. >> >>>> cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); >>>> >>>> @@ -482,7 +491,7 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) >>>> static void swap_do_scheduled_discard(struct swap_info_struct *si) >>>> { >>>> struct swap_cluster_info *info, *ci; >>>> - unsigned int idx; >>>> + unsigned int idx, i; >>>> >>>> info = si->cluster_info; >>>> >>>> @@ -498,6 +507,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) >>>> __free_cluster(si, idx); >>>> memset(si->swap_map + idx * SWAPFILE_CLUSTER, >>>> 0, SWAPFILE_CLUSTER); >>>> + for (i = 0; i < SWAPFILE_CLUSTER; i++) >>>> + clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); >>> Same here. I didn't look into the specific code paths, but shouldn't the >>> cluster be unused (and hence its zeromap bits already cleared?). >>> >> I think this one is needed (or atleast very good to have). There are 2 >> paths: >> >> 1) swap_cluster_schedule_discard (clears zeromap) -> swap_discard_work >> -> swap_do_scheduled_discard (clears zeromap) >> >> Path 1 doesnt need it as swap_cluster_schedule_discard already clears it. >> >> 2) scan_swap_map_slots -> scan_swap_map_try_ssd_cluster -> >> swap_do_scheduled_discard (clears zeromap) >> >> Path 2 might need it as zeromap isnt cleared earlier I believe >> (eventhough I think it might already be 0). > Aren't the clusters in the discard list free by definition? It seems > like we add a cluster there from swap_cluster_schedule_discard(), > which we establish above that it gets called on a free cluster, right? You mean for path 2? Its not from swap_cluster_schedule_discard. The whole call path is get_swap_pages -> scan_swap_map_slots -> scan_swap_map_try_ssd_cluster -> swap_do_scheduled_discard. Nowhere up until swap_do_scheduled_discard was the zeromap cleared, which is why I think we should add it here. >> Even if its cleared in path 2, I think its good to keep this one, as the >> function is swap_do_scheduled_discard, i.e. incase it gets directly >> called or si->discard_work gets scheduled anywhere else in the future, >> it should do as the function name suggests, i.e. swap discard(clear >> zeromap). > I think we just set the swap map to SWAP_MAP_BAD in > swap_cluster_schedule_discard() and then clear it in > swap_do_scheduled_discard(), and the clusters are already freed at > that point. Ying could set me straight if I am wrong here. I think you might be mixing up path 1 and path 2 above? 
swap_cluster_schedule_discard is not called in Path 2 where swap_do_scheduled_discard ends up being called, which is why I think we would need to clear the zeromap here. > It is confusing to me to keep an unnecessary call tbh, it makes sense > to clear zeromap bits once, when the swap entry/cluster is not being > used anymore and before it's reallocated. > >>>> unlock_cluster(ci); >>>> } >>>> } >>>> @@ -1059,9 +1070,12 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) >>>> { >>>> unsigned long offset = idx * SWAPFILE_CLUSTER; >>>> struct swap_cluster_info *ci; >>>> + unsigned int i; >>>> >>>> ci = lock_cluster(si, offset); >>>> memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); >>>> + for (i = 0; i < SWAPFILE_CLUSTER; i++) >>>> + clear_bit(offset + i, si->zeromap); >>>> cluster_set_count_flag(ci, 0, 0); >>>> free_cluster(si, idx); >>>> unlock_cluster(ci); >>>> @@ -1336,6 +1350,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) >>>> count = p->swap_map[offset]; >>>> VM_BUG_ON(count != SWAP_HAS_CACHE); >>>> p->swap_map[offset] = 0; >>>> + clear_bit(offset, p->zeromap); >>> I think instead of clearing the zeromap in swap_free_cluster() and here >>> separately, we can just do it in swap_range_free(). I suspect this may >>> be the only place we really need to clear the zero in the swapfile code. >> Sure, we could move it to swap_range_free, but then also move the >> clearing of swap_map. >> >> When it comes to clearing zeromap, I think its just generally a good >> idea to clear it wherever swap_map is cleared. > I am not convinced about this argument. The swap_map is used for > multiple reasons beyond just keeping track of whether a swap entry is > in-use. The zeromap on the other hand is simpler and just needs to be > cleared once when an entry is being freed. > > Unless others disagree, I prefer to only clear the zeromap once in > swap_range_free() and keep the swap_map code as-is for now. If we > think there is value in moving clearing the swap_map to > swap_range_free(), it should at least be in a separate patch to be > evaluated separately. > > Just my 2c. Sure, I am indifferent to this. I dont think it makes a difference if the zeromap is cleared in swap_free_cluster + swap_entry_free or later on in a common swap_range_free function, so will just move it in the next revision. Wont move swap_map clearing code.
[..] > >>>> @@ -498,6 +507,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) > >>>> __free_cluster(si, idx); > >>>> memset(si->swap_map + idx * SWAPFILE_CLUSTER, > >>>> 0, SWAPFILE_CLUSTER); > >>>> + for (i = 0; i < SWAPFILE_CLUSTER; i++) > >>>> + clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); > >>> Same here. I didn't look into the specific code paths, but shouldn't the > >>> cluster be unused (and hence its zeromap bits already cleared?). > >>> > >> I think this one is needed (or atleast very good to have). There are 2 > >> paths: > >> > >> 1) swap_cluster_schedule_discard (clears zeromap) -> swap_discard_work > >> -> swap_do_scheduled_discard (clears zeromap) > >> > >> Path 1 doesnt need it as swap_cluster_schedule_discard already clears it. > >> > >> 2) scan_swap_map_slots -> scan_swap_map_try_ssd_cluster -> > >> swap_do_scheduled_discard (clears zeromap) > >> > >> Path 2 might need it as zeromap isnt cleared earlier I believe > >> (eventhough I think it might already be 0). > > Aren't the clusters in the discard list free by definition? It seems > > like we add a cluster there from swap_cluster_schedule_discard(), > > which we establish above that it gets called on a free cluster, right? > > You mean for path 2? Its not from swap_cluster_schedule_discard. The > whole call path is > > get_swap_pages -> scan_swap_map_slots -> scan_swap_map_try_ssd_cluster > -> swap_do_scheduled_discard. Nowhere up until swap_do_scheduled_discard > was the zeromap cleared, which is why I think we should add it here. swap_do_scheduled_discard() iterates over clusters from si->discard_clusters. Clusters are added to that list from swap_cluster_schedule_discard(). IOW, swap_cluster_schedule_discard() schedules freed clusters to be discarded, and swap_do_scheduled_discard() later does the actual discarding, whether it's through si->discard_work scheduled by swap_cluster_schedule_discard(), or when looking for a free cluster through scan_swap_map_try_ssd_cluster(). Did I miss anything?
On 13/06/2024 20:26, Yosry Ahmed wrote: > [..] >>>>>> @@ -498,6 +507,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) >>>>>> __free_cluster(si, idx); >>>>>> memset(si->swap_map + idx * SWAPFILE_CLUSTER, >>>>>> 0, SWAPFILE_CLUSTER); >>>>>> + for (i = 0; i < SWAPFILE_CLUSTER; i++) >>>>>> + clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); >>>>> Same here. I didn't look into the specific code paths, but shouldn't the >>>>> cluster be unused (and hence its zeromap bits already cleared?). >>>>> >>>> I think this one is needed (or atleast very good to have). There are 2 >>>> paths: >>>> >>>> 1) swap_cluster_schedule_discard (clears zeromap) -> swap_discard_work >>>> -> swap_do_scheduled_discard (clears zeromap) >>>> >>>> Path 1 doesnt need it as swap_cluster_schedule_discard already clears it. >>>> >>>> 2) scan_swap_map_slots -> scan_swap_map_try_ssd_cluster -> >>>> swap_do_scheduled_discard (clears zeromap) >>>> >>>> Path 2 might need it as zeromap isnt cleared earlier I believe >>>> (eventhough I think it might already be 0). >>> Aren't the clusters in the discard list free by definition? It seems >>> like we add a cluster there from swap_cluster_schedule_discard(), >>> which we establish above that it gets called on a free cluster, right? >> You mean for path 2? Its not from swap_cluster_schedule_discard. The >> whole call path is >> >> get_swap_pages -> scan_swap_map_slots -> scan_swap_map_try_ssd_cluster >> -> swap_do_scheduled_discard. Nowhere up until swap_do_scheduled_discard >> was the zeromap cleared, which is why I think we should add it here. > swap_do_scheduled_discard() iterates over clusters from > si->discard_clusters. Clusters are added to that list from > swap_cluster_schedule_discard(). > > IOW, swap_cluster_schedule_discard() schedules freed clusters to be > discarded, and swap_do_scheduled_discard() later does the actual > discarding, whether it's through si->discard_work scheduled by > swap_cluster_schedule_discard(), or when looking for a free cluster > through scan_swap_map_try_ssd_cluster(). > > Did I miss anything? Ah ok, and the schedule_discard in free_cluster won't be called/scheduled before swap_range_free. Will only keep the one in swap_range_free. Thanks!
On Thu, Jun 13, 2024 at 12:48 AM Usama Arif <usamaarif642@gmail.com> wrote: > > Approximately 10-20% of pages to be swapped out are zero pages [1]. > Rather than reading/writing these pages to flash resulting > in increased I/O and flash wear, a bitmap can be used to mark these > pages as zero at write time, and the pages can be filled at > read time if the bit corresponding to the page is set. > With this patch, NVMe writes in Meta server fleet decreased > by almost 10% with conventional swap setup (zswap disabled). > > [1] https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/ > > Signed-off-by: Usama Arif <usamaarif642@gmail.com> > --- > include/linux/swap.h | 1 + > mm/page_io.c | 114 ++++++++++++++++++++++++++++++++++++++++++- > mm/swapfile.c | 24 ++++++++- > 3 files changed, 136 insertions(+), 3 deletions(-) > > diff --git a/include/linux/swap.h b/include/linux/swap.h > index a11c75e897ec..e88563978441 100644 > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -299,6 +299,7 @@ struct swap_info_struct { > signed char type; /* strange name for an index */ > unsigned int max; /* extent of the swap_map */ > unsigned char *swap_map; /* vmalloc'ed array of usage counts */ > + unsigned long *zeromap; /* vmalloc'ed bitmap to track zero pages */ > struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ > struct swap_cluster_list free_clusters; /* free clusters list */ > unsigned int lowest_bit; /* index of first free in swap_map */ > diff --git a/mm/page_io.c b/mm/page_io.c > index a360857cf75d..39fc3919ce15 100644 > --- a/mm/page_io.c > +++ b/mm/page_io.c > @@ -172,6 +172,88 @@ int generic_swapfile_activate(struct swap_info_struct *sis, > goto out; > } > > +static bool is_folio_page_zero_filled(struct folio *folio, int i) > +{ > + unsigned long *data; > + unsigned int pos, last_pos = PAGE_SIZE / sizeof(*data) - 1; > + bool ret = false; > + > + data = kmap_local_folio(folio, i * PAGE_SIZE); > + if (data[last_pos]) > + goto out; > + for (pos = 0; pos < PAGE_SIZE / sizeof(*data); pos++) { > + if (data[pos]) > + goto out; > + } > + ret = true; > +out: > + kunmap_local(data); > + return ret; > +} > + > +static bool is_folio_zero_filled(struct folio *folio) > +{ > + unsigned int i; > + > + for (i = 0; i < folio_nr_pages(folio); i++) { > + if (!is_folio_page_zero_filled(folio, i)) > + return false; > + } > + return true; > +} > + > +static void folio_zero_fill(struct folio *folio) > +{ > + unsigned int i; > + > + for (i = 0; i < folio_nr_pages(folio); i++) > + clear_highpage(folio_page(folio, i)); > +} > + > +static void swap_zeromap_folio_set(struct folio *folio) > +{ > + struct swap_info_struct *sis = swp_swap_info(folio->swap); > + swp_entry_t entry; > + unsigned int i; > + > + for (i = 0; i < folio_nr_pages(folio); i++) { > + entry = page_swap_entry(folio_page(folio, i)); > + set_bit(swp_offset(entry), sis->zeromap); > + } > +} > + > +static void swap_zeromap_folio_clear(struct folio *folio) > +{ > + struct swap_info_struct *sis = swp_swap_info(folio->swap); > + swp_entry_t entry; > + unsigned int i; > + > + for (i = 0; i < folio_nr_pages(folio); i++) { > + entry = page_swap_entry(folio_page(folio, i)); > + clear_bit(swp_offset(entry), sis->zeromap); > + } > +} > + > +/* > + * Return the index of the first subpage which is not zero-filled > + * according to swap_info_struct->zeromap. > + * If all pages are zero-filled according to zeromap, it will return > + * folio_nr_pages(folio). 
> + */ > +static unsigned int swap_zeromap_folio_test(struct folio *folio) > +{ > + struct swap_info_struct *sis = swp_swap_info(folio->swap); > + swp_entry_t entry; > + unsigned int i; > + > + for (i = 0; i < folio_nr_pages(folio); i++) { > + entry = page_swap_entry(folio_page(folio, i)); > + if (!test_bit(swp_offset(entry), sis->zeromap)) > + return i; > + } > + return i; > +} > + > /* > * We may have stale swap cache pages in memory: notice > * them here and get rid of the unnecessary final write. > @@ -195,6 +277,13 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) > folio_unlock(folio); > return ret; > } > + > + if (is_folio_zero_filled(folio)) { > + swap_zeromap_folio_set(folio); > + folio_unlock(folio); > + return 0; > + } > + swap_zeromap_folio_clear(folio); > if (zswap_store(folio)) { > folio_start_writeback(folio); > folio_unlock(folio); > @@ -426,6 +515,26 @@ static void sio_read_complete(struct kiocb *iocb, long ret) > mempool_free(sio, sio_pool); > } > > +static bool swap_read_folio_zeromap(struct folio *folio) > +{ > + unsigned int idx = swap_zeromap_folio_test(folio); > + > + if (idx == 0) > + return false; > + > + /* > + * Swapping in a large folio that is partially in the zeromap is not > + * currently handled. Return true without marking the folio uptodate so > + * that an IO error is emitted (e.g. do_swap_page() will sigbus). > + */ > + if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) > + return true; Hi Usama, Yosry, I feel the warning is wrong as we could have the case where idx==0 is not zeromap but idx=1 is zeromap. idx == 0 doesn't necessarily mean we should return false. What about the below change which both fixes the warning and unblocks large folios swap-in? diff --git a/mm/page_io.c b/mm/page_io.c index 4bc77d1c6bfa..7d7ff7064e2b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -226,26 +226,6 @@ static void swap_zeromap_folio_clear(struct folio *folio) } } -/* - * Return the index of the first subpage which is not zero-filled - * according to swap_info_struct->zeromap. - * If all pages are zero-filled according to zeromap, it will return - * folio_nr_pages(folio). - */ -static unsigned int swap_zeromap_folio_test(struct folio *folio) -{ - struct swap_info_struct *sis = swp_swap_info(folio->swap); - swp_entry_t entry; - unsigned int i; - - for (i = 0; i < folio_nr_pages(folio); i++) { - entry = page_swap_entry(folio_page(folio, i)); - if (!test_bit(swp_offset(entry), sis->zeromap)) - return i; - } - return i; -} - /* * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. @@ -524,9 +504,10 @@ static void sio_read_complete(struct kiocb *iocb, long ret) static bool swap_read_folio_zeromap(struct folio *folio) { - unsigned int idx = swap_zeromap_folio_test(folio); + unsigned int nr_pages = folio_nr_pages(folio); + unsigned int nr = swap_zeromap_entries_count(folio->swap, nr_pages); - if (idx == 0) + if (nr == 0) return false; /* @@ -534,7 +515,7 @@ static bool swap_read_folio_zeromap(struct folio *folio) * currently handled. Return true without marking the folio uptodate so * that an IO error is emitted (e.g. do_swap_page() will sigbus). 
*/ - if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) + if (WARN_ON_ONCE(nr < nr_pages)) return true; folio_zero_range(folio, 0, folio_size(folio)); diff --git a/mm/swap.h b/mm/swap.h index f8711ff82f84..2d59e9d89e95 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -80,6 +80,32 @@ static inline unsigned int folio_swap_flags(struct folio *folio) { return swp_swap_info(folio->swap)->flags; } + +/* + * Return the number of entries which are zero-filled according to + * swap_info_struct->zeromap. It isn't precise if the return value + * is larger than 0 and smaller than nr to avoid extra iterations, + * In this case, it means entries haven't consistent zeromap. + */ +static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) +{ + struct swap_info_struct *sis = swp_swap_info(entry); + unsigned long offset = swp_offset(entry); + unsigned int type = swp_type(entry); + unsigned int n = 0; + + for (int i = 0; i < nr; i++) { + entry = swp_entry(type, offset + i); + if (test_bit(offset + i, sis->zeromap)) { + if (i != n) + return i; + n++; + } + } + + return n; +} + #else /* CONFIG_SWAP */ struct swap_iocb; static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) @@ -171,6 +197,11 @@ static inline unsigned int folio_swap_flags(struct folio *folio) { return 0; } + +static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) +{ + return 0; +} #endif /* CONFIG_SWAP */ #endif /* _MM_SWAP_H */ > + > + folio_zero_fill(folio); > + folio_mark_uptodate(folio); > + return true; > +} > + > static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) > { > struct swap_info_struct *sis = swp_swap_info(folio->swap); > @@ -515,8 +624,9 @@ void swap_read_folio(struct folio *folio, bool synchronous, > psi_memstall_enter(&pflags); > } > delayacct_swapin_start(); > - > - if (zswap_load(folio)) { > + if (swap_read_folio_zeromap(folio)) { > + folio_unlock(folio); > + } else if (zswap_load(folio)) { > folio_mark_uptodate(folio); > folio_unlock(folio); > } else if (data_race(sis->flags & SWP_FS_OPS)) { > diff --git a/mm/swapfile.c b/mm/swapfile.c > index f1e559e216bd..48d8dca0b94b 100644 > --- a/mm/swapfile.c > +++ b/mm/swapfile.c > @@ -453,6 +453,8 @@ static unsigned int cluster_list_del_first(struct swap_cluster_list *list, > static void swap_cluster_schedule_discard(struct swap_info_struct *si, > unsigned int idx) > { > + unsigned int i; > + > /* > * If scan_swap_map_slots() can't find a free cluster, it will check > * si->swap_map directly. To make sure the discarding cluster isn't > @@ -461,6 +463,13 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, > */ > memset(si->swap_map + idx * SWAPFILE_CLUSTER, > SWAP_MAP_BAD, SWAPFILE_CLUSTER); > + /* > + * zeromap can see updates from concurrent swap_writepage() and swap_read_folio() > + * call on other slots, hence use atomic clear_bit for zeromap instead of the > + * non-atomic bitmap_clear. 
> + */ > + for (i = 0; i < SWAPFILE_CLUSTER; i++) > + clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); > > cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); > > @@ -482,7 +491,7 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) > static void swap_do_scheduled_discard(struct swap_info_struct *si) > { > struct swap_cluster_info *info, *ci; > - unsigned int idx; > + unsigned int idx, i; > > info = si->cluster_info; > > @@ -498,6 +507,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) > __free_cluster(si, idx); > memset(si->swap_map + idx * SWAPFILE_CLUSTER, > 0, SWAPFILE_CLUSTER); > + for (i = 0; i < SWAPFILE_CLUSTER; i++) > + clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); > unlock_cluster(ci); > } > } > @@ -1059,9 +1070,12 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) > { > unsigned long offset = idx * SWAPFILE_CLUSTER; > struct swap_cluster_info *ci; > + unsigned int i; > > ci = lock_cluster(si, offset); > memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); > + for (i = 0; i < SWAPFILE_CLUSTER; i++) > + clear_bit(offset + i, si->zeromap); > cluster_set_count_flag(ci, 0, 0); > free_cluster(si, idx); > unlock_cluster(ci); > @@ -1336,6 +1350,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) > count = p->swap_map[offset]; > VM_BUG_ON(count != SWAP_HAS_CACHE); > p->swap_map[offset] = 0; > + clear_bit(offset, p->zeromap); > dec_cluster_info_page(p, p->cluster_info, offset); > unlock_cluster(ci); > > @@ -2597,6 +2612,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) > free_percpu(p->cluster_next_cpu); > p->cluster_next_cpu = NULL; > vfree(swap_map); > + bitmap_free(p->zeromap); > kvfree(cluster_info); > /* Destroy swap account information */ > swap_cgroup_swapoff(p->type); > @@ -3123,6 +3139,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) > goto bad_swap_unlock_inode; > } > > + p->zeromap = bitmap_zalloc(maxpages, GFP_KERNEL); > + if (!p->zeromap) { > + error = -ENOMEM; > + goto bad_swap_unlock_inode; > + } > + > if (p->bdev && bdev_stable_writes(p->bdev)) > p->flags |= SWP_STABLE_WRITES; > > -- > 2.43.0 > >
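To make the failure mode Barry points out concrete, here is a small userspace model of the existing check. It uses a plain bitmap and hypothetical names instead of swp_entry_t/swap_info_struct; first_non_zero_idx() only mirrors the logic of swap_zeromap_folio_test() from the patch above, and the "idx == 0" shortcut mirrors swap_read_folio_zeromap(). This is an illustration, not the kernel code itself.

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

static int test_bit(unsigned long nr, const unsigned long *map)
{
	return (map[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

/* Index of the first page whose zeromap bit is NOT set (the old helper). */
static unsigned int first_non_zero_idx(const unsigned long *zeromap,
				       unsigned long offset, unsigned int nr)
{
	unsigned int i;

	for (i = 0; i < nr; i++)
		if (!test_bit(offset + i, zeromap))
			return i;
	return i;
}

int main(void)
{
	/*
	 * 4-page folio at offset 0: slot 0 was really written to disk,
	 * slots 1-3 were zero-filled and never written (bits 0b1110).
	 */
	unsigned long zeromap[1] = { 0xe };
	unsigned int nr = 4;
	unsigned int idx = first_non_zero_idx(zeromap, 0, nr);

	/*
	 * idx == 0 here, so the old check concludes "not in the zeromap"
	 * and falls through to reading the whole folio from the backing
	 * device -- but slots 1-3 hold stale data on disk.
	 */
	if (idx == 0)
		printf("old check: read all %u pages from disk (wrong for slots 1-3)\n", nr);
	else if (idx < nr)
		printf("old check: mixed folio detected\n");
	else
		printf("old check: whole folio zero-filled\n");

	return 0;
}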
[..] > > @@ -426,6 +515,26 @@ static void sio_read_complete(struct kiocb *iocb, long ret) > > mempool_free(sio, sio_pool); > > } > > > > +static bool swap_read_folio_zeromap(struct folio *folio) > > +{ > > + unsigned int idx = swap_zeromap_folio_test(folio); > > + > > + if (idx == 0) > > + return false; > > + > > + /* > > + * Swapping in a large folio that is partially in the zeromap is not > > + * currently handled. Return true without marking the folio uptodate so > > + * that an IO error is emitted (e.g. do_swap_page() will sigbus). > > + */ > > + if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) > > + return true; > > Hi Usama, Yosry, > > I feel the warning is wrong as we could have the case where idx==0 > is not zeromap but idx=1 is zeromap. idx == 0 doesn't necessarily > mean we should return false. Good catch. Yeah if idx == 0 is not in the zeromap but other indices are we will mistakenly read the entire folio from swap. > > What about the below change which both fixes the warning and unblocks > large folios swap-in? But I don't see how that unblocks the large folios swap-in work? We still need to actually handle the case where a large folio being swapped in is partially in the zeromap. Right now we warn and unlock the folio without calling folio_mark_uptodate(), which emits an IO error. > > diff --git a/mm/page_io.c b/mm/page_io.c > index 4bc77d1c6bfa..7d7ff7064e2b 100644 > --- a/mm/page_io.c > +++ b/mm/page_io.c > @@ -226,26 +226,6 @@ static void swap_zeromap_folio_clear(struct folio *folio) > } > } > > -/* > - * Return the index of the first subpage which is not zero-filled > - * according to swap_info_struct->zeromap. > - * If all pages are zero-filled according to zeromap, it will return > - * folio_nr_pages(folio). > - */ > -static unsigned int swap_zeromap_folio_test(struct folio *folio) > -{ > - struct swap_info_struct *sis = swp_swap_info(folio->swap); > - swp_entry_t entry; > - unsigned int i; > - > - for (i = 0; i < folio_nr_pages(folio); i++) { > - entry = page_swap_entry(folio_page(folio, i)); > - if (!test_bit(swp_offset(entry), sis->zeromap)) > - return i; > - } > - return i; > -} > - > /* > * We may have stale swap cache pages in memory: notice > * them here and get rid of the unnecessary final write. > @@ -524,9 +504,10 @@ static void sio_read_complete(struct kiocb *iocb, long ret) > > static bool swap_read_folio_zeromap(struct folio *folio) > { > - unsigned int idx = swap_zeromap_folio_test(folio); > + unsigned int nr_pages = folio_nr_pages(folio); > + unsigned int nr = swap_zeromap_entries_count(folio->swap, nr_pages); > > - if (idx == 0) > + if (nr == 0) > return false; > > /* > @@ -534,7 +515,7 @@ static bool swap_read_folio_zeromap(struct folio *folio) > * currently handled. Return true without marking the folio uptodate so > * that an IO error is emitted (e.g. do_swap_page() will sigbus). > */ > - if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) > + if (WARN_ON_ONCE(nr < nr_pages)) > return true; > > folio_zero_range(folio, 0, folio_size(folio)); > diff --git a/mm/swap.h b/mm/swap.h > index f8711ff82f84..2d59e9d89e95 100644 > --- a/mm/swap.h > +++ b/mm/swap.h > @@ -80,6 +80,32 @@ static inline unsigned int folio_swap_flags(struct folio *folio) > { > return swp_swap_info(folio->swap)->flags; > } > + > +/* > + * Return the number of entries which are zero-filled according to > + * swap_info_struct->zeromap. 
It isn't precise if the return value > + * is larger than 0 and smaller than nr to avoid extra iterations, > + * In this case, it means entries haven't consistent zeromap. > + */ > +static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) > +{ > + struct swap_info_struct *sis = swp_swap_info(entry); > + unsigned long offset = swp_offset(entry); > + unsigned int type = swp_type(entry); > + unsigned int n = 0; > + > + for (int i = 0; i < nr; i++) { > + entry = swp_entry(type, offset + i); > + if (test_bit(offset + i, sis->zeromap)) { > + if (i != n) > + return i; > + n++; > + } > + } > + > + return n; > +} > + > #else /* CONFIG_SWAP */ > struct swap_iocb; > static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) > @@ -171,6 +197,11 @@ static inline unsigned int folio_swap_flags(struct folio *folio) > { > return 0; > } > + > +static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) > +{ > + return 0; > +} > #endif /* CONFIG_SWAP */ > > #endif /* _MM_SWAP_H */ > > > + > > + folio_zero_fill(folio); > > + folio_mark_uptodate(folio); > > + return true; > > +} > > + > > static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) > > { > > struct swap_info_struct *sis = swp_swap_info(folio->swap); > > @@ -515,8 +624,9 @@ void swap_read_folio(struct folio *folio, bool synchronous, > > psi_memstall_enter(&pflags); > > } > > delayacct_swapin_start(); > > - > > - if (zswap_load(folio)) { > > + if (swap_read_folio_zeromap(folio)) { > > + folio_unlock(folio); > > + } else if (zswap_load(folio)) { > > folio_mark_uptodate(folio); > > folio_unlock(folio); > > } else if (data_race(sis->flags & SWP_FS_OPS)) { > > diff --git a/mm/swapfile.c b/mm/swapfile.c > > index f1e559e216bd..48d8dca0b94b 100644 > > --- a/mm/swapfile.c > > +++ b/mm/swapfile.c > > @@ -453,6 +453,8 @@ static unsigned int cluster_list_del_first(struct swap_cluster_list *list, > > static void swap_cluster_schedule_discard(struct swap_info_struct *si, > > unsigned int idx) > > { > > + unsigned int i; > > + > > /* > > * If scan_swap_map_slots() can't find a free cluster, it will check > > * si->swap_map directly. To make sure the discarding cluster isn't > > @@ -461,6 +463,13 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, > > */ > > memset(si->swap_map + idx * SWAPFILE_CLUSTER, > > SWAP_MAP_BAD, SWAPFILE_CLUSTER); > > + /* > > + * zeromap can see updates from concurrent swap_writepage() and swap_read_folio() > > + * call on other slots, hence use atomic clear_bit for zeromap instead of the > > + * non-atomic bitmap_clear. 
> > + */ > > + for (i = 0; i < SWAPFILE_CLUSTER; i++) > > + clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); > > > > cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); > > > > @@ -482,7 +491,7 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) > > static void swap_do_scheduled_discard(struct swap_info_struct *si) > > { > > struct swap_cluster_info *info, *ci; > > - unsigned int idx; > > + unsigned int idx, i; > > > > info = si->cluster_info; > > > > @@ -498,6 +507,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) > > __free_cluster(si, idx); > > memset(si->swap_map + idx * SWAPFILE_CLUSTER, > > 0, SWAPFILE_CLUSTER); > > + for (i = 0; i < SWAPFILE_CLUSTER; i++) > > + clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); > > unlock_cluster(ci); > > } > > } > > @@ -1059,9 +1070,12 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) > > { > > unsigned long offset = idx * SWAPFILE_CLUSTER; > > struct swap_cluster_info *ci; > > + unsigned int i; > > > > ci = lock_cluster(si, offset); > > memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); > > + for (i = 0; i < SWAPFILE_CLUSTER; i++) > > + clear_bit(offset + i, si->zeromap); > > cluster_set_count_flag(ci, 0, 0); > > free_cluster(si, idx); > > unlock_cluster(ci); > > @@ -1336,6 +1350,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) > > count = p->swap_map[offset]; > > VM_BUG_ON(count != SWAP_HAS_CACHE); > > p->swap_map[offset] = 0; > > + clear_bit(offset, p->zeromap); > > dec_cluster_info_page(p, p->cluster_info, offset); > > unlock_cluster(ci); > > > > @@ -2597,6 +2612,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) > > free_percpu(p->cluster_next_cpu); > > p->cluster_next_cpu = NULL; > > vfree(swap_map); > > + bitmap_free(p->zeromap); > > kvfree(cluster_info); > > /* Destroy swap account information */ > > swap_cgroup_swapoff(p->type); > > @@ -3123,6 +3139,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) > > goto bad_swap_unlock_inode; > > } > > > > + p->zeromap = bitmap_zalloc(maxpages, GFP_KERNEL); > > + if (!p->zeromap) { > > + error = -ENOMEM; > > + goto bad_swap_unlock_inode; > > + } > > + > > if (p->bdev && bdev_stable_writes(p->bdev)) > > p->flags |= SWP_STABLE_WRITES; > > > > -- > > 2.43.0 > > > >
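For reference, a simplified userspace model of the contract of the helper Barry proposes: it returns 0 when no entry in the range has its zeromap bit set, nr when all of them do, and some intermediate value (not necessarily the exact count) for a mixed range, since the caller only needs to distinguish "none", "all" and "mixed". It operates on a plain bitmap rather than swp_entry_t, and all names are hypothetical.

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

static int test_bit(unsigned long nr, const unsigned long *map)
{
	return (map[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

/* Mirrors the loop structure of the proposed swap_zeromap_entries_count(). */
static unsigned int zeromap_entries_count(const unsigned long *zeromap,
					  unsigned long offset, unsigned int nr)
{
	unsigned int n = 0;

	for (unsigned int i = 0; i < nr; i++) {
		if (test_bit(offset + i, zeromap)) {
			if (i != n)       /* a clear bit preceded a set bit */
				return i; /* mixed range: bail out early   */
			n++;
		}
	}
	return n;
}

int main(void)
{
	unsigned long zeromap[1] = { 0 };

	/* No bits set: returns 0 -> read the folio from swap as usual. */
	printf("none : %u\n", zeromap_entries_count(zeromap, 0, 4));

	/* All four bits set: returns nr -> fill the whole folio with zeros. */
	zeromap[0] = 0xf;
	printf("all  : %u\n", zeromap_entries_count(zeromap, 0, 4));

	/* Mixed (bits 1 and 3 set): returns a value strictly between 0 and nr. */
	zeromap[0] = 0xa;
	printf("mixed: %u\n", zeromap_entries_count(zeromap, 0, 4));

	return 0;
}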
On Wed, Sep 4, 2024 at 7:12 PM Yosry Ahmed <yosryahmed@google.com> wrote: > > [..] > > > @@ -426,6 +515,26 @@ static void sio_read_complete(struct kiocb *iocb, long ret) > > > mempool_free(sio, sio_pool); > > > } > > > > > > +static bool swap_read_folio_zeromap(struct folio *folio) > > > +{ > > > + unsigned int idx = swap_zeromap_folio_test(folio); > > > + > > > + if (idx == 0) > > > + return false; > > > + > > > + /* > > > + * Swapping in a large folio that is partially in the zeromap is not > > > + * currently handled. Return true without marking the folio uptodate so > > > + * that an IO error is emitted (e.g. do_swap_page() will sigbus). > > > + */ > > > + if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) > > > + return true; > > > > Hi Usama, Yosry, > > > > I feel the warning is wrong as we could have the case where idx==0 > > is not zeromap but idx=1 is zeromap. idx == 0 doesn't necessarily > > mean we should return false. > > Good catch. Yeah if idx == 0 is not in the zeromap but other indices > are we will mistakenly read the entire folio from swap. > > > > > What about the below change which both fixes the warning and unblocks > > large folios swap-in? > > But I don't see how that unblocks the large folios swap-in work? We > still need to actually handle the case where a large folio being > swapped in is partially in the zeromap. Right now we warn and unlock > the folio without calling folio_mark_uptodate(), which emits an IO > error. I placed this in mm/swap.h so that during swap-in, it can filter out any large folios where swap_zeromap_entries_count() is greater than 0 and less than nr. I believe this case would be quite rare, as it can only occur when some small folios that are swapped out happen to have contiguous and aligned swap slots. > > > > > diff --git a/mm/page_io.c b/mm/page_io.c > > index 4bc77d1c6bfa..7d7ff7064e2b 100644 > > --- a/mm/page_io.c > > +++ b/mm/page_io.c > > @@ -226,26 +226,6 @@ static void swap_zeromap_folio_clear(struct folio *folio) > > } > > } > > > > -/* > > - * Return the index of the first subpage which is not zero-filled > > - * according to swap_info_struct->zeromap. > > - * If all pages are zero-filled according to zeromap, it will return > > - * folio_nr_pages(folio). > > - */ > > -static unsigned int swap_zeromap_folio_test(struct folio *folio) > > -{ > > - struct swap_info_struct *sis = swp_swap_info(folio->swap); > > - swp_entry_t entry; > > - unsigned int i; > > - > > - for (i = 0; i < folio_nr_pages(folio); i++) { > > - entry = page_swap_entry(folio_page(folio, i)); > > - if (!test_bit(swp_offset(entry), sis->zeromap)) > > - return i; > > - } > > - return i; > > -} > > - > > /* > > * We may have stale swap cache pages in memory: notice > > * them here and get rid of the unnecessary final write. > > @@ -524,9 +504,10 @@ static void sio_read_complete(struct kiocb *iocb, long ret) > > > > static bool swap_read_folio_zeromap(struct folio *folio) > > { > > - unsigned int idx = swap_zeromap_folio_test(folio); > > + unsigned int nr_pages = folio_nr_pages(folio); > > + unsigned int nr = swap_zeromap_entries_count(folio->swap, nr_pages); > > > > - if (idx == 0) > > + if (nr == 0) > > return false; > > > > /* > > @@ -534,7 +515,7 @@ static bool swap_read_folio_zeromap(struct folio *folio) > > * currently handled. Return true without marking the folio uptodate so > > * that an IO error is emitted (e.g. do_swap_page() will sigbus). 
> > */ > > - if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) > > + if (WARN_ON_ONCE(nr < nr_pages)) > > return true; > > > > folio_zero_range(folio, 0, folio_size(folio)); > > diff --git a/mm/swap.h b/mm/swap.h > > index f8711ff82f84..2d59e9d89e95 100644 > > --- a/mm/swap.h > > +++ b/mm/swap.h > > @@ -80,6 +80,32 @@ static inline unsigned int folio_swap_flags(struct folio *folio) > > { > > return swp_swap_info(folio->swap)->flags; > > } > > + > > +/* > > + * Return the number of entries which are zero-filled according to > > + * swap_info_struct->zeromap. It isn't precise if the return value > > + * is larger than 0 and smaller than nr to avoid extra iterations, > > + * In this case, it means entries haven't consistent zeromap. > > + */ > > +static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) > > +{ > > + struct swap_info_struct *sis = swp_swap_info(entry); > > + unsigned long offset = swp_offset(entry); > > + unsigned int type = swp_type(entry); > > + unsigned int n = 0; > > + > > + for (int i = 0; i < nr; i++) { > > + entry = swp_entry(type, offset + i); > > + if (test_bit(offset + i, sis->zeromap)) { > > + if (i != n) > > + return i; > > + n++; > > + } > > + } > > + > > + return n; > > +} > > + > > #else /* CONFIG_SWAP */ > > struct swap_iocb; > > static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) > > @@ -171,6 +197,11 @@ static inline unsigned int folio_swap_flags(struct folio *folio) > > { > > return 0; > > } > > + > > +static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) > > +{ > > + return 0; > > +} > > #endif /* CONFIG_SWAP */ > > > > #endif /* _MM_SWAP_H */ > > > > > + > > > + folio_zero_fill(folio); > > > + folio_mark_uptodate(folio); > > > + return true; > > > +} > > > + > > > static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) > > > { > > > struct swap_info_struct *sis = swp_swap_info(folio->swap); > > > @@ -515,8 +624,9 @@ void swap_read_folio(struct folio *folio, bool synchronous, > > > psi_memstall_enter(&pflags); > > > } > > > delayacct_swapin_start(); > > > - > > > - if (zswap_load(folio)) { > > > + if (swap_read_folio_zeromap(folio)) { > > > + folio_unlock(folio); > > > + } else if (zswap_load(folio)) { > > > folio_mark_uptodate(folio); > > > folio_unlock(folio); > > > } else if (data_race(sis->flags & SWP_FS_OPS)) { > > > diff --git a/mm/swapfile.c b/mm/swapfile.c > > > index f1e559e216bd..48d8dca0b94b 100644 > > > --- a/mm/swapfile.c > > > +++ b/mm/swapfile.c > > > @@ -453,6 +453,8 @@ static unsigned int cluster_list_del_first(struct swap_cluster_list *list, > > > static void swap_cluster_schedule_discard(struct swap_info_struct *si, > > > unsigned int idx) > > > { > > > + unsigned int i; > > > + > > > /* > > > * If scan_swap_map_slots() can't find a free cluster, it will check > > > * si->swap_map directly. To make sure the discarding cluster isn't > > > @@ -461,6 +463,13 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, > > > */ > > > memset(si->swap_map + idx * SWAPFILE_CLUSTER, > > > SWAP_MAP_BAD, SWAPFILE_CLUSTER); > > > + /* > > > + * zeromap can see updates from concurrent swap_writepage() and swap_read_folio() > > > + * call on other slots, hence use atomic clear_bit for zeromap instead of the > > > + * non-atomic bitmap_clear. 
> > > + */ > > > + for (i = 0; i < SWAPFILE_CLUSTER; i++) > > > + clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); > > > > > > cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); > > > > > > @@ -482,7 +491,7 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) > > > static void swap_do_scheduled_discard(struct swap_info_struct *si) > > > { > > > struct swap_cluster_info *info, *ci; > > > - unsigned int idx; > > > + unsigned int idx, i; > > > > > > info = si->cluster_info; > > > > > > @@ -498,6 +507,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) > > > __free_cluster(si, idx); > > > memset(si->swap_map + idx * SWAPFILE_CLUSTER, > > > 0, SWAPFILE_CLUSTER); > > > + for (i = 0; i < SWAPFILE_CLUSTER; i++) > > > + clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap); > > > unlock_cluster(ci); > > > } > > > } > > > @@ -1059,9 +1070,12 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) > > > { > > > unsigned long offset = idx * SWAPFILE_CLUSTER; > > > struct swap_cluster_info *ci; > > > + unsigned int i; > > > > > > ci = lock_cluster(si, offset); > > > memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); > > > + for (i = 0; i < SWAPFILE_CLUSTER; i++) > > > + clear_bit(offset + i, si->zeromap); > > > cluster_set_count_flag(ci, 0, 0); > > > free_cluster(si, idx); > > > unlock_cluster(ci); > > > @@ -1336,6 +1350,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) > > > count = p->swap_map[offset]; > > > VM_BUG_ON(count != SWAP_HAS_CACHE); > > > p->swap_map[offset] = 0; > > > + clear_bit(offset, p->zeromap); > > > dec_cluster_info_page(p, p->cluster_info, offset); > > > unlock_cluster(ci); > > > > > > @@ -2597,6 +2612,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) > > > free_percpu(p->cluster_next_cpu); > > > p->cluster_next_cpu = NULL; > > > vfree(swap_map); > > > + bitmap_free(p->zeromap); > > > kvfree(cluster_info); > > > /* Destroy swap account information */ > > > swap_cgroup_swapoff(p->type); > > > @@ -3123,6 +3139,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) > > > goto bad_swap_unlock_inode; > > > } > > > > > > + p->zeromap = bitmap_zalloc(maxpages, GFP_KERNEL); > > > + if (!p->zeromap) { > > > + error = -ENOMEM; > > > + goto bad_swap_unlock_inode; > > > + } > > > + > > > if (p->bdev && bdev_stable_writes(p->bdev)) > > > p->flags |= SWP_STABLE_WRITES; > > > > > > -- > > > 2.43.0 > > > > > > Thanks barry
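A hedged sketch of the swap-in policy Barry describes above, i.e. fall back to order-0 when the range of swap entries has a mixed zeromap. This is userspace pseudocode of the decision only; the enum, the function names and the exact-count loop (used here for clarity instead of the early-bailing helper proposed earlier) are assumptions for illustration, not the actual mm change.

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

enum swapin_action { SWAPIN_READ_DISK, SWAPIN_FILL_ZEROS, SWAPIN_FALLBACK_ORDER0 };

static int test_bit(unsigned long nr, const unsigned long *map)
{
	return (map[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

static enum swapin_action large_folio_swapin_action(const unsigned long *zeromap,
						    unsigned long offset,
						    unsigned int nr)
{
	unsigned int i, set = 0;

	for (i = 0; i < nr; i++)
		set += test_bit(offset + i, zeromap);

	if (set == 0)
		return SWAPIN_READ_DISK;	/* no slot is zero-filled     */
	if (set == nr)
		return SWAPIN_FILL_ZEROS;	/* whole folio is zero-filled */
	return SWAPIN_FALLBACK_ORDER0;		/* mixed: punt to small folios */
}

int main(void)
{
	unsigned long zeromap[1] = { 0x6 };	/* bits 1 and 2 set: mixed */

	printf("action = %d\n", large_folio_swapin_action(zeromap, 0, 4));
	return 0;
}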
On Wed, Sep 4, 2024 at 12:17 AM Barry Song <21cnbao@gmail.com> wrote: > > On Wed, Sep 4, 2024 at 7:12 PM Yosry Ahmed <yosryahmed@google.com> wrote: > > > > [..] > > > > @@ -426,6 +515,26 @@ static void sio_read_complete(struct kiocb *iocb, long ret) > > > > mempool_free(sio, sio_pool); > > > > } > > > > > > > > +static bool swap_read_folio_zeromap(struct folio *folio) > > > > +{ > > > > + unsigned int idx = swap_zeromap_folio_test(folio); > > > > + > > > > + if (idx == 0) > > > > + return false; > > > > + > > > > + /* > > > > + * Swapping in a large folio that is partially in the zeromap is not > > > > + * currently handled. Return true without marking the folio uptodate so > > > > + * that an IO error is emitted (e.g. do_swap_page() will sigbus). > > > > + */ > > > > + if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) > > > > + return true; > > > > > > Hi Usama, Yosry, > > > > > > I feel the warning is wrong as we could have the case where idx==0 > > > is not zeromap but idx=1 is zeromap. idx == 0 doesn't necessarily > > > mean we should return false. > > > > Good catch. Yeah if idx == 0 is not in the zeromap but other indices > > are we will mistakenly read the entire folio from swap. > > > > > > > > What about the below change which both fixes the warning and unblocks > > > large folios swap-in? > > > > But I don't see how that unblocks the large folios swap-in work? We > > still need to actually handle the case where a large folio being > > swapped in is partially in the zeromap. Right now we warn and unlock > > the folio without calling folio_mark_uptodate(), which emits an IO > > error. > > I placed this in mm/swap.h so that during swap-in, it can filter out any large > folios where swap_zeromap_entries_count() is greater than 0 and less than > nr. > > I believe this case would be quite rare, as it can only occur when some small > folios that are swapped out happen to have contiguous and aligned swap > slots. I am assuming this would be near where the zswap_never_enabled() check is today, right? I understand the point of doing this to unblock the synchronous large folio swapin support work, but at some point we're gonna have to actually handle the cases where a large folio being swapped in is partially in the swap cache, zswap, the zeromap, etc. All these cases will need similar-ish handling, and I suspect we won't just skip swapping in large folios in all these cases.
On Wed, Sep 4, 2024 at 7:22 PM Yosry Ahmed <yosryahmed@google.com> wrote: > > On Wed, Sep 4, 2024 at 12:17 AM Barry Song <21cnbao@gmail.com> wrote: > > > > On Wed, Sep 4, 2024 at 7:12 PM Yosry Ahmed <yosryahmed@google.com> wrote: > > > > > > [..] > > > > > @@ -426,6 +515,26 @@ static void sio_read_complete(struct kiocb *iocb, long ret) > > > > > mempool_free(sio, sio_pool); > > > > > } > > > > > > > > > > +static bool swap_read_folio_zeromap(struct folio *folio) > > > > > +{ > > > > > + unsigned int idx = swap_zeromap_folio_test(folio); > > > > > + > > > > > + if (idx == 0) > > > > > + return false; > > > > > + > > > > > + /* > > > > > + * Swapping in a large folio that is partially in the zeromap is not > > > > > + * currently handled. Return true without marking the folio uptodate so > > > > > + * that an IO error is emitted (e.g. do_swap_page() will sigbus). > > > > > + */ > > > > > + if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) > > > > > + return true; > > > > > > > > Hi Usama, Yosry, > > > > > > > > I feel the warning is wrong as we could have the case where idx==0 > > > > is not zeromap but idx=1 is zeromap. idx == 0 doesn't necessarily > > > > mean we should return false. > > > > > > Good catch. Yeah if idx == 0 is not in the zeromap but other indices > > > are we will mistakenly read the entire folio from swap. > > > > > > > > > > > What about the below change which both fixes the warning and unblocks > > > > large folios swap-in? > > > > > > But I don't see how that unblocks the large folios swap-in work? We > > > still need to actually handle the case where a large folio being > > > swapped in is partially in the zeromap. Right now we warn and unlock > > > the folio without calling folio_mark_uptodate(), which emits an IO > > > error. > > > > I placed this in mm/swap.h so that during swap-in, it can filter out any large > > folios where swap_zeromap_entries_count() is greater than 0 and less than > > nr. > > > > I believe this case would be quite rare, as it can only occur when some small > > folios that are swapped out happen to have contiguous and aligned swap > > slots. > > I am assuming this would be near where the zswap_never_enabled() check > is today, right? The code is close to the area, but it doesn't rely on zeromap being disabled. > > I understand the point of doing this to unblock the synchronous large > folio swapin support work, but at some point we're gonna have to > actually handle the cases where a large folio being swapped in is > partially in the swap cache, zswap, the zeromap, etc. > > All these cases will need similar-ish handling, and I suspect we won't > just skip swapping in large folios in all these cases. I agree that this is definitely the goal. `swap_read_folio()` should be a dependable API that always returns reliable data, regardless of whether `zeromap` or `zswap` is involved. Despite these issues, mTHP swap-in shouldn't be held back. Significant efforts are underway to support large folios in `zswap`, and progress is being made. Not to mention we've already allowed `zeromap` to proceed, even though it doesn't support large folios. It's genuinely unfair to let the lack of mTHP support in `zeromap` and `zswap` hold swap-in hostage. Nonetheless, `zeromap` and `zswap` are distinct cases. With `zeromap`, we permit almost all mTHP swap-ins, except for those rare situations where small folios that were swapped out happen to have contiguous and aligned swap slots. 
swapcache is another quite different story; since our user scenarios begin from the simplest sync IO on mobile phones, we don't particularly care about swapcache. Thanks Barry
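A minimal sketch of the filter Barry describes above, assuming the swap_zeromap_entries_count() helper proposed later in this thread (the wrapper name below is hypothetical):

static inline bool swap_zeromap_entries_partial(swp_entry_t entry, int nr)
{
	/*
	 * "Greater than 0 and less than nr": only some of the nr entries
	 * starting at 'entry' are marked zero-filled, so the folio cannot
	 * be swapped in as a whole from either the zeromap or the backing
	 * device alone, and the swap-in path would fall back.
	 */
	unsigned int count = swap_zeromap_entries_count(entry, nr);

	return count > 0 && count < nr;
}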
On 04/09/2024 06:55, Barry Song wrote: > On Thu, Jun 13, 2024 at 12:48 AM Usama Arif <usamaarif642@gmail.com> wrote: >> >> Approximately 10-20% of pages to be swapped out are zero pages [1]. >> Rather than reading/writing these pages to flash resulting >> in increased I/O and flash wear, a bitmap can be used to mark these >> pages as zero at write time, and the pages can be filled at >> read time if the bit corresponding to the page is set. >> With this patch, NVMe writes in Meta server fleet decreased >> by almost 10% with conventional swap setup (zswap disabled). >> >> [1] https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/ >> >> Signed-off-by: Usama Arif <usamaarif642@gmail.com> >> --- >> include/linux/swap.h | 1 + >> mm/page_io.c | 114 ++++++++++++++++++++++++++++++++++++++++++- >> mm/swapfile.c | 24 ++++++++- >> 3 files changed, 136 insertions(+), 3 deletions(-) >> >> diff --git a/include/linux/swap.h b/include/linux/swap.h >> index a11c75e897ec..e88563978441 100644 >> --- a/include/linux/swap.h >> +++ b/include/linux/swap.h >> @@ -299,6 +299,7 @@ struct swap_info_struct { >> signed char type; /* strange name for an index */ >> unsigned int max; /* extent of the swap_map */ >> unsigned char *swap_map; /* vmalloc'ed array of usage counts */ >> + unsigned long *zeromap; /* vmalloc'ed bitmap to track zero pages */ >> struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ >> struct swap_cluster_list free_clusters; /* free clusters list */ >> unsigned int lowest_bit; /* index of first free in swap_map */ >> diff --git a/mm/page_io.c b/mm/page_io.c >> index a360857cf75d..39fc3919ce15 100644 >> --- a/mm/page_io.c >> +++ b/mm/page_io.c >> @@ -172,6 +172,88 @@ int generic_swapfile_activate(struct swap_info_struct *sis, >> goto out; >> } >> >> +static bool is_folio_page_zero_filled(struct folio *folio, int i) >> +{ >> + unsigned long *data; >> + unsigned int pos, last_pos = PAGE_SIZE / sizeof(*data) - 1; >> + bool ret = false; >> + >> + data = kmap_local_folio(folio, i * PAGE_SIZE); >> + if (data[last_pos]) >> + goto out; >> + for (pos = 0; pos < PAGE_SIZE / sizeof(*data); pos++) { >> + if (data[pos]) >> + goto out; >> + } >> + ret = true; >> +out: >> + kunmap_local(data); >> + return ret; >> +} >> + >> +static bool is_folio_zero_filled(struct folio *folio) >> +{ >> + unsigned int i; >> + >> + for (i = 0; i < folio_nr_pages(folio); i++) { >> + if (!is_folio_page_zero_filled(folio, i)) >> + return false; >> + } >> + return true; >> +} >> + >> +static void folio_zero_fill(struct folio *folio) >> +{ >> + unsigned int i; >> + >> + for (i = 0; i < folio_nr_pages(folio); i++) >> + clear_highpage(folio_page(folio, i)); >> +} >> + >> +static void swap_zeromap_folio_set(struct folio *folio) >> +{ >> + struct swap_info_struct *sis = swp_swap_info(folio->swap); >> + swp_entry_t entry; >> + unsigned int i; >> + >> + for (i = 0; i < folio_nr_pages(folio); i++) { >> + entry = page_swap_entry(folio_page(folio, i)); >> + set_bit(swp_offset(entry), sis->zeromap); >> + } >> +} >> + >> +static void swap_zeromap_folio_clear(struct folio *folio) >> +{ >> + struct swap_info_struct *sis = swp_swap_info(folio->swap); >> + swp_entry_t entry; >> + unsigned int i; >> + >> + for (i = 0; i < folio_nr_pages(folio); i++) { >> + entry = page_swap_entry(folio_page(folio, i)); >> + clear_bit(swp_offset(entry), sis->zeromap); >> + } >> +} >> + >> +/* >> + * Return the index of the first subpage which is not zero-filled >> + * according to swap_info_struct->zeromap. 
>> + * If all pages are zero-filled according to zeromap, it will return >> + * folio_nr_pages(folio). >> + */ >> +static unsigned int swap_zeromap_folio_test(struct folio *folio) >> +{ >> + struct swap_info_struct *sis = swp_swap_info(folio->swap); >> + swp_entry_t entry; >> + unsigned int i; >> + >> + for (i = 0; i < folio_nr_pages(folio); i++) { >> + entry = page_swap_entry(folio_page(folio, i)); >> + if (!test_bit(swp_offset(entry), sis->zeromap)) >> + return i; >> + } >> + return i; >> +} >> + >> /* >> * We may have stale swap cache pages in memory: notice >> * them here and get rid of the unnecessary final write. >> @@ -195,6 +277,13 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) >> folio_unlock(folio); >> return ret; >> } >> + >> + if (is_folio_zero_filled(folio)) { >> + swap_zeromap_folio_set(folio); >> + folio_unlock(folio); >> + return 0; >> + } >> + swap_zeromap_folio_clear(folio); >> if (zswap_store(folio)) { >> folio_start_writeback(folio); >> folio_unlock(folio); >> @@ -426,6 +515,26 @@ static void sio_read_complete(struct kiocb *iocb, long ret) >> mempool_free(sio, sio_pool); >> } >> >> +static bool swap_read_folio_zeromap(struct folio *folio) >> +{ >> + unsigned int idx = swap_zeromap_folio_test(folio); >> + >> + if (idx == 0) >> + return false; >> + >> + /* >> + * Swapping in a large folio that is partially in the zeromap is not >> + * currently handled. Return true without marking the folio uptodate so >> + * that an IO error is emitted (e.g. do_swap_page() will sigbus). >> + */ >> + if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) >> + return true; > > Hi Usama, Yosry, > > I feel the warning is wrong as we could have the case where idx==0 > is not zeromap but idx=1 is zeromap. idx == 0 doesn't necessarily > mean we should return false. > > What about the below change which both fixes the warning and unblocks > large folios swap-in? > Hi Barry, I remembered when resending the zeromap series about the comment Yosry had made earlier, but checked that the mTHP swap-in was not in mm-unstable. I should have checked the mailing list and commented! I have not tested the below diff yet (will do in a few hours). But there might be a small issue with it. Have commented inline. > diff --git a/mm/page_io.c b/mm/page_io.c > index 4bc77d1c6bfa..7d7ff7064e2b 100644 > --- a/mm/page_io.c > +++ b/mm/page_io.c > @@ -226,26 +226,6 @@ static void swap_zeromap_folio_clear(struct folio *folio) > } > } > > -/* > - * Return the index of the first subpage which is not zero-filled > - * according to swap_info_struct->zeromap. > - * If all pages are zero-filled according to zeromap, it will return > - * folio_nr_pages(folio). > - */ > -static unsigned int swap_zeromap_folio_test(struct folio *folio) > -{ > - struct swap_info_struct *sis = swp_swap_info(folio->swap); > - swp_entry_t entry; > - unsigned int i; > - > - for (i = 0; i < folio_nr_pages(folio); i++) { > - entry = page_swap_entry(folio_page(folio, i)); > - if (!test_bit(swp_offset(entry), sis->zeromap)) > - return i; > - } > - return i; > -} > - > /* > * We may have stale swap cache pages in memory: notice > * them here and get rid of the unnecessary final write. 
> @@ -524,9 +504,10 @@ static void sio_read_complete(struct kiocb *iocb, long ret) > > static bool swap_read_folio_zeromap(struct folio *folio) > { > - unsigned int idx = swap_zeromap_folio_test(folio); > + unsigned int nr_pages = folio_nr_pages(folio); > + unsigned int nr = swap_zeromap_entries_count(folio->swap, nr_pages); > > - if (idx == 0) > + if (nr == 0) > return false; > > /* > @@ -534,7 +515,7 @@ static bool swap_read_folio_zeromap(struct folio *folio) > * currently handled. Return true without marking the folio uptodate so > * that an IO error is emitted (e.g. do_swap_page() will sigbus). > */ > - if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) > + if (WARN_ON_ONCE(nr < nr_pages)) > return true; > > folio_zero_range(folio, 0, folio_size(folio)); > diff --git a/mm/swap.h b/mm/swap.h > index f8711ff82f84..2d59e9d89e95 100644 > --- a/mm/swap.h > +++ b/mm/swap.h > @@ -80,6 +80,32 @@ static inline unsigned int folio_swap_flags(struct folio *folio) > { > return swp_swap_info(folio->swap)->flags; > } > + > +/* > + * Return the number of entries which are zero-filled according to > + * swap_info_struct->zeromap. It isn't precise if the return value > + * is larger than 0 and smaller than nr to avoid extra iterations, > + * In this case, it means entries haven't consistent zeromap. > + */ > +static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) > +{ > + struct swap_info_struct *sis = swp_swap_info(entry); > + unsigned long offset = swp_offset(entry); > + unsigned int type = swp_type(entry); > + unsigned int n = 0; > + > + for (int i = 0; i < nr; i++) { > + entry = swp_entry(type, offset + i); > + if (test_bit(offset + i, sis->zeromap)) { Should this be if (test_bit(swp_offset(entry), sis->zeromap)) Also, are you going to use this in alloc_swap_folio? You mentioned above that this unblocks large folios swap-in, but I don't see it in the diff here. I am guessing there is some change in alloc_swap_info that uses swap_zeromap_entries_count? Thanks Usama > + if (i != n) > + return i; > + n++; > + } > + } > + > + return n; > +} > + > #else /* CONFIG_SWAP */ > struct swap_iocb; > static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) > @@ -171,6 +197,11 @@ static inline unsigned int folio_swap_flags(struct folio *folio) > { > return 0; > } > + > +static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) > +{ > + return 0; > +} > #endif /* CONFIG_SWAP */ > > #endif /* _MM_SWAP_H */ >
[..] > > I understand the point of doing this to unblock the synchronous large > > folio swapin support work, but at some point we're gonna have to > > actually handle the cases where a large folio being swapped in is > > partially in the swap cache, zswap, the zeromap, etc. > > > > All these cases will need similar-ish handling, and I suspect we won't > > just skip swapping in large folios in all these cases. > > I agree that this is definitely the goal. `swap_read_folio()` should be a > dependable API that always returns reliable data, regardless of whether > `zeromap` or `zswap` is involved. Despite these issues, mTHP swap-in shouldn't > be held back. Significant efforts are underway to support large folios in > `zswap`, and progress is being made. Not to mention we've already allowed > `zeromap` to proceed, even though it doesn't support large folios. > > It's genuinely unfair to let the lack of mTHP support in `zeromap` and > `zswap` hold swap-in hostage. Well, two points here: 1. I did not say that we should block the synchronous mTHP swapin work for this :) I said the next item on the TODO list for mTHP swapin support should be handling these cases. 2. I think two things are getting conflated here. Zswap needs to support mTHP swapin. Zeromap already supports mTHPs AFAICT. What is truly missing, and is outside the scope of zswap/zeromap, is being able to support hybrid mTHP swapin. When swapping in an mTHP, the swapped entries can be on disk, in the swapcache, in zswap, or in the zeromap. Even if all these things support mTHPs individually, we essentially need support to form an mTHP from swap entries in different backends. That's what I meant. Actually if we have that, we may not really need mTHP swapin support in zswap, because we can just form the large folio in the swap layer from multiple zswap entries. > > Nonetheless, `zeromap` and `zswap` are distinct cases. With `zeromap`, we > permit almost all mTHP swap-ins, except for those rare situations where > small folios that were swapped out happen to have contiguous and aligned > swap slots. > > swapcache is another quite different story, since our user scenarios begin from > the simplest sync io on mobile phones, we don't quite care about swapcache. Right. The reason I bring this up is, as I mentioned above, that there is a common problem of forming large folios from different sources, which includes the swap cache. The fact that synchronous swapin does not use the swapcache was a happy coincidence for you, as you can add support for mTHP swapins without handling this case yet ;)
On Wed, Sep 4, 2024 at 11:14 PM Usama Arif <usamaarif642@gmail.com> wrote: > > > > On 04/09/2024 06:55, Barry Song wrote: > > On Thu, Jun 13, 2024 at 12:48 AM Usama Arif <usamaarif642@gmail.com> wrote: > >> > >> Approximately 10-20% of pages to be swapped out are zero pages [1]. > >> Rather than reading/writing these pages to flash resulting > >> in increased I/O and flash wear, a bitmap can be used to mark these > >> pages as zero at write time, and the pages can be filled at > >> read time if the bit corresponding to the page is set. > >> With this patch, NVMe writes in Meta server fleet decreased > >> by almost 10% with conventional swap setup (zswap disabled). > >> > >> [1] https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/ > >> > >> Signed-off-by: Usama Arif <usamaarif642@gmail.com> > >> --- > >> include/linux/swap.h | 1 + > >> mm/page_io.c | 114 ++++++++++++++++++++++++++++++++++++++++++- > >> mm/swapfile.c | 24 ++++++++- > >> 3 files changed, 136 insertions(+), 3 deletions(-) > >> > >> diff --git a/include/linux/swap.h b/include/linux/swap.h > >> index a11c75e897ec..e88563978441 100644 > >> --- a/include/linux/swap.h > >> +++ b/include/linux/swap.h > >> @@ -299,6 +299,7 @@ struct swap_info_struct { > >> signed char type; /* strange name for an index */ > >> unsigned int max; /* extent of the swap_map */ > >> unsigned char *swap_map; /* vmalloc'ed array of usage counts */ > >> + unsigned long *zeromap; /* vmalloc'ed bitmap to track zero pages */ > >> struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ > >> struct swap_cluster_list free_clusters; /* free clusters list */ > >> unsigned int lowest_bit; /* index of first free in swap_map */ > >> diff --git a/mm/page_io.c b/mm/page_io.c > >> index a360857cf75d..39fc3919ce15 100644 > >> --- a/mm/page_io.c > >> +++ b/mm/page_io.c > >> @@ -172,6 +172,88 @@ int generic_swapfile_activate(struct swap_info_struct *sis, > >> goto out; > >> } > >> > >> +static bool is_folio_page_zero_filled(struct folio *folio, int i) > >> +{ > >> + unsigned long *data; > >> + unsigned int pos, last_pos = PAGE_SIZE / sizeof(*data) - 1; > >> + bool ret = false; > >> + > >> + data = kmap_local_folio(folio, i * PAGE_SIZE); > >> + if (data[last_pos]) > >> + goto out; > >> + for (pos = 0; pos < PAGE_SIZE / sizeof(*data); pos++) { > >> + if (data[pos]) > >> + goto out; > >> + } > >> + ret = true; > >> +out: > >> + kunmap_local(data); > >> + return ret; > >> +} > >> + > >> +static bool is_folio_zero_filled(struct folio *folio) > >> +{ > >> + unsigned int i; > >> + > >> + for (i = 0; i < folio_nr_pages(folio); i++) { > >> + if (!is_folio_page_zero_filled(folio, i)) > >> + return false; > >> + } > >> + return true; > >> +} > >> + > >> +static void folio_zero_fill(struct folio *folio) > >> +{ > >> + unsigned int i; > >> + > >> + for (i = 0; i < folio_nr_pages(folio); i++) > >> + clear_highpage(folio_page(folio, i)); > >> +} > >> + > >> +static void swap_zeromap_folio_set(struct folio *folio) > >> +{ > >> + struct swap_info_struct *sis = swp_swap_info(folio->swap); > >> + swp_entry_t entry; > >> + unsigned int i; > >> + > >> + for (i = 0; i < folio_nr_pages(folio); i++) { > >> + entry = page_swap_entry(folio_page(folio, i)); > >> + set_bit(swp_offset(entry), sis->zeromap); > >> + } > >> +} > >> + > >> +static void swap_zeromap_folio_clear(struct folio *folio) > >> +{ > >> + struct swap_info_struct *sis = swp_swap_info(folio->swap); > >> + swp_entry_t entry; > >> + unsigned int i; > >> + > >> + for (i = 0; i < 
folio_nr_pages(folio); i++) { > >> + entry = page_swap_entry(folio_page(folio, i)); > >> + clear_bit(swp_offset(entry), sis->zeromap); > >> + } > >> +} > >> + > >> +/* > >> + * Return the index of the first subpage which is not zero-filled > >> + * according to swap_info_struct->zeromap. > >> + * If all pages are zero-filled according to zeromap, it will return > >> + * folio_nr_pages(folio). > >> + */ > >> +static unsigned int swap_zeromap_folio_test(struct folio *folio) > >> +{ > >> + struct swap_info_struct *sis = swp_swap_info(folio->swap); > >> + swp_entry_t entry; > >> + unsigned int i; > >> + > >> + for (i = 0; i < folio_nr_pages(folio); i++) { > >> + entry = page_swap_entry(folio_page(folio, i)); > >> + if (!test_bit(swp_offset(entry), sis->zeromap)) > >> + return i; > >> + } > >> + return i; > >> +} > >> + > >> /* > >> * We may have stale swap cache pages in memory: notice > >> * them here and get rid of the unnecessary final write. > >> @@ -195,6 +277,13 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) > >> folio_unlock(folio); > >> return ret; > >> } > >> + > >> + if (is_folio_zero_filled(folio)) { > >> + swap_zeromap_folio_set(folio); > >> + folio_unlock(folio); > >> + return 0; > >> + } > >> + swap_zeromap_folio_clear(folio); > >> if (zswap_store(folio)) { > >> folio_start_writeback(folio); > >> folio_unlock(folio); > >> @@ -426,6 +515,26 @@ static void sio_read_complete(struct kiocb *iocb, long ret) > >> mempool_free(sio, sio_pool); > >> } > >> > >> +static bool swap_read_folio_zeromap(struct folio *folio) > >> +{ > >> + unsigned int idx = swap_zeromap_folio_test(folio); > >> + > >> + if (idx == 0) > >> + return false; > >> + > >> + /* > >> + * Swapping in a large folio that is partially in the zeromap is not > >> + * currently handled. Return true without marking the folio uptodate so > >> + * that an IO error is emitted (e.g. do_swap_page() will sigbus). > >> + */ > >> + if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) > >> + return true; > > > > Hi Usama, Yosry, > > > > I feel the warning is wrong as we could have the case where idx==0 > > is not zeromap but idx=1 is zeromap. idx == 0 doesn't necessarily > > mean we should return false. > > > > What about the below change which both fixes the warning and unblocks > > large folios swap-in? > > > Hi Barry, > > I remembered when resending the zeromap series about the comment Yosry had made earlier, but checked that the mTHP swap-in was not in mm-unstable. > I should have checked the mailing list and commented! > > I have not tested the below diff yet (will do in a few hours). But there might be a small issue with it. Have commented inline. > > > diff --git a/mm/page_io.c b/mm/page_io.c > > index 4bc77d1c6bfa..7d7ff7064e2b 100644 > > --- a/mm/page_io.c > > +++ b/mm/page_io.c > > @@ -226,26 +226,6 @@ static void swap_zeromap_folio_clear(struct folio *folio) > > } > > } > > > > -/* > > - * Return the index of the first subpage which is not zero-filled > > - * according to swap_info_struct->zeromap. > > - * If all pages are zero-filled according to zeromap, it will return > > - * folio_nr_pages(folio). 
> > - */ > > -static unsigned int swap_zeromap_folio_test(struct folio *folio) > > -{ > > - struct swap_info_struct *sis = swp_swap_info(folio->swap); > > - swp_entry_t entry; > > - unsigned int i; > > - > > - for (i = 0; i < folio_nr_pages(folio); i++) { > > - entry = page_swap_entry(folio_page(folio, i)); > > - if (!test_bit(swp_offset(entry), sis->zeromap)) > > - return i; > > - } > > - return i; > > -} > > - > > /* > > * We may have stale swap cache pages in memory: notice > > * them here and get rid of the unnecessary final write. > > @@ -524,9 +504,10 @@ static void sio_read_complete(struct kiocb *iocb, long ret) > > > > static bool swap_read_folio_zeromap(struct folio *folio) > > { > > - unsigned int idx = swap_zeromap_folio_test(folio); > > + unsigned int nr_pages = folio_nr_pages(folio); > > + unsigned int nr = swap_zeromap_entries_count(folio->swap, nr_pages); > > > > - if (idx == 0) > > + if (nr == 0) > > return false; > > > > /* > > @@ -534,7 +515,7 @@ static bool swap_read_folio_zeromap(struct folio *folio) > > * currently handled. Return true without marking the folio uptodate so > > * that an IO error is emitted (e.g. do_swap_page() will sigbus). > > */ > > - if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) > > + if (WARN_ON_ONCE(nr < nr_pages)) > > return true; > > > > folio_zero_range(folio, 0, folio_size(folio)); > > diff --git a/mm/swap.h b/mm/swap.h > > index f8711ff82f84..2d59e9d89e95 100644 > > --- a/mm/swap.h > > +++ b/mm/swap.h > > @@ -80,6 +80,32 @@ static inline unsigned int folio_swap_flags(struct folio *folio) > > { > > return swp_swap_info(folio->swap)->flags; > > } > > + > > +/* > > + * Return the number of entries which are zero-filled according to > > + * swap_info_struct->zeromap. It isn't precise if the return value > > + * is larger than 0 and smaller than nr to avoid extra iterations, > > + * In this case, it means entries haven't consistent zeromap. > > + */ > > +static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) > > +{ > > + struct swap_info_struct *sis = swp_swap_info(entry); > > + unsigned long offset = swp_offset(entry); > > + unsigned int type = swp_type(entry); > > + unsigned int n = 0; > > + > > + for (int i = 0; i < nr; i++) { > > + entry = swp_entry(type, offset + i); > > + if (test_bit(offset + i, sis->zeromap)) { > > Should this be if (test_bit(swp_offset(entry), sis->zeromap)) > well. i feel i have a much cheaper way to implement this, which can entirely iteration even in your original code: +/* + * Return the number of entries which are zero-filled according to + * swap_info_struct->zeromap. It isn't precise if the return value + * is 1 for nr > 1. In this case, it means entries have inconsistent + * zeromap. + */ +static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) +{ + struct swap_info_struct *sis = swp_swap_info(entry); + unsigned long start = swp_offset(entry); + unsigned long end = start + nr; + unsigned long idx = 0; + + idx = find_next_bit(sis->zeromap, end, start); + if (idx == end) + return 0; + if (idx > start) + return 1; + return nr; +} + > > Also, are you going to use this in alloc_swap_folio? > You mentioned above that this unblocks large folios swap-in, but I don't see > it in the diff here. I am guessing there is some change in alloc_swap_info that > uses swap_zeromap_entries_count? 
> > Thanks > Usama > > > + if (i != n) > > + return i; > > + n++; > > + } > > + } > > + > > + return n; > > +} > > + > > #else /* CONFIG_SWAP */ > > struct swap_iocb; > > static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) > > @@ -171,6 +197,11 @@ static inline unsigned int folio_swap_flags(struct folio *folio) > > { > > return 0; > > } > > + > > +static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) > > +{ > > + return 0; > > +} > > #endif /* CONFIG_SWAP */ > > > > #endif /* _MM_SWAP_H */ > > Thanks Barry
On Thu, Sep 5, 2024 at 11:44 AM Barry Song <21cnbao@gmail.com> wrote: > > On Wed, Sep 4, 2024 at 11:14 PM Usama Arif <usamaarif642@gmail.com> wrote: > > > > > > > > On 04/09/2024 06:55, Barry Song wrote: > > > On Thu, Jun 13, 2024 at 12:48 AM Usama Arif <usamaarif642@gmail.com> wrote: > > >> > > >> Approximately 10-20% of pages to be swapped out are zero pages [1]. > > >> Rather than reading/writing these pages to flash resulting > > >> in increased I/O and flash wear, a bitmap can be used to mark these > > >> pages as zero at write time, and the pages can be filled at > > >> read time if the bit corresponding to the page is set. > > >> With this patch, NVMe writes in Meta server fleet decreased > > >> by almost 10% with conventional swap setup (zswap disabled). > > >> > > >> [1] https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/ > > >> > > >> Signed-off-by: Usama Arif <usamaarif642@gmail.com> > > >> --- > > >> include/linux/swap.h | 1 + > > >> mm/page_io.c | 114 ++++++++++++++++++++++++++++++++++++++++++- > > >> mm/swapfile.c | 24 ++++++++- > > >> 3 files changed, 136 insertions(+), 3 deletions(-) > > >> > > >> diff --git a/include/linux/swap.h b/include/linux/swap.h > > >> index a11c75e897ec..e88563978441 100644 > > >> --- a/include/linux/swap.h > > >> +++ b/include/linux/swap.h > > >> @@ -299,6 +299,7 @@ struct swap_info_struct { > > >> signed char type; /* strange name for an index */ > > >> unsigned int max; /* extent of the swap_map */ > > >> unsigned char *swap_map; /* vmalloc'ed array of usage counts */ > > >> + unsigned long *zeromap; /* vmalloc'ed bitmap to track zero pages */ > > >> struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ > > >> struct swap_cluster_list free_clusters; /* free clusters list */ > > >> unsigned int lowest_bit; /* index of first free in swap_map */ > > >> diff --git a/mm/page_io.c b/mm/page_io.c > > >> index a360857cf75d..39fc3919ce15 100644 > > >> --- a/mm/page_io.c > > >> +++ b/mm/page_io.c > > >> @@ -172,6 +172,88 @@ int generic_swapfile_activate(struct swap_info_struct *sis, > > >> goto out; > > >> } > > >> > > >> +static bool is_folio_page_zero_filled(struct folio *folio, int i) > > >> +{ > > >> + unsigned long *data; > > >> + unsigned int pos, last_pos = PAGE_SIZE / sizeof(*data) - 1; > > >> + bool ret = false; > > >> + > > >> + data = kmap_local_folio(folio, i * PAGE_SIZE); > > >> + if (data[last_pos]) > > >> + goto out; > > >> + for (pos = 0; pos < PAGE_SIZE / sizeof(*data); pos++) { > > >> + if (data[pos]) > > >> + goto out; > > >> + } > > >> + ret = true; > > >> +out: > > >> + kunmap_local(data); > > >> + return ret; > > >> +} > > >> + > > >> +static bool is_folio_zero_filled(struct folio *folio) > > >> +{ > > >> + unsigned int i; > > >> + > > >> + for (i = 0; i < folio_nr_pages(folio); i++) { > > >> + if (!is_folio_page_zero_filled(folio, i)) > > >> + return false; > > >> + } > > >> + return true; > > >> +} > > >> + > > >> +static void folio_zero_fill(struct folio *folio) > > >> +{ > > >> + unsigned int i; > > >> + > > >> + for (i = 0; i < folio_nr_pages(folio); i++) > > >> + clear_highpage(folio_page(folio, i)); > > >> +} > > >> + > > >> +static void swap_zeromap_folio_set(struct folio *folio) > > >> +{ > > >> + struct swap_info_struct *sis = swp_swap_info(folio->swap); > > >> + swp_entry_t entry; > > >> + unsigned int i; > > >> + > > >> + for (i = 0; i < folio_nr_pages(folio); i++) { > > >> + entry = page_swap_entry(folio_page(folio, i)); > > >> + set_bit(swp_offset(entry), 
sis->zeromap); > > >> + } > > >> +} > > >> + > > >> +static void swap_zeromap_folio_clear(struct folio *folio) > > >> +{ > > >> + struct swap_info_struct *sis = swp_swap_info(folio->swap); > > >> + swp_entry_t entry; > > >> + unsigned int i; > > >> + > > >> + for (i = 0; i < folio_nr_pages(folio); i++) { > > >> + entry = page_swap_entry(folio_page(folio, i)); > > >> + clear_bit(swp_offset(entry), sis->zeromap); > > >> + } > > >> +} > > >> + > > >> +/* > > >> + * Return the index of the first subpage which is not zero-filled > > >> + * according to swap_info_struct->zeromap. > > >> + * If all pages are zero-filled according to zeromap, it will return > > >> + * folio_nr_pages(folio). > > >> + */ > > >> +static unsigned int swap_zeromap_folio_test(struct folio *folio) > > >> +{ > > >> + struct swap_info_struct *sis = swp_swap_info(folio->swap); > > >> + swp_entry_t entry; > > >> + unsigned int i; > > >> + > > >> + for (i = 0; i < folio_nr_pages(folio); i++) { > > >> + entry = page_swap_entry(folio_page(folio, i)); > > >> + if (!test_bit(swp_offset(entry), sis->zeromap)) > > >> + return i; > > >> + } > > >> + return i; > > >> +} > > >> + > > >> /* > > >> * We may have stale swap cache pages in memory: notice > > >> * them here and get rid of the unnecessary final write. > > >> @@ -195,6 +277,13 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) > > >> folio_unlock(folio); > > >> return ret; > > >> } > > >> + > > >> + if (is_folio_zero_filled(folio)) { > > >> + swap_zeromap_folio_set(folio); > > >> + folio_unlock(folio); > > >> + return 0; > > >> + } > > >> + swap_zeromap_folio_clear(folio); > > >> if (zswap_store(folio)) { > > >> folio_start_writeback(folio); > > >> folio_unlock(folio); > > >> @@ -426,6 +515,26 @@ static void sio_read_complete(struct kiocb *iocb, long ret) > > >> mempool_free(sio, sio_pool); > > >> } > > >> > > >> +static bool swap_read_folio_zeromap(struct folio *folio) > > >> +{ > > >> + unsigned int idx = swap_zeromap_folio_test(folio); > > >> + > > >> + if (idx == 0) > > >> + return false; > > >> + > > >> + /* > > >> + * Swapping in a large folio that is partially in the zeromap is not > > >> + * currently handled. Return true without marking the folio uptodate so > > >> + * that an IO error is emitted (e.g. do_swap_page() will sigbus). > > >> + */ > > >> + if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) > > >> + return true; > > > > > > Hi Usama, Yosry, > > > > > > I feel the warning is wrong as we could have the case where idx==0 > > > is not zeromap but idx=1 is zeromap. idx == 0 doesn't necessarily > > > mean we should return false. > > > > > > What about the below change which both fixes the warning and unblocks > > > large folios swap-in? > > > > > Hi Barry, > > > > I remembered when resending the zeromap series about the comment Yosry had made earlier, but checked that the mTHP swap-in was not in mm-unstable. > > I should have checked the mailing list and commented! > > > > I have not tested the below diff yet (will do in a few hours). But there might be a small issue with it. Have commented inline. > > > > > diff --git a/mm/page_io.c b/mm/page_io.c > > > index 4bc77d1c6bfa..7d7ff7064e2b 100644 > > > --- a/mm/page_io.c > > > +++ b/mm/page_io.c > > > @@ -226,26 +226,6 @@ static void swap_zeromap_folio_clear(struct folio *folio) > > > } > > > } > > > > > > -/* > > > - * Return the index of the first subpage which is not zero-filled > > > - * according to swap_info_struct->zeromap. 
> > > - * If all pages are zero-filled according to zeromap, it will return > > > - * folio_nr_pages(folio). > > > - */ > > > -static unsigned int swap_zeromap_folio_test(struct folio *folio) > > > -{ > > > - struct swap_info_struct *sis = swp_swap_info(folio->swap); > > > - swp_entry_t entry; > > > - unsigned int i; > > > - > > > - for (i = 0; i < folio_nr_pages(folio); i++) { > > > - entry = page_swap_entry(folio_page(folio, i)); > > > - if (!test_bit(swp_offset(entry), sis->zeromap)) > > > - return i; > > > - } > > > - return i; > > > -} > > > - > > > /* > > > * We may have stale swap cache pages in memory: notice > > > * them here and get rid of the unnecessary final write. > > > @@ -524,9 +504,10 @@ static void sio_read_complete(struct kiocb *iocb, long ret) > > > > > > static bool swap_read_folio_zeromap(struct folio *folio) > > > { > > > - unsigned int idx = swap_zeromap_folio_test(folio); > > > + unsigned int nr_pages = folio_nr_pages(folio); > > > + unsigned int nr = swap_zeromap_entries_count(folio->swap, nr_pages); > > > > > > - if (idx == 0) > > > + if (nr == 0) > > > return false; > > > > > > /* > > > @@ -534,7 +515,7 @@ static bool swap_read_folio_zeromap(struct folio *folio) > > > * currently handled. Return true without marking the folio uptodate so > > > * that an IO error is emitted (e.g. do_swap_page() will sigbus). > > > */ > > > - if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) > > > + if (WARN_ON_ONCE(nr < nr_pages)) > > > return true; > > > > > > folio_zero_range(folio, 0, folio_size(folio)); > > > diff --git a/mm/swap.h b/mm/swap.h > > > index f8711ff82f84..2d59e9d89e95 100644 > > > --- a/mm/swap.h > > > +++ b/mm/swap.h > > > @@ -80,6 +80,32 @@ static inline unsigned int folio_swap_flags(struct folio *folio) > > > { > > > return swp_swap_info(folio->swap)->flags; > > > } > > > + > > > +/* > > > + * Return the number of entries which are zero-filled according to > > > + * swap_info_struct->zeromap. It isn't precise if the return value > > > + * is larger than 0 and smaller than nr to avoid extra iterations, > > > + * In this case, it means entries haven't consistent zeromap. > > > + */ > > > +static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) > > > +{ > > > + struct swap_info_struct *sis = swp_swap_info(entry); > > > + unsigned long offset = swp_offset(entry); > > > + unsigned int type = swp_type(entry); > > > + unsigned int n = 0; > > > + > > > + for (int i = 0; i < nr; i++) { > > > + entry = swp_entry(type, offset + i); > > > + if (test_bit(offset + i, sis->zeromap)) { > > > > Should this be if (test_bit(swp_offset(entry), sis->zeromap)) > > > > well. i feel i have a much cheaper way to implement this, which > can entirely iteration even in your original code: > > +/* > + * Return the number of entries which are zero-filled according to > + * swap_info_struct->zeromap. It isn't precise if the return value > + * is 1 for nr > 1. In this case, it means entries have inconsistent > + * zeromap. 
> + */ > +static inline unsigned int swap_zeromap_entries_count(swp_entry_t > entry, int nr) > +{ > + struct swap_info_struct *sis = swp_swap_info(entry); > + unsigned long start = swp_offset(entry); > + unsigned long end = start + nr; > + unsigned long idx = 0; > + > + idx = find_next_bit(sis->zeromap, end, start); > + if (idx == end) > + return 0; > + if (idx > start) > + return 1; missed a case here: if (nr > 1 && find_next_zero_bit(sis->zeromap, end, start) != end) return 1; > + return nr; > +} > + > > > > > > Also, are you going to use this in alloc_swap_folio? > > You mentioned above that this unblocks large folios swap-in, but I don't see > > it in the diff here. I am guessing there is some change in alloc_swap_info that > > uses swap_zeromap_entries_count? > > > > Thanks > > Usama > > > > > + if (i != n) > > > + return i; > > > + n++; > > > + } > > > + } > > > + > > > + return n; > > > +} > > > + > > > #else /* CONFIG_SWAP */ > > > struct swap_iocb; > > > static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) > > > @@ -171,6 +197,11 @@ static inline unsigned int folio_swap_flags(struct folio *folio) > > > { > > > return 0; > > > } > > > + > > > +static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) > > > +{ > > > + return 0; > > > +} > > > #endif /* CONFIG_SWAP */ > > > > > > #endif /* _MM_SWAP_H */ > > > > > Thanks > Barry
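Folding the missed case back into the earlier sketch, the corrected count-style helper would look roughly like this (semantics as described above: 0 when no entry is zero-filled, nr when all are, and an imprecise 1 for a mixed range):

static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr)
{
	struct swap_info_struct *sis = swp_swap_info(entry);
	unsigned long start = swp_offset(entry);
	unsigned long end = start + nr;
	unsigned long idx;

	idx = find_next_bit(sis->zeromap, end, start);
	if (idx == end)
		return 0;	/* no zeromap bit set in [start, end) */
	if (idx > start)
		return 1;	/* first entry clear, a later one set: mixed */
	if (nr > 1 && find_next_zero_bit(sis->zeromap, end, start) != end)
		return 1;	/* first entry set, a later one clear: mixed */
	return nr;		/* every entry in the range is zero-filled */
}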
[..] > well. i feel i have a much cheaper way to implement this, which > can entirely iteration even in your original code: > > +/* > + * Return the number of entries which are zero-filled according to > + * swap_info_struct->zeromap. It isn't precise if the return value > + * is 1 for nr > 1. In this case, it means entries have inconsistent > + * zeromap. > + */ > +static inline unsigned int swap_zeromap_entries_count(swp_entry_t > entry, int nr) FWIW I am not really a fan of the count() function not returning an actual count. I think an enum with three states is more appropriate here, and renaming the function to swap_zeromap_entries_check() or similar. > +{ > + struct swap_info_struct *sis = swp_swap_info(entry); > + unsigned long start = swp_offset(entry); > + unsigned long end = start + nr; > + unsigned long idx = 0; > + > + idx = find_next_bit(sis->zeromap, end, start); > + if (idx == end) > + return 0; > + if (idx > start) > + return 1; > + return nr; > +} > + > >
On Thu, Sep 5, 2024 at 11:57 AM Yosry Ahmed <yosryahmed@google.com> wrote: > > [..] > > well. i feel i have a much cheaper way to implement this, which > > can entirely iteration even in your original code: > > > > +/* > > + * Return the number of entries which are zero-filled according to > > + * swap_info_struct->zeromap. It isn't precise if the return value > > + * is 1 for nr > 1. In this case, it means entries have inconsistent > > + * zeromap. > > + */ > > +static inline unsigned int swap_zeromap_entries_count(swp_entry_t > > entry, int nr) > > FWIW I am not really a fan of the count() function not returning an > actual count. I think an enum with three states is more appropriate > here, and renaming the function to swap_zeromap_entries_check() or > similar. > Make sense to me, what about the below? From 24228a1e8426b8b05711a5efcfaae70abeb012c4 Mon Sep 17 00:00:00 2001 From: Barry Song <v-songbaohua@oppo.com> Date: Thu, 5 Sep 2024 11:56:03 +1200 Subject: [PATCH] mm: fix handling zero for large folios with partial zeromap There could be a corner case where the first entry is non-zeromap, but a subsequent entry is zeromap. In this case, we should not return false. Additionally, the iteration of test_bit() is unnecessary and can be replaced with bitmap operations, which are more efficient. Since swap_read_folio() can't handle reading a large folio that's partially zeromap and partially non-zeromap, we've moved the code to mm/swap.h so that others, like those working on swap-in, can access it. Fixes: 0ca0c24e3211 ("mm: store zero pages to be swapped out in a bitmap") Signed-off-by: Barry Song <v-songbaohua@oppo.com> --- mm/page_io.c | 27 ++++----------------------- mm/swap.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 4bc77d1c6bfa..46907c9dd20b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -226,26 +226,6 @@ static void swap_zeromap_folio_clear(struct folio *folio) } } -/* - * Return the index of the first subpage which is not zero-filled - * according to swap_info_struct->zeromap. - * If all pages are zero-filled according to zeromap, it will return - * folio_nr_pages(folio). - */ -static unsigned int swap_zeromap_folio_test(struct folio *folio) -{ - struct swap_info_struct *sis = swp_swap_info(folio->swap); - swp_entry_t entry; - unsigned int i; - - for (i = 0; i < folio_nr_pages(folio); i++) { - entry = page_swap_entry(folio_page(folio, i)); - if (!test_bit(swp_offset(entry), sis->zeromap)) - return i; - } - return i; -} - /* * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. @@ -524,9 +504,10 @@ static void sio_read_complete(struct kiocb *iocb, long ret) static bool swap_read_folio_zeromap(struct folio *folio) { - unsigned int idx = swap_zeromap_folio_test(folio); + unsigned int nr_pages = folio_nr_pages(folio); + zeromap_stat_t stat = swap_zeromap_entries_check(folio->swap, nr_pages); - if (idx == 0) + if (stat == SWAP_ZEROMAP_NON) return false; /* @@ -534,7 +515,7 @@ static bool swap_read_folio_zeromap(struct folio *folio) * currently handled. Return true without marking the folio uptodate so * that an IO error is emitted (e.g. do_swap_page() will sigbus). 
*/ - if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) + if (WARN_ON_ONCE(stat == SWAP_ZEROMAP_PARTIAL)) return true; folio_zero_range(folio, 0, folio_size(folio)); diff --git a/mm/swap.h b/mm/swap.h index f8711ff82f84..f8e3fa061c1d 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -4,6 +4,12 @@ struct mempolicy; +typedef enum { + SWAP_ZEROMAP_NON, + SWAP_ZEROMAP_FULL, + SWAP_ZEROMAP_PARTIAL +} zeromap_stat_t; + #ifdef CONFIG_SWAP #include <linux/swapops.h> /* for swp_offset */ #include <linux/blk_types.h> /* for bio_end_io_t */ @@ -80,6 +86,24 @@ static inline unsigned int folio_swap_flags(struct folio *folio) { return swp_swap_info(folio->swap)->flags; } + +/* + * Check if nr entries are all zeromap, non-zeromap or partially zeromap + */ +static inline zeromap_stat_t swap_zeromap_entries_check(swp_entry_t entry, int nr) +{ + struct swap_info_struct *sis = swp_swap_info(entry); + unsigned long start = swp_offset(entry); + unsigned long end = start + nr; + + if (find_next_bit(sis->zeromap, end, start) == end) + return SWAP_ZEROMAP_NON; + if (find_next_zero_bit(sis->zeromap, end, start) == end) + return SWAP_ZEROMAP_FULL; + + return SWAP_ZEROMAP_PARTIAL; +} + #else /* CONFIG_SWAP */ struct swap_iocb; static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) @@ -171,6 +195,11 @@ static inline unsigned int folio_swap_flags(struct folio *folio) { return 0; } + +static inline zeromap_stat_t swap_zeromap_entries_check(swp_entry_t entry, int nr) +{ + return SWAP_ZEROMAP_NON; +} #endif /* CONFIG_SWAP */ #endif /* _MM_SWAP_H */
On Thu, Sep 5, 2024 at 5:41 AM Yosry Ahmed <yosryahmed@google.com> wrote: > > [..] > > > I understand the point of doing this to unblock the synchronous large > > > folio swapin support work, but at some point we're gonna have to > > > actually handle the cases where a large folio being swapped in is > > > partially in the swap cache, zswap, the zeromap, etc. > > > > > > All these cases will need similar-ish handling, and I suspect we won't > > > just skip swapping in large folios in all these cases. > > > > I agree that this is definitely the goal. `swap_read_folio()` should be a > > dependable API that always returns reliable data, regardless of whether > > `zeromap` or `zswap` is involved. Despite these issues, mTHP swap-in shouldn't > > be held back. Significant efforts are underway to support large folios in > > `zswap`, and progress is being made. Not to mention we've already allowed > > `zeromap` to proceed, even though it doesn't support large folios. > > > > It's genuinely unfair to let the lack of mTHP support in `zeromap` and > > `zswap` hold swap-in hostage. > Hi Yosry, > Well, two points here: > > 1. I did not say that we should block the synchronous mTHP swapin work > for this :) I said the next item on the TODO list for mTHP swapin > support should be handling these cases. Thanks for your clarification! > > 2. I think two things are getting conflated here. Zswap needs to > support mTHP swapin*. Zeromap already supports mTHPs AFAICT. What is > truly, and is outside the scope of zswap/zeromap, is being able to > support hybrid mTHP swapin. > > When swapping in an mTHP, the swapped entries can be on disk, in the > swapcache, in zswap, or in the zeromap. Even if all these things > support mTHPs individually, we essentially need support to form an > mTHP from swap entries in different backends. That's what I meant. > Actually if we have that, we may not really need mTHP swapin support > in zswap, because we can just form the large folio in the swap layer > from multiple zswap entries. > After further consideration, I've actually started to disagree with the idea of supporting hybrid swapin (forming an mTHP from swap entries in different backends). My reasoning is as follows: 1. The scenario where an mTHP is partially zeromap, partially zswap, etc., would be an extremely rare case, as long as we're swapping out the mTHP as a whole and all the modules are handling it accordingly. It's highly unlikely to form this mix of zeromap, zswap, and swapcache unless the contiguous VMA virtual address happens to get some small folios with aligned and contiguous swap slots. Even then, they would need to be partially zeromap and partially non-zeromap, zswap, etc. As you mentioned, zeromap handles mTHP as a whole during swapping out, marking all subpages of the entire mTHP as zeromap rather than just a subset of them. And swap-in can also entirely map a swapcache which is a large folio based on our previous patchset which has been in mainline: "mm: swap: entirely map large folios found in swapcache" https://lore.kernel.org/all/20240529082824.150954-1-21cnbao@gmail.com/ It seems the only thing we're missing is zswap support for mTHP. 2. Implementing hybrid swap-in would be extremely tricky and could disrupt several software layers. 
I can share some pseudo code below: swap_read_folio() { if (zeromap_full) folio_read_from_zeromap() else if (zswap_map_full) folio_read_from_zswap() else { folio_read_from_swapfile() if (zeromap_partial) folio_read_from_zeromap_fixup() /* fill zero for partially zeromap subpages */ if (zswap_partial) folio_read_from_zswap_fixup() /* zswap_load for partially zswap-mapped subpages */ folio_mark_uptodate() folio_unlock() } We'd also need to modify folio_read_from_swapfile() to skip folio_mark_uptodate() and folio_unlock() after completing the BIO. This approach seems to entirely disrupt the software layers. This could also lead to unnecessary IO operations for subpages that require fixup. Since such cases are quite rare, I believe the added complexity isn't worth it. My point is that we should simply check that all PTEs have consistent zeromap, zswap, and swapcache statuses before proceeding, otherwise fall back to the next lower order if needed. This approach improves performance and avoids complex corner cases. So once zswap mTHP is there, I would also expect an API similar to swap_zeromap_entries_check(), for example zswap_entries_check(entry, nr), which can return whether the entries are fully, partially, or not at all in zswap, to replace the existing zswap_never_enabled(). Though I am not sure how cheaply zswap can implement it, swap_zeromap_entries_check() could be two simple bit operations: +static inline zeromap_stat_t swap_zeromap_entries_check(swp_entry_t entry, int nr) +{ + struct swap_info_struct *sis = swp_swap_info(entry); + unsigned long start = swp_offset(entry); + unsigned long end = start + nr; + + if (find_next_bit(sis->zeromap, end, start) == end) + return SWAP_ZEROMAP_NON; + if (find_next_zero_bit(sis->zeromap, end, start) == end) + return SWAP_ZEROMAP_FULL; + + return SWAP_ZEROMAP_PARTIAL; +} 3. swapcache is different from zeromap and zswap. Swapcache indicates that the memory is still available and should be re-mapped rather than allocating a new folio. Our previous patchset has implemented a full re-map of an mTHP in do_swap_page() as mentioned in 1. For the same reason as point 1, partial swapcache is a rare edge case. Not re-mapping it and instead allocating a new folio would add significant complexity.
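A rough sketch of that fall-back policy, with a hypothetical call-site name (the actual swap-in changes are not part of this series), could look like:

static int swapin_pick_order(swp_entry_t entry, int max_order)
{
	int order;

	/*
	 * Step down from the preferred order until the entries covered by
	 * the candidate large folio have a consistent zeromap state; a
	 * zswap_entries_check()-style test could be added alongside it in
	 * the same way once zswap gains mTHP support. Order 0 always
	 * qualifies, since a single entry cannot be partially zeromapped.
	 */
	for (order = max_order; order > 0; order--) {
		if (swap_zeromap_entries_check(entry, 1 << order) !=
		    SWAP_ZEROMAP_PARTIAL)
			break;
	}
	return order;
}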
On Wed, Sep 4, 2024 at 5:29 PM Barry Song <21cnbao@gmail.com> wrote: > > On Thu, Sep 5, 2024 at 11:57 AM Yosry Ahmed <yosryahmed@google.com> wrote: > > > > [..] > > > well. i feel i have a much cheaper way to implement this, which > > > can entirely iteration even in your original code: > > > > > > +/* > > > + * Return the number of entries which are zero-filled according to > > > + * swap_info_struct->zeromap. It isn't precise if the return value > > > + * is 1 for nr > 1. In this case, it means entries have inconsistent > > > + * zeromap. > > > + */ > > > +static inline unsigned int swap_zeromap_entries_count(swp_entry_t > > > entry, int nr) > > > > FWIW I am not really a fan of the count() function not returning an > > actual count. I think an enum with three states is more appropriate > > here, and renaming the function to swap_zeromap_entries_check() or > > similar. > > > > Make sense to me, what about the below? LGTM from a high level, but I didn't look closely at the bitmap interface usage. I am specifically unsure if we can pass 'end' as the size, but it seems like it should be fine. I will let Usama take a look. Thanks! > > From 24228a1e8426b8b05711a5efcfaae70abeb012c4 Mon Sep 17 00:00:00 2001 > From: Barry Song <v-songbaohua@oppo.com> > Date: Thu, 5 Sep 2024 11:56:03 +1200 > Subject: [PATCH] mm: fix handling zero for large folios with partial zeromap > > There could be a corner case where the first entry is non-zeromap, > but a subsequent entry is zeromap. In this case, we should not > return false. > > Additionally, the iteration of test_bit() is unnecessary and > can be replaced with bitmap operations, which are more efficient. > > Since swap_read_folio() can't handle reading a large folio that's > partially zeromap and partially non-zeromap, we've moved the code > to mm/swap.h so that others, like those working on swap-in, can > access it. > > Fixes: 0ca0c24e3211 ("mm: store zero pages to be swapped out in a bitmap") > Signed-off-by: Barry Song <v-songbaohua@oppo.com> > --- > mm/page_io.c | 27 ++++----------------------- > mm/swap.h | 29 +++++++++++++++++++++++++++++ > 2 files changed, 33 insertions(+), 23 deletions(-) > > diff --git a/mm/page_io.c b/mm/page_io.c > index 4bc77d1c6bfa..46907c9dd20b 100644 > --- a/mm/page_io.c > +++ b/mm/page_io.c > @@ -226,26 +226,6 @@ static void swap_zeromap_folio_clear(struct folio *folio) > } > } > > -/* > - * Return the index of the first subpage which is not zero-filled > - * according to swap_info_struct->zeromap. > - * If all pages are zero-filled according to zeromap, it will return > - * folio_nr_pages(folio). > - */ > -static unsigned int swap_zeromap_folio_test(struct folio *folio) > -{ > - struct swap_info_struct *sis = swp_swap_info(folio->swap); > - swp_entry_t entry; > - unsigned int i; > - > - for (i = 0; i < folio_nr_pages(folio); i++) { > - entry = page_swap_entry(folio_page(folio, i)); > - if (!test_bit(swp_offset(entry), sis->zeromap)) > - return i; > - } > - return i; > -} > - > /* > * We may have stale swap cache pages in memory: notice > * them here and get rid of the unnecessary final write. 
> @@ -524,9 +504,10 @@ static void sio_read_complete(struct kiocb *iocb, long ret) > > static bool swap_read_folio_zeromap(struct folio *folio) > { > - unsigned int idx = swap_zeromap_folio_test(folio); > + unsigned int nr_pages = folio_nr_pages(folio); > + zeromap_stat_t stat = swap_zeromap_entries_check(folio->swap, nr_pages); > > - if (idx == 0) > + if (stat == SWAP_ZEROMAP_NON) > return false; > > /* > @@ -534,7 +515,7 @@ static bool swap_read_folio_zeromap(struct folio *folio) > * currently handled. Return true without marking the folio uptodate so > * that an IO error is emitted (e.g. do_swap_page() will sigbus). > */ > - if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) > + if (WARN_ON_ONCE(stat == SWAP_ZEROMAP_PARTIAL)) > return true; > > folio_zero_range(folio, 0, folio_size(folio)); > diff --git a/mm/swap.h b/mm/swap.h > index f8711ff82f84..f8e3fa061c1d 100644 > --- a/mm/swap.h > +++ b/mm/swap.h > @@ -4,6 +4,12 @@ > > struct mempolicy; > > +typedef enum { > + SWAP_ZEROMAP_NON, > + SWAP_ZEROMAP_FULL, > + SWAP_ZEROMAP_PARTIAL > +} zeromap_stat_t; > + > #ifdef CONFIG_SWAP > #include <linux/swapops.h> /* for swp_offset */ > #include <linux/blk_types.h> /* for bio_end_io_t */ > @@ -80,6 +86,24 @@ static inline unsigned int folio_swap_flags(struct folio *folio) > { > return swp_swap_info(folio->swap)->flags; > } > + > +/* > + * Check if nr entries are all zeromap, non-zeromap or partially zeromap > + */ > +static inline zeromap_stat_t swap_zeromap_entries_check(swp_entry_t entry, int nr) > +{ > + struct swap_info_struct *sis = swp_swap_info(entry); > + unsigned long start = swp_offset(entry); > + unsigned long end = start + nr; > + > + if (find_next_bit(sis->zeromap, end, start) == end) > + return SWAP_ZEROMAP_NON; > + if (find_next_zero_bit(sis->zeromap, end, start) == end) > + return SWAP_ZEROMAP_FULL; > + > + return SWAP_ZEROMAP_PARTIAL; > +} > + > #else /* CONFIG_SWAP */ > struct swap_iocb; > static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) > @@ -171,6 +195,11 @@ static inline unsigned int folio_swap_flags(struct folio *folio) > { > return 0; > } > + > +static inline zeromap_stat_t swap_zeromap_entries_check(swp_entry_t entry, int nr) > +{ > + return SWAP_ZEROMAP_NONE; > +} > #endif /* CONFIG_SWAP */ > > #endif /* _MM_SWAP_H */ > -- > 2.34.1 > >
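On the question of passing 'end' as the size argument: find_next_bit(addr, size, offset) and find_next_zero_bit(addr, size, offset) scan the half-open bit range [offset, size) and return 'size' when nothing is found, so a small sketch of the contract the helper relies on (helper name hypothetical) would be:

static inline bool zeromap_range_all_clear(struct swap_info_struct *sis,
					   unsigned long start, unsigned long end)
{
	/*
	 * Using 'end' as the size confines the scan to this folio's own
	 * entries; the read stays inside the bitmap_zalloc(maxpages)
	 * allocation because the entries belong to one folio on this
	 * device, so end never exceeds the extent of the swap_map.
	 */
	return find_next_bit(sis->zeromap, end, start) == end;
}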
On Thu, Sep 5, 2024 at 12:03 AM Barry Song <21cnbao@gmail.com> wrote: > > On Thu, Sep 5, 2024 at 5:41 AM Yosry Ahmed <yosryahmed@google.com> wrote: > > > > [..] > > > > I understand the point of doing this to unblock the synchronous large > > > > folio swapin support work, but at some point we're gonna have to > > > > actually handle the cases where a large folio being swapped in is > > > > partially in the swap cache, zswap, the zeromap, etc. > > > > > > > > All these cases will need similar-ish handling, and I suspect we won't > > > > just skip swapping in large folios in all these cases. > > > > > > I agree that this is definitely the goal. `swap_read_folio()` should be a > > > dependable API that always returns reliable data, regardless of whether > > > `zeromap` or `zswap` is involved. Despite these issues, mTHP swap-in shouldn't > > > be held back. Significant efforts are underway to support large folios in > > > `zswap`, and progress is being made. Not to mention we've already allowed > > > `zeromap` to proceed, even though it doesn't support large folios. > > > > > > It's genuinely unfair to let the lack of mTHP support in `zeromap` and > > > `zswap` hold swap-in hostage. > > > > Hi Yosry, > > > Well, two points here: > > > > 1. I did not say that we should block the synchronous mTHP swapin work > > for this :) I said the next item on the TODO list for mTHP swapin > > support should be handling these cases. > > Thanks for your clarification! > > > > > 2. I think two things are getting conflated here. Zswap needs to > > support mTHP swapin*. Zeromap already supports mTHPs AFAICT. What is > > truly, and is outside the scope of zswap/zeromap, is being able to > > support hybrid mTHP swapin. > > > > When swapping in an mTHP, the swapped entries can be on disk, in the > > swapcache, in zswap, or in the zeromap. Even if all these things > > support mTHPs individually, we essentially need support to form an > > mTHP from swap entries in different backends. That's what I meant. > > Actually if we have that, we may not really need mTHP swapin support > > in zswap, because we can just form the large folio in the swap layer > > from multiple zswap entries. > > > > After further consideration, I've actually started to disagree with the idea > of supporting hybrid swapin (forming an mTHP from swap entries in different > backends). My reasoning is as follows: I do not have any data about this, so you could very well be right here. Handling hybrid swapin could be simply falling back to the smallest order we can swapin from a single backend. We can at least start with this, and collect data about how many mTHP swapins fallback due to hybrid backends. This way we only take the complexity if needed. I did imagine though that it's possible for two virtually contiguous folios to be swapped out to contiguous swap entries and end up in different media (e.g. if only one of them is zero-filled). I am not sure how rare it would be in practice. > > 1. The scenario where an mTHP is partially zeromap, partially zswap, etc., > would be an extremely rare case, as long as we're swapping out the mTHP as > a whole and all the modules are handling it accordingly. It's highly > unlikely to form this mix of zeromap, zswap, and swapcache unless the > contiguous VMA virtual address happens to get some small folios with > aligned and contiguous swap slots. Even then, they would need to be > partially zeromap and partially non-zeromap, zswap, etc. As I mentioned, we can start simple and collect data for this. 
If it's rare and we don't need to handle it, that's good. > > As you mentioned, zeromap handles mTHP as a whole during swapping > out, marking all subpages of the entire mTHP as zeromap rather than just > a subset of them. > > And swap-in can also entirely map a swapcache which is a large folio based > on our previous patchset which has been in mainline: > "mm: swap: entirely map large folios found in swapcache" > https://lore.kernel.org/all/20240529082824.150954-1-21cnbao@gmail.com/ > > It seems the only thing we're missing is zswap support for mTHP. It is still possible for two virtually contiguous folios to be swapped out to contiguous swap entries. It is also possible that a large folio is swapped out as a whole, then only a part of it is swapped in later due to memory pressure. If that part is later reclaimed again and gets added to the swapcache, we can run into the hybrid swapin situation. There may be other scenarios as well, I did not think this through. > > 2. Implementing hybrid swap-in would be extremely tricky and could disrupt > several software layers. I can share some pseudo code below: Yeah it definitely would be complex, so we need proper justification for it. > > swap_read_folio() > { > if (zeromap_full) > folio_read_from_zeromap() > else if (zswap_map_full) > folio_read_from_zswap() > else { > folio_read_from_swapfile() > if (zeromap_partial) > folio_read_from_zeromap_fixup() /* fill zero > for partially zeromap subpages */ > if (zwap_partial) > folio_read_from_zswap_fixup() /* zswap_load > for partially zswap-mapped subpages */ > > folio_mark_uptodate() > folio_unlock() > } > > We'd also need to modify folio_read_from_swapfile() to skip > folio_mark_uptodate() > and folio_unlock() after completing the BIO. This approach seems to > entirely disrupt > the software layers. > > This could also lead to unnecessary IO operations for subpages that > require fixup. > Since such cases are quite rare, I believe the added complexity isn't worth it. > > My point is that we should simply check that all PTEs have consistent zeromap, > zswap, and swapcache statuses before proceeding, otherwise fall back to the next > lower order if needed. This approach improves performance and avoids complex > corner cases. Agree that we should start with that, although we should probably fallback to the largest order we can swapin from a single backend, rather than the next lower order. > > So once zswap mTHP is there, I would also expect an API similar to > swap_zeromap_entries_check() > for example: > zswap_entries_check(entry, nr) which can return if we are having > full, non, and partial zswap to replace the existing > zswap_never_enabled(). I think a better API would be similar to what Usama had. Basically take in (entry, nr) and return how much of it is in zswap starting at entry, so that we can decide the swapin order. Maybe we can adjust your proposed swap_zeromap_entries_check() as well to do that? Basically return the number of swap entries in the zeromap starting at 'entry'. If 'entry' itself is not in the zeromap we return 0 naturally. That would be a small adjustment/fix over what Usama had, but implementing it with bitmap operations like you did would be better. 
> > Though I am not sure how cheap zswap can implement it, > swap_zeromap_entries_check() > could be two simple bit operations: > > +static inline zeromap_stat_t swap_zeromap_entries_check(swp_entry_t > entry, int nr) > +{ > + struct swap_info_struct *sis = swp_swap_info(entry); > + unsigned long start = swp_offset(entry); > + unsigned long end = start + nr; > + > + if (find_next_bit(sis->zeromap, end, start) == end) > + return SWAP_ZEROMAP_NON; > + if (find_next_zero_bit(sis->zeromap, end, start) == end) > + return SWAP_ZEROMAP_FULL; > + > + return SWAP_ZEROMAP_PARTIAL; > +} > > 3. swapcache is different from zeromap and zswap. Swapcache indicates > that the memory > is still available and should be re-mapped rather than allocating a > new folio. Our previous > patchset has implemented a full re-map of an mTHP in do_swap_page() as mentioned > in 1. > > For the same reason as point 1, partial swapcache is a rare edge case. > Not re-mapping it > and instead allocating a new folio would add significant complexity. > > > > > > > Nonetheless, `zeromap` and `zswap` are distinct cases. With `zeromap`, we > > > permit almost all mTHP swap-ins, except for those rare situations where > > > small folios that were swapped out happen to have contiguous and aligned > > > swap slots. > > > > > > swapcache is another quite different story, since our user scenarios begin from > > > the simplest sync io on mobile phones, we don't quite care about swapcache. > > > > Right. The reason I bring this up is as I mentioned above, there is a > > common problem of forming large folios from different sources, which > > includes the swap cache. The fact that synchronous swapin does not use > > the swapcache was a happy coincidence for you, as you can add support > > mTHP swapins without handling this case yet ;) > > As I mentioned above, I'd really rather filter out those corner cases > than support > them, not just for the current situation to unlock swap-in series :-) If they are indeed corner cases, then I definitely agree.
On Thu, Sep 5, 2024 at 7:55 PM Yosry Ahmed <yosryahmed@google.com> wrote: > > On Thu, Sep 5, 2024 at 12:03 AM Barry Song <21cnbao@gmail.com> wrote: > > > > On Thu, Sep 5, 2024 at 5:41 AM Yosry Ahmed <yosryahmed@google.com> wrote: > > > > > > [..] > > > > > I understand the point of doing this to unblock the synchronous large > > > > > folio swapin support work, but at some point we're gonna have to > > > > > actually handle the cases where a large folio being swapped in is > > > > > partially in the swap cache, zswap, the zeromap, etc. > > > > > > > > > > All these cases will need similar-ish handling, and I suspect we won't > > > > > just skip swapping in large folios in all these cases. > > > > > > > > I agree that this is definitely the goal. `swap_read_folio()` should be a > > > > dependable API that always returns reliable data, regardless of whether > > > > `zeromap` or `zswap` is involved. Despite these issues, mTHP swap-in shouldn't > > > > be held back. Significant efforts are underway to support large folios in > > > > `zswap`, and progress is being made. Not to mention we've already allowed > > > > `zeromap` to proceed, even though it doesn't support large folios. > > > > > > > > It's genuinely unfair to let the lack of mTHP support in `zeromap` and > > > > `zswap` hold swap-in hostage. > > > > > > > Hi Yosry, > > > > > Well, two points here: > > > > > > 1. I did not say that we should block the synchronous mTHP swapin work > > > for this :) I said the next item on the TODO list for mTHP swapin > > > support should be handling these cases. > > > > Thanks for your clarification! > > > > > > > > 2. I think two things are getting conflated here. Zswap needs to > > > support mTHP swapin*. Zeromap already supports mTHPs AFAICT. What is > > > truly, and is outside the scope of zswap/zeromap, is being able to > > > support hybrid mTHP swapin. > > > > > > When swapping in an mTHP, the swapped entries can be on disk, in the > > > swapcache, in zswap, or in the zeromap. Even if all these things > > > support mTHPs individually, we essentially need support to form an > > > mTHP from swap entries in different backends. That's what I meant. > > > Actually if we have that, we may not really need mTHP swapin support > > > in zswap, because we can just form the large folio in the swap layer > > > from multiple zswap entries. > > > > > > > After further consideration, I've actually started to disagree with the idea > > of supporting hybrid swapin (forming an mTHP from swap entries in different > > backends). My reasoning is as follows: > > I do not have any data about this, so you could very well be right > here. Handling hybrid swapin could be simply falling back to the > smallest order we can swapin from a single backend. We can at least > start with this, and collect data about how many mTHP swapins fallback > due to hybrid backends. This way we only take the complexity if > needed. > > I did imagine though that it's possible for two virtually contiguous > folios to be swapped out to contiguous swap entries and end up in > different media (e.g. if only one of them is zero-filled). I am not > sure how rare it would be in practice. > > > > > 1. The scenario where an mTHP is partially zeromap, partially zswap, etc., > > would be an extremely rare case, as long as we're swapping out the mTHP as > > a whole and all the modules are handling it accordingly. 
It's highly > > unlikely to form this mix of zeromap, zswap, and swapcache unless the > > contiguous VMA virtual address happens to get some small folios with > > aligned and contiguous swap slots. Even then, they would need to be > > partially zeromap and partially non-zeromap, zswap, etc. > > As I mentioned, we can start simple and collect data for this. If it's > rare and we don't need to handle it, that's good. > > > > > As you mentioned, zeromap handles mTHP as a whole during swapping > > out, marking all subpages of the entire mTHP as zeromap rather than just > > a subset of them. > > > > And swap-in can also entirely map a swapcache which is a large folio based > > on our previous patchset which has been in mainline: > > "mm: swap: entirely map large folios found in swapcache" > > https://lore.kernel.org/all/20240529082824.150954-1-21cnbao@gmail.com/ > > > > It seems the only thing we're missing is zswap support for mTHP. > > It is still possible for two virtually contiguous folios to be swapped > out to contiguous swap entries. It is also possible that a large folio > is swapped out as a whole, then only a part of it is swapped in later > due to memory pressure. If that part is later reclaimed again and gets > added to the swapcache, we can run into the hybrid swapin situation. > There may be other scenarios as well, I did not think this through. > > > > > 2. Implementing hybrid swap-in would be extremely tricky and could disrupt > > several software layers. I can share some pseudo code below: > > Yeah it definitely would be complex, so we need proper justification for it. > > > > > swap_read_folio() > > { > > if (zeromap_full) > > folio_read_from_zeromap() > > else if (zswap_map_full) > > folio_read_from_zswap() > > else { > > folio_read_from_swapfile() > > if (zeromap_partial) > > folio_read_from_zeromap_fixup() /* fill zero > > for partially zeromap subpages */ > > if (zwap_partial) > > folio_read_from_zswap_fixup() /* zswap_load > > for partially zswap-mapped subpages */ > > > > folio_mark_uptodate() > > folio_unlock() > > } > > > > We'd also need to modify folio_read_from_swapfile() to skip > > folio_mark_uptodate() > > and folio_unlock() after completing the BIO. This approach seems to > > entirely disrupt > > the software layers. > > > > This could also lead to unnecessary IO operations for subpages that > > require fixup. > > Since such cases are quite rare, I believe the added complexity isn't worth it. > > > > My point is that we should simply check that all PTEs have consistent zeromap, > > zswap, and swapcache statuses before proceeding, otherwise fall back to the next > > lower order if needed. This approach improves performance and avoids complex > > corner cases. > > Agree that we should start with that, although we should probably > fallback to the largest order we can swapin from a single backend, > rather than the next lower order. > > > > > So once zswap mTHP is there, I would also expect an API similar to > > swap_zeromap_entries_check() > > for example: > > zswap_entries_check(entry, nr) which can return if we are having > > full, non, and partial zswap to replace the existing > > zswap_never_enabled(). > > I think a better API would be similar to what Usama had. Basically > take in (entry, nr) and return how much of it is in zswap starting at > entry, so that we can decide the swapin order. > > Maybe we can adjust your proposed swap_zeromap_entries_check() as well > to do that? Basically return the number of swap entries in the zeromap > starting at 'entry'. 
If 'entry' itself is not in the zeromap we return > 0 naturally. That would be a small adjustment/fix over what Usama had, > but implementing it with bitmap operations like you did would be > better. I assume you means the below /* * Return the number of contiguous zeromap entries started from entry */ static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) { struct swap_info_struct *sis = swp_swap_info(entry); unsigned long start = swp_offset(entry); unsigned long end = start + nr; unsigned long idx; idx = find_next_bit(sis->zeromap, end, start); if (idx != start) return 0; return find_next_zero_bit(sis->zeromap, end, start) - idx; } If yes, I really like this idea. It seems much better than using an enum, which would require adding a new data structure :-) Additionally, returning the number allows callers to fall back to the largest possible order, rather than trying next lower orders sequentially. Hi Usama, what is your take on this? > > > > > Though I am not sure how cheap zswap can implement it, > > swap_zeromap_entries_check() > > could be two simple bit operations: > > > > +static inline zeromap_stat_t swap_zeromap_entries_check(swp_entry_t > > entry, int nr) > > +{ > > + struct swap_info_struct *sis = swp_swap_info(entry); > > + unsigned long start = swp_offset(entry); > > + unsigned long end = start + nr; > > + > > + if (find_next_bit(sis->zeromap, end, start) == end) > > + return SWAP_ZEROMAP_NON; > > + if (find_next_zero_bit(sis->zeromap, end, start) == end) > > + return SWAP_ZEROMAP_FULL; > > + > > + return SWAP_ZEROMAP_PARTIAL; > > +} > > > > 3. swapcache is different from zeromap and zswap. Swapcache indicates > > that the memory > > is still available and should be re-mapped rather than allocating a > > new folio. Our previous > > patchset has implemented a full re-map of an mTHP in do_swap_page() as mentioned > > in 1. > > > > For the same reason as point 1, partial swapcache is a rare edge case. > > Not re-mapping it > > and instead allocating a new folio would add significant complexity. > > > > > > > > > > Nonetheless, `zeromap` and `zswap` are distinct cases. With `zeromap`, we > > > > permit almost all mTHP swap-ins, except for those rare situations where > > > > small folios that were swapped out happen to have contiguous and aligned > > > > swap slots. > > > > > > > > swapcache is another quite different story, since our user scenarios begin from > > > > the simplest sync io on mobile phones, we don't quite care about swapcache. > > > > > > Right. The reason I bring this up is as I mentioned above, there is a > > > common problem of forming large folios from different sources, which > > > includes the swap cache. The fact that synchronous swapin does not use > > > the swapcache was a happy coincidence for you, as you can add support > > > mTHP swapins without handling this case yet ;) > > > > As I mentioned above, I'd really rather filter out those corner cases > > than support > > them, not just for the current situation to unlock swap-in series :-) > > If they are indeed corner cases, then I definitely agree. Thanks Barry
On Thu, Sep 5, 2024 at 8:49 PM Barry Song <21cnbao@gmail.com> wrote: > > On Thu, Sep 5, 2024 at 7:55 PM Yosry Ahmed <yosryahmed@google.com> wrote: > > > > On Thu, Sep 5, 2024 at 12:03 AM Barry Song <21cnbao@gmail.com> wrote: > > > > > > On Thu, Sep 5, 2024 at 5:41 AM Yosry Ahmed <yosryahmed@google.com> wrote: > > > > > > > > [..] > > > > > > I understand the point of doing this to unblock the synchronous large > > > > > > folio swapin support work, but at some point we're gonna have to > > > > > > actually handle the cases where a large folio being swapped in is > > > > > > partially in the swap cache, zswap, the zeromap, etc. > > > > > > > > > > > > All these cases will need similar-ish handling, and I suspect we won't > > > > > > just skip swapping in large folios in all these cases. > > > > > > > > > > I agree that this is definitely the goal. `swap_read_folio()` should be a > > > > > dependable API that always returns reliable data, regardless of whether > > > > > `zeromap` or `zswap` is involved. Despite these issues, mTHP swap-in shouldn't > > > > > be held back. Significant efforts are underway to support large folios in > > > > > `zswap`, and progress is being made. Not to mention we've already allowed > > > > > `zeromap` to proceed, even though it doesn't support large folios. > > > > > > > > > > It's genuinely unfair to let the lack of mTHP support in `zeromap` and > > > > > `zswap` hold swap-in hostage. > > > > > > > > > > Hi Yosry, > > > > > > > Well, two points here: > > > > > > > > 1. I did not say that we should block the synchronous mTHP swapin work > > > > for this :) I said the next item on the TODO list for mTHP swapin > > > > support should be handling these cases. > > > > > > Thanks for your clarification! > > > > > > > > > > > 2. I think two things are getting conflated here. Zswap needs to > > > > support mTHP swapin*. Zeromap already supports mTHPs AFAICT. What is > > > > truly, and is outside the scope of zswap/zeromap, is being able to > > > > support hybrid mTHP swapin. > > > > > > > > When swapping in an mTHP, the swapped entries can be on disk, in the > > > > swapcache, in zswap, or in the zeromap. Even if all these things > > > > support mTHPs individually, we essentially need support to form an > > > > mTHP from swap entries in different backends. That's what I meant. > > > > Actually if we have that, we may not really need mTHP swapin support > > > > in zswap, because we can just form the large folio in the swap layer > > > > from multiple zswap entries. > > > > > > > > > > After further consideration, I've actually started to disagree with the idea > > > of supporting hybrid swapin (forming an mTHP from swap entries in different > > > backends). My reasoning is as follows: > > > > I do not have any data about this, so you could very well be right > > here. Handling hybrid swapin could be simply falling back to the > > smallest order we can swapin from a single backend. We can at least > > start with this, and collect data about how many mTHP swapins fallback > > due to hybrid backends. This way we only take the complexity if > > needed. > > > > I did imagine though that it's possible for two virtually contiguous > > folios to be swapped out to contiguous swap entries and end up in > > different media (e.g. if only one of them is zero-filled). I am not > > sure how rare it would be in practice. > > > > > > > > 1. 
The scenario where an mTHP is partially zeromap, partially zswap, etc., > > > would be an extremely rare case, as long as we're swapping out the mTHP as > > > a whole and all the modules are handling it accordingly. It's highly > > > unlikely to form this mix of zeromap, zswap, and swapcache unless the > > > contiguous VMA virtual address happens to get some small folios with > > > aligned and contiguous swap slots. Even then, they would need to be > > > partially zeromap and partially non-zeromap, zswap, etc. > > > > As I mentioned, we can start simple and collect data for this. If it's > > rare and we don't need to handle it, that's good. > > > > > > > > As you mentioned, zeromap handles mTHP as a whole during swapping > > > out, marking all subpages of the entire mTHP as zeromap rather than just > > > a subset of them. > > > > > > And swap-in can also entirely map a swapcache which is a large folio based > > > on our previous patchset which has been in mainline: > > > "mm: swap: entirely map large folios found in swapcache" > > > https://lore.kernel.org/all/20240529082824.150954-1-21cnbao@gmail.com/ > > > > > > It seems the only thing we're missing is zswap support for mTHP. > > > > It is still possible for two virtually contiguous folios to be swapped > > out to contiguous swap entries. It is also possible that a large folio > > is swapped out as a whole, then only a part of it is swapped in later > > due to memory pressure. If that part is later reclaimed again and gets > > added to the swapcache, we can run into the hybrid swapin situation. > > There may be other scenarios as well, I did not think this through. > > > > > > > > 2. Implementing hybrid swap-in would be extremely tricky and could disrupt > > > several software layers. I can share some pseudo code below: > > > > Yeah it definitely would be complex, so we need proper justification for it. > > > > > > > > swap_read_folio() > > > { > > > if (zeromap_full) > > > folio_read_from_zeromap() > > > else if (zswap_map_full) > > > folio_read_from_zswap() > > > else { > > > folio_read_from_swapfile() > > > if (zeromap_partial) > > > folio_read_from_zeromap_fixup() /* fill zero > > > for partially zeromap subpages */ > > > if (zwap_partial) > > > folio_read_from_zswap_fixup() /* zswap_load > > > for partially zswap-mapped subpages */ > > > > > > folio_mark_uptodate() > > > folio_unlock() > > > } > > > > > > We'd also need to modify folio_read_from_swapfile() to skip > > > folio_mark_uptodate() > > > and folio_unlock() after completing the BIO. This approach seems to > > > entirely disrupt > > > the software layers. > > > > > > This could also lead to unnecessary IO operations for subpages that > > > require fixup. > > > Since such cases are quite rare, I believe the added complexity isn't worth it. > > > > > > My point is that we should simply check that all PTEs have consistent zeromap, > > > zswap, and swapcache statuses before proceeding, otherwise fall back to the next > > > lower order if needed. This approach improves performance and avoids complex > > > corner cases. > > > > Agree that we should start with that, although we should probably > > fallback to the largest order we can swapin from a single backend, > > rather than the next lower order. 
> > > > > > > > So once zswap mTHP is there, I would also expect an API similar to > > > swap_zeromap_entries_check() > > > for example: > > > zswap_entries_check(entry, nr) which can return if we are having > > > full, non, and partial zswap to replace the existing > > > zswap_never_enabled(). > > > > I think a better API would be similar to what Usama had. Basically > > take in (entry, nr) and return how much of it is in zswap starting at > > entry, so that we can decide the swapin order. > > > > Maybe we can adjust your proposed swap_zeromap_entries_check() as well > > to do that? Basically return the number of swap entries in the zeromap > > starting at 'entry'. If 'entry' itself is not in the zeromap we return > > 0 naturally. That would be a small adjustment/fix over what Usama had, > > but implementing it with bitmap operations like you did would be > > better. > > I assume you means the below > > /* > * Return the number of contiguous zeromap entries started from entry > */ > static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) > { > struct swap_info_struct *sis = swp_swap_info(entry); > unsigned long start = swp_offset(entry); > unsigned long end = start + nr; > unsigned long idx; > > idx = find_next_bit(sis->zeromap, end, start); > if (idx != start) > return 0; > > return find_next_zero_bit(sis->zeromap, end, start) - idx; > } > > If yes, I really like this idea. > > It seems much better than using an enum, which would require adding a new > data structure :-) Additionally, returning the number allows callers > to fall back > to the largest possible order, rather than trying next lower orders > sequentially. No, returning 0 after only checking first entry would still reintroduce the current bug, where the start entry is zeromap but other entries might not be. We need another value to indicate whether the entries are consistent if we want to avoid the enum: /* * Return the number of contiguous zeromap entries started from entry; * If all entries have consistent zeromap, *consistent will be true; * otherwise, false; */ static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr, bool *consistent) { struct swap_info_struct *sis = swp_swap_info(entry); unsigned long start = swp_offset(entry); unsigned long end = start + nr; unsigned long s_idx, c_idx; s_idx = find_next_bit(sis->zeromap, end, start); if (s_idx == end) { *consistent = true; return 0; } c_idx = find_next_zero_bit(sis->zeromap, end, start); if (c_idx == end) { *consistent = true; return nr; } *consistent = false; if (s_idx == start) return 0; return c_idx - s_idx; } I can actually switch the places of the "consistent" and returned number if that looks better. > > Hi Usama, > what is your take on this? > > > > > > > > > Though I am not sure how cheap zswap can implement it, > > > swap_zeromap_entries_check() > > > could be two simple bit operations: > > > > > > +static inline zeromap_stat_t swap_zeromap_entries_check(swp_entry_t > > > entry, int nr) > > > +{ > > > + struct swap_info_struct *sis = swp_swap_info(entry); > > > + unsigned long start = swp_offset(entry); > > > + unsigned long end = start + nr; > > > + > > > + if (find_next_bit(sis->zeromap, end, start) == end) > > > + return SWAP_ZEROMAP_NON; > > > + if (find_next_zero_bit(sis->zeromap, end, start) == end) > > > + return SWAP_ZEROMAP_FULL; > > > + > > > + return SWAP_ZEROMAP_PARTIAL; > > > +} > > > > > > 3. swapcache is different from zeromap and zswap. 
Swapcache indicates > > > that the memory > > > is still available and should be re-mapped rather than allocating a > > > new folio. Our previous > > > patchset has implemented a full re-map of an mTHP in do_swap_page() as mentioned > > > in 1. > > > > > > For the same reason as point 1, partial swapcache is a rare edge case. > > > Not re-mapping it > > > and instead allocating a new folio would add significant complexity. > > > > > > > > > > > > > Nonetheless, `zeromap` and `zswap` are distinct cases. With `zeromap`, we > > > > > permit almost all mTHP swap-ins, except for those rare situations where > > > > > small folios that were swapped out happen to have contiguous and aligned > > > > > swap slots. > > > > > > > > > > swapcache is another quite different story, since our user scenarios begin from > > > > > the simplest sync io on mobile phones, we don't quite care about swapcache. > > > > > > > > Right. The reason I bring this up is as I mentioned above, there is a > > > > common problem of forming large folios from different sources, which > > > > includes the swap cache. The fact that synchronous swapin does not use > > > > the swapcache was a happy coincidence for you, as you can add support > > > > mTHP swapins without handling this case yet ;) > > > > > > As I mentioned above, I'd really rather filter out those corner cases > > > than support > > > them, not just for the current situation to unlock swap-in series :-) > > > > If they are indeed corner cases, then I definitely agree. > > Thanks > Barry
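For reference, a compile-oriented sketch of the count-plus-consistency helper with the semantics described above (the return value is the length of the leading zeromap run), together with how the mm/page_io.c caller might consume it; the names follow the discussion, but the details are illustrative rather than a posted patch:

static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry,
						      int nr, bool *consistent)
{
	struct swap_info_struct *sis = swp_swap_info(entry);
	unsigned long start = swp_offset(entry);
	unsigned long end = start + nr;
	unsigned long first_set, first_zero;

	first_set = find_next_bit(sis->zeromap, end, start);
	if (first_set == end) {			/* no entry is zeromap */
		*consistent = true;
		return 0;
	}

	first_zero = find_next_zero_bit(sis->zeromap, end, start);
	if (first_zero == end) {		/* every entry is zeromap */
		*consistent = true;
		return nr;
	}

	*consistent = false;
	if (first_set != start)			/* leading entries are not zeromap */
		return 0;
	return first_zero - start;		/* length of the leading zeromap run */
}

/* mm/page_io.c side: decide how the folio should be read. */
static bool swap_read_folio_zeromap(struct folio *folio)
{
	unsigned int nr_pages = folio_nr_pages(folio);
	bool consistent;
	unsigned int nr = swap_zeromap_entries_count(folio->swap, nr_pages,
						     &consistent);

	/* None of the entries are zeromap: read from the real backend. */
	if (!nr && consistent)
		return false;

	/*
	 * A mix of zeromap and non-zeromap entries is not handled; return
	 * true without marking the folio uptodate so an IO error is emitted.
	 */
	if (WARN_ON_ONCE(!consistent))
		return true;

	folio_zero_range(folio, 0, folio_size(folio));
	folio_mark_uptodate(folio);
	return true;
}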
On Thu, Sep 5, 2024 at 10:10 PM Barry Song <21cnbao@gmail.com> wrote: > > On Thu, Sep 5, 2024 at 8:49 PM Barry Song <21cnbao@gmail.com> wrote: > > > > On Thu, Sep 5, 2024 at 7:55 PM Yosry Ahmed <yosryahmed@google.com> wrote: > > > > > > On Thu, Sep 5, 2024 at 12:03 AM Barry Song <21cnbao@gmail.com> wrote: > > > > > > > > On Thu, Sep 5, 2024 at 5:41 AM Yosry Ahmed <yosryahmed@google.com> wrote: > > > > > > > > > > [..] > > > > > > > I understand the point of doing this to unblock the synchronous large > > > > > > > folio swapin support work, but at some point we're gonna have to > > > > > > > actually handle the cases where a large folio being swapped in is > > > > > > > partially in the swap cache, zswap, the zeromap, etc. > > > > > > > > > > > > > > All these cases will need similar-ish handling, and I suspect we won't > > > > > > > just skip swapping in large folios in all these cases. > > > > > > > > > > > > I agree that this is definitely the goal. `swap_read_folio()` should be a > > > > > > dependable API that always returns reliable data, regardless of whether > > > > > > `zeromap` or `zswap` is involved. Despite these issues, mTHP swap-in shouldn't > > > > > > be held back. Significant efforts are underway to support large folios in > > > > > > `zswap`, and progress is being made. Not to mention we've already allowed > > > > > > `zeromap` to proceed, even though it doesn't support large folios. > > > > > > > > > > > > It's genuinely unfair to let the lack of mTHP support in `zeromap` and > > > > > > `zswap` hold swap-in hostage. > > > > > > > > > > > > > Hi Yosry, > > > > > > > > > Well, two points here: > > > > > > > > > > 1. I did not say that we should block the synchronous mTHP swapin work > > > > > for this :) I said the next item on the TODO list for mTHP swapin > > > > > support should be handling these cases. > > > > > > > > Thanks for your clarification! > > > > > > > > > > > > > > 2. I think two things are getting conflated here. Zswap needs to > > > > > support mTHP swapin*. Zeromap already supports mTHPs AFAICT. What is > > > > > truly, and is outside the scope of zswap/zeromap, is being able to > > > > > support hybrid mTHP swapin. > > > > > > > > > > When swapping in an mTHP, the swapped entries can be on disk, in the > > > > > swapcache, in zswap, or in the zeromap. Even if all these things > > > > > support mTHPs individually, we essentially need support to form an > > > > > mTHP from swap entries in different backends. That's what I meant. > > > > > Actually if we have that, we may not really need mTHP swapin support > > > > > in zswap, because we can just form the large folio in the swap layer > > > > > from multiple zswap entries. > > > > > > > > > > > > > After further consideration, I've actually started to disagree with the idea > > > > of supporting hybrid swapin (forming an mTHP from swap entries in different > > > > backends). My reasoning is as follows: > > > > > > I do not have any data about this, so you could very well be right > > > here. Handling hybrid swapin could be simply falling back to the > > > smallest order we can swapin from a single backend. We can at least > > > start with this, and collect data about how many mTHP swapins fallback > > > due to hybrid backends. This way we only take the complexity if > > > needed. > > > > > > I did imagine though that it's possible for two virtually contiguous > > > folios to be swapped out to contiguous swap entries and end up in > > > different media (e.g. if only one of them is zero-filled). 
I am not > > > sure how rare it would be in practice. > > > > > > > > > > > 1. The scenario where an mTHP is partially zeromap, partially zswap, etc., > > > > would be an extremely rare case, as long as we're swapping out the mTHP as > > > > a whole and all the modules are handling it accordingly. It's highly > > > > unlikely to form this mix of zeromap, zswap, and swapcache unless the > > > > contiguous VMA virtual address happens to get some small folios with > > > > aligned and contiguous swap slots. Even then, they would need to be > > > > partially zeromap and partially non-zeromap, zswap, etc. > > > > > > As I mentioned, we can start simple and collect data for this. If it's > > > rare and we don't need to handle it, that's good. > > > > > > > > > > > As you mentioned, zeromap handles mTHP as a whole during swapping > > > > out, marking all subpages of the entire mTHP as zeromap rather than just > > > > a subset of them. > > > > > > > > And swap-in can also entirely map a swapcache which is a large folio based > > > > on our previous patchset which has been in mainline: > > > > "mm: swap: entirely map large folios found in swapcache" > > > > https://lore.kernel.org/all/20240529082824.150954-1-21cnbao@gmail.com/ > > > > > > > > It seems the only thing we're missing is zswap support for mTHP. > > > > > > It is still possible for two virtually contiguous folios to be swapped > > > out to contiguous swap entries. It is also possible that a large folio > > > is swapped out as a whole, then only a part of it is swapped in later > > > due to memory pressure. If that part is later reclaimed again and gets > > > added to the swapcache, we can run into the hybrid swapin situation. > > > There may be other scenarios as well, I did not think this through. > > > > > > > > > > > 2. Implementing hybrid swap-in would be extremely tricky and could disrupt > > > > several software layers. I can share some pseudo code below: > > > > > > Yeah it definitely would be complex, so we need proper justification for it. > > > > > > > > > > > swap_read_folio() > > > > { > > > > if (zeromap_full) > > > > folio_read_from_zeromap() > > > > else if (zswap_map_full) > > > > folio_read_from_zswap() > > > > else { > > > > folio_read_from_swapfile() > > > > if (zeromap_partial) > > > > folio_read_from_zeromap_fixup() /* fill zero > > > > for partially zeromap subpages */ > > > > if (zwap_partial) > > > > folio_read_from_zswap_fixup() /* zswap_load > > > > for partially zswap-mapped subpages */ > > > > > > > > folio_mark_uptodate() > > > > folio_unlock() > > > > } > > > > > > > > We'd also need to modify folio_read_from_swapfile() to skip > > > > folio_mark_uptodate() > > > > and folio_unlock() after completing the BIO. This approach seems to > > > > entirely disrupt > > > > the software layers. > > > > > > > > This could also lead to unnecessary IO operations for subpages that > > > > require fixup. > > > > Since such cases are quite rare, I believe the added complexity isn't worth it. > > > > > > > > My point is that we should simply check that all PTEs have consistent zeromap, > > > > zswap, and swapcache statuses before proceeding, otherwise fall back to the next > > > > lower order if needed. This approach improves performance and avoids complex > > > > corner cases. > > > > > > Agree that we should start with that, although we should probably > > > fallback to the largest order we can swapin from a single backend, > > > rather than the next lower order. 
> > > > > > > > > > > So once zswap mTHP is there, I would also expect an API similar to > > > > swap_zeromap_entries_check() > > > > for example: > > > > zswap_entries_check(entry, nr) which can return if we are having > > > > full, non, and partial zswap to replace the existing > > > > zswap_never_enabled(). > > > > > > I think a better API would be similar to what Usama had. Basically > > > take in (entry, nr) and return how much of it is in zswap starting at > > > entry, so that we can decide the swapin order. > > > > > > Maybe we can adjust your proposed swap_zeromap_entries_check() as well > > > to do that? Basically return the number of swap entries in the zeromap > > > starting at 'entry'. If 'entry' itself is not in the zeromap we return > > > 0 naturally. That would be a small adjustment/fix over what Usama had, > > > but implementing it with bitmap operations like you did would be > > > better. > > > > I assume you means the below > > > > /* > > * Return the number of contiguous zeromap entries started from entry > > */ > > static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) > > { > > struct swap_info_struct *sis = swp_swap_info(entry); > > unsigned long start = swp_offset(entry); > > unsigned long end = start + nr; > > unsigned long idx; > > > > idx = find_next_bit(sis->zeromap, end, start); > > if (idx != start) > > return 0; > > > > return find_next_zero_bit(sis->zeromap, end, start) - idx; > > } > > > > If yes, I really like this idea. > > > > It seems much better than using an enum, which would require adding a new > > data structure :-) Additionally, returning the number allows callers > > to fall back > > to the largest possible order, rather than trying next lower orders > > sequentially. > > No, returning 0 after only checking first entry would still reintroduce > the current bug, where the start entry is zeromap but other entries > might not be. We need another value to indicate whether the entries > are consistent if we want to avoid the enum: > > /* > * Return the number of contiguous zeromap entries started from entry; > * If all entries have consistent zeromap, *consistent will be true; > * otherwise, false; > */ > static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, > int nr, bool *consistent) > { > struct swap_info_struct *sis = swp_swap_info(entry); > unsigned long start = swp_offset(entry); > unsigned long end = start + nr; > unsigned long s_idx, c_idx; > > s_idx = find_next_bit(sis->zeromap, end, start); > if (s_idx == end) { > *consistent = true; > return 0; > } > > c_idx = find_next_zero_bit(sis->zeromap, end, start); > if (c_idx == end) { > *consistent = true; > return nr; > } > > *consistent = false; > if (s_idx == start) > return 0; > return c_idx - s_idx; > } > > I can actually switch the places of the "consistent" and returned > number if that looks > better. 
I'd rather make it simpler by: /* * Check if all entries have consistent zeromap status, return true if * all entries are zeromap or non-zeromap, else return false; */ static inline bool swap_zeromap_entries_check(swp_entry_t entry, int nr) { struct swap_info_struct *sis = swp_swap_info(entry); unsigned long start = swp_offset(entry); unsigned long end = start + *nr; if (find_next_bit(sis->zeromap, end, start) == end) return true; if (find_next_zero_bit(sis->zeromap, end, start) == end) return true; return false; } mm/page_io.c can combine this with reading the zeromap of first entry to decide if it will read folio from zeromap; mm/memory.c only needs the bool to fallback to the largest possible order. static inline unsigned long thp_swap_suitable_orders(...) { int order, nr; order = highest_order(orders); while (orders) { nr = 1 << order; if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr && swap_zeromap_entries_check(entry, nr)) break; order = next_order(&orders, order); } return orders; } > > > > > Hi Usama, > > what is your take on this? > > > > > > > > > > > > > Though I am not sure how cheap zswap can implement it, > > > > swap_zeromap_entries_check() > > > > could be two simple bit operations: > > > > > > > > +static inline zeromap_stat_t swap_zeromap_entries_check(swp_entry_t > > > > entry, int nr) > > > > +{ > > > > + struct swap_info_struct *sis = swp_swap_info(entry); > > > > + unsigned long start = swp_offset(entry); > > > > + unsigned long end = start + nr; > > > > + > > > > + if (find_next_bit(sis->zeromap, end, start) == end) > > > > + return SWAP_ZEROMAP_NON; > > > > + if (find_next_zero_bit(sis->zeromap, end, start) == end) > > > > + return SWAP_ZEROMAP_FULL; > > > > + > > > > + return SWAP_ZEROMAP_PARTIAL; > > > > +} > > > > > > > > 3. swapcache is different from zeromap and zswap. Swapcache indicates > > > > that the memory > > > > is still available and should be re-mapped rather than allocating a > > > > new folio. Our previous > > > > patchset has implemented a full re-map of an mTHP in do_swap_page() as mentioned > > > > in 1. > > > > > > > > For the same reason as point 1, partial swapcache is a rare edge case. > > > > Not re-mapping it > > > > and instead allocating a new folio would add significant complexity. > > > > > > > > > > > > > > > > Nonetheless, `zeromap` and `zswap` are distinct cases. With `zeromap`, we > > > > > > permit almost all mTHP swap-ins, except for those rare situations where > > > > > > small folios that were swapped out happen to have contiguous and aligned > > > > > > swap slots. > > > > > > > > > > > > swapcache is another quite different story, since our user scenarios begin from > > > > > > the simplest sync io on mobile phones, we don't quite care about swapcache. > > > > > > > > > > Right. The reason I bring this up is as I mentioned above, there is a > > > > > common problem of forming large folios from different sources, which > > > > > includes the swap cache. The fact that synchronous swapin does not use > > > > > the swapcache was a happy coincidence for you, as you can add support > > > > > mTHP swapins without handling this case yet ;) > > > > > > > > As I mentioned above, I'd really rather filter out those corner cases > > > > than support > > > > them, not just for the current situation to unlock swap-in series :-) > > > > > > If they are indeed corner cases, then I definitely agree. > > Thanks Barry
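A compile-oriented version of the two snippets above, assuming nr is passed by value and spelling the offset check out via swp_offset(entry); the thp_swap_suitable_orders() argument list is an assumption, since the original sketch leaves it as "...":

/*
 * True when all nr entries starting at @entry have a consistent zeromap
 * status, i.e. they are either all zero-filled or all backed by real data.
 */
static inline bool swap_zeromap_entries_check(swp_entry_t entry, int nr)
{
	struct swap_info_struct *sis = swp_swap_info(entry);
	unsigned long start = swp_offset(entry);
	unsigned long end = start + nr;

	if (find_next_bit(sis->zeromap, end, start) == end)
		return true;		/* none of the entries are zeromap */
	if (find_next_zero_bit(sis->zeromap, end, start) == end)
		return true;		/* all of the entries are zeromap */
	return false;
}

/* mm/memory.c side: drop orders whose entries span inconsistent status. */
static inline unsigned long thp_swap_suitable_orders(swp_entry_t entry,
						     unsigned long addr,
						     unsigned long orders)
{
	int order = highest_order(orders);

	while (orders) {
		int nr = 1 << order;

		if ((addr >> PAGE_SHIFT) % nr == swp_offset(entry) % nr &&
		    swap_zeromap_entries_check(entry, nr))
			break;
		order = next_order(&orders, order);
	}
	return orders;
}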
On 05/09/2024 11:10, Barry Song wrote: > On Thu, Sep 5, 2024 at 8:49 PM Barry Song <21cnbao@gmail.com> wrote: >> >> On Thu, Sep 5, 2024 at 7:55 PM Yosry Ahmed <yosryahmed@google.com> wrote: >>> >>> On Thu, Sep 5, 2024 at 12:03 AM Barry Song <21cnbao@gmail.com> wrote: >>>> >>>> On Thu, Sep 5, 2024 at 5:41 AM Yosry Ahmed <yosryahmed@google.com> wrote: >>>>> >>>>> [..] >>>>>>> I understand the point of doing this to unblock the synchronous large >>>>>>> folio swapin support work, but at some point we're gonna have to >>>>>>> actually handle the cases where a large folio being swapped in is >>>>>>> partially in the swap cache, zswap, the zeromap, etc. >>>>>>> >>>>>>> All these cases will need similar-ish handling, and I suspect we won't >>>>>>> just skip swapping in large folios in all these cases. >>>>>> >>>>>> I agree that this is definitely the goal. `swap_read_folio()` should be a >>>>>> dependable API that always returns reliable data, regardless of whether >>>>>> `zeromap` or `zswap` is involved. Despite these issues, mTHP swap-in shouldn't >>>>>> be held back. Significant efforts are underway to support large folios in >>>>>> `zswap`, and progress is being made. Not to mention we've already allowed >>>>>> `zeromap` to proceed, even though it doesn't support large folios. >>>>>> >>>>>> It's genuinely unfair to let the lack of mTHP support in `zeromap` and >>>>>> `zswap` hold swap-in hostage. >>>>> >>>> >>>> Hi Yosry, >>>> >>>>> Well, two points here: >>>>> >>>>> 1. I did not say that we should block the synchronous mTHP swapin work >>>>> for this :) I said the next item on the TODO list for mTHP swapin >>>>> support should be handling these cases. >>>> >>>> Thanks for your clarification! >>>> >>>>> >>>>> 2. I think two things are getting conflated here. Zswap needs to >>>>> support mTHP swapin*. Zeromap already supports mTHPs AFAICT. What is >>>>> truly, and is outside the scope of zswap/zeromap, is being able to >>>>> support hybrid mTHP swapin. >>>>> >>>>> When swapping in an mTHP, the swapped entries can be on disk, in the >>>>> swapcache, in zswap, or in the zeromap. Even if all these things >>>>> support mTHPs individually, we essentially need support to form an >>>>> mTHP from swap entries in different backends. That's what I meant. >>>>> Actually if we have that, we may not really need mTHP swapin support >>>>> in zswap, because we can just form the large folio in the swap layer >>>>> from multiple zswap entries. >>>>> >>>> >>>> After further consideration, I've actually started to disagree with the idea >>>> of supporting hybrid swapin (forming an mTHP from swap entries in different >>>> backends). My reasoning is as follows: >>> >>> I do not have any data about this, so you could very well be right >>> here. Handling hybrid swapin could be simply falling back to the >>> smallest order we can swapin from a single backend. We can at least >>> start with this, and collect data about how many mTHP swapins fallback >>> due to hybrid backends. This way we only take the complexity if >>> needed. >>> >>> I did imagine though that it's possible for two virtually contiguous >>> folios to be swapped out to contiguous swap entries and end up in >>> different media (e.g. if only one of them is zero-filled). I am not >>> sure how rare it would be in practice. >>> >>>> >>>> 1. The scenario where an mTHP is partially zeromap, partially zswap, etc., >>>> would be an extremely rare case, as long as we're swapping out the mTHP as >>>> a whole and all the modules are handling it accordingly. 
It's highly >>>> unlikely to form this mix of zeromap, zswap, and swapcache unless the >>>> contiguous VMA virtual address happens to get some small folios with >>>> aligned and contiguous swap slots. Even then, they would need to be >>>> partially zeromap and partially non-zeromap, zswap, etc. >>> >>> As I mentioned, we can start simple and collect data for this. If it's >>> rare and we don't need to handle it, that's good. >>> >>>> >>>> As you mentioned, zeromap handles mTHP as a whole during swapping >>>> out, marking all subpages of the entire mTHP as zeromap rather than just >>>> a subset of them. >>>> >>>> And swap-in can also entirely map a swapcache which is a large folio based >>>> on our previous patchset which has been in mainline: >>>> "mm: swap: entirely map large folios found in swapcache" >>>> https://lore.kernel.org/all/20240529082824.150954-1-21cnbao@gmail.com/ >>>> >>>> It seems the only thing we're missing is zswap support for mTHP. >>> >>> It is still possible for two virtually contiguous folios to be swapped >>> out to contiguous swap entries. It is also possible that a large folio >>> is swapped out as a whole, then only a part of it is swapped in later >>> due to memory pressure. If that part is later reclaimed again and gets >>> added to the swapcache, we can run into the hybrid swapin situation. >>> There may be other scenarios as well, I did not think this through. >>> >>>> >>>> 2. Implementing hybrid swap-in would be extremely tricky and could disrupt >>>> several software layers. I can share some pseudo code below: >>> >>> Yeah it definitely would be complex, so we need proper justification for it. >>> >>>> >>>> swap_read_folio() >>>> { >>>> if (zeromap_full) >>>> folio_read_from_zeromap() >>>> else if (zswap_map_full) >>>> folio_read_from_zswap() >>>> else { >>>> folio_read_from_swapfile() >>>> if (zeromap_partial) >>>> folio_read_from_zeromap_fixup() /* fill zero >>>> for partially zeromap subpages */ >>>> if (zwap_partial) >>>> folio_read_from_zswap_fixup() /* zswap_load >>>> for partially zswap-mapped subpages */ >>>> >>>> folio_mark_uptodate() >>>> folio_unlock() >>>> } >>>> >>>> We'd also need to modify folio_read_from_swapfile() to skip >>>> folio_mark_uptodate() >>>> and folio_unlock() after completing the BIO. This approach seems to >>>> entirely disrupt >>>> the software layers. >>>> >>>> This could also lead to unnecessary IO operations for subpages that >>>> require fixup. >>>> Since such cases are quite rare, I believe the added complexity isn't worth it. >>>> >>>> My point is that we should simply check that all PTEs have consistent zeromap, >>>> zswap, and swapcache statuses before proceeding, otherwise fall back to the next >>>> lower order if needed. This approach improves performance and avoids complex >>>> corner cases. >>> >>> Agree that we should start with that, although we should probably >>> fallback to the largest order we can swapin from a single backend, >>> rather than the next lower order. >>> >>>> >>>> So once zswap mTHP is there, I would also expect an API similar to >>>> swap_zeromap_entries_check() >>>> for example: >>>> zswap_entries_check(entry, nr) which can return if we are having >>>> full, non, and partial zswap to replace the existing >>>> zswap_never_enabled(). >>> >>> I think a better API would be similar to what Usama had. Basically >>> take in (entry, nr) and return how much of it is in zswap starting at >>> entry, so that we can decide the swapin order. 
>>> >>> Maybe we can adjust your proposed swap_zeromap_entries_check() as well >>> to do that? Basically return the number of swap entries in the zeromap >>> starting at 'entry'. If 'entry' itself is not in the zeromap we return >>> 0 naturally. That would be a small adjustment/fix over what Usama had, >>> but implementing it with bitmap operations like you did would be >>> better. >> >> I assume you means the below >> >> /* >> * Return the number of contiguous zeromap entries started from entry >> */ >> static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) >> { >> struct swap_info_struct *sis = swp_swap_info(entry); >> unsigned long start = swp_offset(entry); >> unsigned long end = start + nr; >> unsigned long idx; >> >> idx = find_next_bit(sis->zeromap, end, start); >> if (idx != start) >> return 0; >> >> return find_next_zero_bit(sis->zeromap, end, start) - idx; >> } >> >> If yes, I really like this idea. >> >> It seems much better than using an enum, which would require adding a new >> data structure :-) Additionally, returning the number allows callers >> to fall back >> to the largest possible order, rather than trying next lower orders >> sequentially. > > No, returning 0 after only checking first entry would still reintroduce > the current bug, where the start entry is zeromap but other entries > might not be. We need another value to indicate whether the entries > are consistent if we want to avoid the enum: > > /* > * Return the number of contiguous zeromap entries started from entry; > * If all entries have consistent zeromap, *consistent will be true; > * otherwise, false; > */ > static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, > int nr, bool *consistent) > { > struct swap_info_struct *sis = swp_swap_info(entry); > unsigned long start = swp_offset(entry); > unsigned long end = start + nr; > unsigned long s_idx, c_idx; > > s_idx = find_next_bit(sis->zeromap, end, start); In all of the implementations you sent, you are using find_next_bit(..,end, start), but I believe it should be find_next_bit(..,nr, start)? TBH, I liked the enum implementation you had in https://lore.kernel.org/all/20240905002926.1055-1-21cnbao@gmail.com/ Its the easiest to review and understand, and least likely to introduce any bugs. But it could be a personal preference. The likelihood of having contiguous zeromap entries *that* is less than nr is very low right? If so we could go with the enum implementation? > if (s_idx == end) { > *consistent = true; > return 0; > } > > c_idx = find_next_zero_bit(sis->zeromap, end, start); > if (c_idx == end) { > *consistent = true; > return nr; > } > > *consistent = false; > if (s_idx == start) > return 0; > return c_idx - s_idx; > } > > I can actually switch the places of the "consistent" and returned > number if that looks > better. > >> >> Hi Usama, >> what is your take on this? 
>> >>> >>>> >>>> Though I am not sure how cheap zswap can implement it, >>>> swap_zeromap_entries_check() >>>> could be two simple bit operations: >>>> >>>> +static inline zeromap_stat_t swap_zeromap_entries_check(swp_entry_t >>>> entry, int nr) >>>> +{ >>>> + struct swap_info_struct *sis = swp_swap_info(entry); >>>> + unsigned long start = swp_offset(entry); >>>> + unsigned long end = start + nr; >>>> + >>>> + if (find_next_bit(sis->zeromap, end, start) == end) >>>> + return SWAP_ZEROMAP_NON; >>>> + if (find_next_zero_bit(sis->zeromap, end, start) == end) >>>> + return SWAP_ZEROMAP_FULL; >>>> + >>>> + return SWAP_ZEROMAP_PARTIAL; >>>> +} >>>> >>>> 3. swapcache is different from zeromap and zswap. Swapcache indicates >>>> that the memory >>>> is still available and should be re-mapped rather than allocating a >>>> new folio. Our previous >>>> patchset has implemented a full re-map of an mTHP in do_swap_page() as mentioned >>>> in 1. >>>> >>>> For the same reason as point 1, partial swapcache is a rare edge case. >>>> Not re-mapping it >>>> and instead allocating a new folio would add significant complexity. >>>> >>>>>> >>>>>> Nonetheless, `zeromap` and `zswap` are distinct cases. With `zeromap`, we >>>>>> permit almost all mTHP swap-ins, except for those rare situations where >>>>>> small folios that were swapped out happen to have contiguous and aligned >>>>>> swap slots. >>>>>> >>>>>> swapcache is another quite different story, since our user scenarios begin from >>>>>> the simplest sync io on mobile phones, we don't quite care about swapcache. >>>>> >>>>> Right. The reason I bring this up is as I mentioned above, there is a >>>>> common problem of forming large folios from different sources, which >>>>> includes the swap cache. The fact that synchronous swapin does not use >>>>> the swapcache was a happy coincidence for you, as you can add support >>>>> mTHP swapins without handling this case yet ;) >>>> >>>> As I mentioned above, I'd really rather filter out those corner cases >>>> than support >>>> them, not just for the current situation to unlock swap-in series :-) >>> >>> If they are indeed corner cases, then I definitely agree. >> >> Thanks >> Barry
On Thu, Sep 5, 2024 at 10:37 PM Usama Arif <usamaarif642@gmail.com> wrote: > > > > On 05/09/2024 11:10, Barry Song wrote: > > On Thu, Sep 5, 2024 at 8:49 PM Barry Song <21cnbao@gmail.com> wrote: > >> > >> On Thu, Sep 5, 2024 at 7:55 PM Yosry Ahmed <yosryahmed@google.com> wrote: > >>> > >>> On Thu, Sep 5, 2024 at 12:03 AM Barry Song <21cnbao@gmail.com> wrote: > >>>> > >>>> On Thu, Sep 5, 2024 at 5:41 AM Yosry Ahmed <yosryahmed@google.com> wrote: > >>>>> > >>>>> [..] > >>>>>>> I understand the point of doing this to unblock the synchronous large > >>>>>>> folio swapin support work, but at some point we're gonna have to > >>>>>>> actually handle the cases where a large folio being swapped in is > >>>>>>> partially in the swap cache, zswap, the zeromap, etc. > >>>>>>> > >>>>>>> All these cases will need similar-ish handling, and I suspect we won't > >>>>>>> just skip swapping in large folios in all these cases. > >>>>>> > >>>>>> I agree that this is definitely the goal. `swap_read_folio()` should be a > >>>>>> dependable API that always returns reliable data, regardless of whether > >>>>>> `zeromap` or `zswap` is involved. Despite these issues, mTHP swap-in shouldn't > >>>>>> be held back. Significant efforts are underway to support large folios in > >>>>>> `zswap`, and progress is being made. Not to mention we've already allowed > >>>>>> `zeromap` to proceed, even though it doesn't support large folios. > >>>>>> > >>>>>> It's genuinely unfair to let the lack of mTHP support in `zeromap` and > >>>>>> `zswap` hold swap-in hostage. > >>>>> > >>>> > >>>> Hi Yosry, > >>>> > >>>>> Well, two points here: > >>>>> > >>>>> 1. I did not say that we should block the synchronous mTHP swapin work > >>>>> for this :) I said the next item on the TODO list for mTHP swapin > >>>>> support should be handling these cases. > >>>> > >>>> Thanks for your clarification! > >>>> > >>>>> > >>>>> 2. I think two things are getting conflated here. Zswap needs to > >>>>> support mTHP swapin*. Zeromap already supports mTHPs AFAICT. What is > >>>>> truly, and is outside the scope of zswap/zeromap, is being able to > >>>>> support hybrid mTHP swapin. > >>>>> > >>>>> When swapping in an mTHP, the swapped entries can be on disk, in the > >>>>> swapcache, in zswap, or in the zeromap. Even if all these things > >>>>> support mTHPs individually, we essentially need support to form an > >>>>> mTHP from swap entries in different backends. That's what I meant. > >>>>> Actually if we have that, we may not really need mTHP swapin support > >>>>> in zswap, because we can just form the large folio in the swap layer > >>>>> from multiple zswap entries. > >>>>> > >>>> > >>>> After further consideration, I've actually started to disagree with the idea > >>>> of supporting hybrid swapin (forming an mTHP from swap entries in different > >>>> backends). My reasoning is as follows: > >>> > >>> I do not have any data about this, so you could very well be right > >>> here. Handling hybrid swapin could be simply falling back to the > >>> smallest order we can swapin from a single backend. We can at least > >>> start with this, and collect data about how many mTHP swapins fallback > >>> due to hybrid backends. This way we only take the complexity if > >>> needed. > >>> > >>> I did imagine though that it's possible for two virtually contiguous > >>> folios to be swapped out to contiguous swap entries and end up in > >>> different media (e.g. if only one of them is zero-filled). I am not > >>> sure how rare it would be in practice. 
> >>> > >>>> > >>>> 1. The scenario where an mTHP is partially zeromap, partially zswap, etc., > >>>> would be an extremely rare case, as long as we're swapping out the mTHP as > >>>> a whole and all the modules are handling it accordingly. It's highly > >>>> unlikely to form this mix of zeromap, zswap, and swapcache unless the > >>>> contiguous VMA virtual address happens to get some small folios with > >>>> aligned and contiguous swap slots. Even then, they would need to be > >>>> partially zeromap and partially non-zeromap, zswap, etc. > >>> > >>> As I mentioned, we can start simple and collect data for this. If it's > >>> rare and we don't need to handle it, that's good. > >>> > >>>> > >>>> As you mentioned, zeromap handles mTHP as a whole during swapping > >>>> out, marking all subpages of the entire mTHP as zeromap rather than just > >>>> a subset of them. > >>>> > >>>> And swap-in can also entirely map a swapcache which is a large folio based > >>>> on our previous patchset which has been in mainline: > >>>> "mm: swap: entirely map large folios found in swapcache" > >>>> https://lore.kernel.org/all/20240529082824.150954-1-21cnbao@gmail.com/ > >>>> > >>>> It seems the only thing we're missing is zswap support for mTHP. > >>> > >>> It is still possible for two virtually contiguous folios to be swapped > >>> out to contiguous swap entries. It is also possible that a large folio > >>> is swapped out as a whole, then only a part of it is swapped in later > >>> due to memory pressure. If that part is later reclaimed again and gets > >>> added to the swapcache, we can run into the hybrid swapin situation. > >>> There may be other scenarios as well, I did not think this through. > >>> > >>>> > >>>> 2. Implementing hybrid swap-in would be extremely tricky and could disrupt > >>>> several software layers. I can share some pseudo code below: > >>> > >>> Yeah it definitely would be complex, so we need proper justification for it. > >>> > >>>> > >>>> swap_read_folio() > >>>> { > >>>> if (zeromap_full) > >>>> folio_read_from_zeromap() > >>>> else if (zswap_map_full) > >>>> folio_read_from_zswap() > >>>> else { > >>>> folio_read_from_swapfile() > >>>> if (zeromap_partial) > >>>> folio_read_from_zeromap_fixup() /* fill zero > >>>> for partially zeromap subpages */ > >>>> if (zwap_partial) > >>>> folio_read_from_zswap_fixup() /* zswap_load > >>>> for partially zswap-mapped subpages */ > >>>> > >>>> folio_mark_uptodate() > >>>> folio_unlock() > >>>> } > >>>> > >>>> We'd also need to modify folio_read_from_swapfile() to skip > >>>> folio_mark_uptodate() > >>>> and folio_unlock() after completing the BIO. This approach seems to > >>>> entirely disrupt > >>>> the software layers. > >>>> > >>>> This could also lead to unnecessary IO operations for subpages that > >>>> require fixup. > >>>> Since such cases are quite rare, I believe the added complexity isn't worth it. > >>>> > >>>> My point is that we should simply check that all PTEs have consistent zeromap, > >>>> zswap, and swapcache statuses before proceeding, otherwise fall back to the next > >>>> lower order if needed. This approach improves performance and avoids complex > >>>> corner cases. > >>> > >>> Agree that we should start with that, although we should probably > >>> fallback to the largest order we can swapin from a single backend, > >>> rather than the next lower order. 
> >>> > >>>> > >>>> So once zswap mTHP is there, I would also expect an API similar to > >>>> swap_zeromap_entries_check() > >>>> for example: > >>>> zswap_entries_check(entry, nr) which can return if we are having > >>>> full, non, and partial zswap to replace the existing > >>>> zswap_never_enabled(). > >>> > >>> I think a better API would be similar to what Usama had. Basically > >>> take in (entry, nr) and return how much of it is in zswap starting at > >>> entry, so that we can decide the swapin order. > >>> > >>> Maybe we can adjust your proposed swap_zeromap_entries_check() as well > >>> to do that? Basically return the number of swap entries in the zeromap > >>> starting at 'entry'. If 'entry' itself is not in the zeromap we return > >>> 0 naturally. That would be a small adjustment/fix over what Usama had, > >>> but implementing it with bitmap operations like you did would be > >>> better. > >> > >> I assume you means the below > >> > >> /* > >> * Return the number of contiguous zeromap entries started from entry > >> */ > >> static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) > >> { > >> struct swap_info_struct *sis = swp_swap_info(entry); > >> unsigned long start = swp_offset(entry); > >> unsigned long end = start + nr; > >> unsigned long idx; > >> > >> idx = find_next_bit(sis->zeromap, end, start); > >> if (idx != start) > >> return 0; > >> > >> return find_next_zero_bit(sis->zeromap, end, start) - idx; > >> } > >> > >> If yes, I really like this idea. > >> > >> It seems much better than using an enum, which would require adding a new > >> data structure :-) Additionally, returning the number allows callers > >> to fall back > >> to the largest possible order, rather than trying next lower orders > >> sequentially. > > > > No, returning 0 after only checking first entry would still reintroduce > > the current bug, where the start entry is zeromap but other entries > > might not be. We need another value to indicate whether the entries > > are consistent if we want to avoid the enum: > > > > /* > > * Return the number of contiguous zeromap entries started from entry; > > * If all entries have consistent zeromap, *consistent will be true; > > * otherwise, false; > > */ > > static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, > > int nr, bool *consistent) > > { > > struct swap_info_struct *sis = swp_swap_info(entry); > > unsigned long start = swp_offset(entry); > > unsigned long end = start + nr; > > unsigned long s_idx, c_idx; > > > > s_idx = find_next_bit(sis->zeromap, end, start); > > In all of the implementations you sent, you are using find_next_bit(..,end, start), but > I believe it should be find_next_bit(..,nr, start)? I guess no, the tricky thing is that size means the size from the first bit of bitmap but not from the "start" bit? > > TBH, I liked the enum implementation you had in https://lore.kernel.org/all/20240905002926.1055-1-21cnbao@gmail.com/ > Its the easiest to review and understand, and least likely to introduce any bugs. > But it could be a personal preference. > The likelihood of having contiguous zeromap entries *that* is less than nr is very low right? > If so we could go with the enum implementation? what about the bool impementation i sent in the last email, it seems the simplest code. 
> > > > if (s_idx == end) { > > *consistent = true; > > return 0; > > } > > > > c_idx = find_next_zero_bit(sis->zeromap, end, start); > > if (c_idx == end) { > > *consistent = true; > > return nr; > > } > > > > *consistent = false; > > if (s_idx == start) > > return 0; > > return c_idx - s_idx; > > } > > > > I can actually switch the places of the "consistent" and returned > > number if that looks > > better. > > > >> > >> Hi Usama, > >> what is your take on this? > >> > >>> > >>>> > >>>> Though I am not sure how cheap zswap can implement it, > >>>> swap_zeromap_entries_check() > >>>> could be two simple bit operations: > >>>> > >>>> +static inline zeromap_stat_t swap_zeromap_entries_check(swp_entry_t > >>>> entry, int nr) > >>>> +{ > >>>> + struct swap_info_struct *sis = swp_swap_info(entry); > >>>> + unsigned long start = swp_offset(entry); > >>>> + unsigned long end = start + nr; > >>>> + > >>>> + if (find_next_bit(sis->zeromap, end, start) == end) > >>>> + return SWAP_ZEROMAP_NON; > >>>> + if (find_next_zero_bit(sis->zeromap, end, start) == end) > >>>> + return SWAP_ZEROMAP_FULL; > >>>> + > >>>> + return SWAP_ZEROMAP_PARTIAL; > >>>> +} > >>>> > >>>> 3. swapcache is different from zeromap and zswap. Swapcache indicates > >>>> that the memory > >>>> is still available and should be re-mapped rather than allocating a > >>>> new folio. Our previous > >>>> patchset has implemented a full re-map of an mTHP in do_swap_page() as mentioned > >>>> in 1. > >>>> > >>>> For the same reason as point 1, partial swapcache is a rare edge case. > >>>> Not re-mapping it > >>>> and instead allocating a new folio would add significant complexity. > >>>> > >>>>>> > >>>>>> Nonetheless, `zeromap` and `zswap` are distinct cases. With `zeromap`, we > >>>>>> permit almost all mTHP swap-ins, except for those rare situations where > >>>>>> small folios that were swapped out happen to have contiguous and aligned > >>>>>> swap slots. > >>>>>> > >>>>>> swapcache is another quite different story, since our user scenarios begin from > >>>>>> the simplest sync io on mobile phones, we don't quite care about swapcache. > >>>>> > >>>>> Right. The reason I bring this up is as I mentioned above, there is a > >>>>> common problem of forming large folios from different sources, which > >>>>> includes the swap cache. The fact that synchronous swapin does not use > >>>>> the swapcache was a happy coincidence for you, as you can add support > >>>>> mTHP swapins without handling this case yet ;) > >>>> > >>>> As I mentioned above, I'd really rather filter out those corner cases > >>>> than support > >>>> them, not just for the current situation to unlock swap-in series :-) > >>> > >>> If they are indeed corner cases, then I definitely agree. > >> Thanks Barry
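The find_next_bit() question above (whether the second argument should be "end" or "nr") comes down to the kernel helper taking an absolute limit counted from bit 0 of the bitmap, not a length relative to the start offset. A minimal userspace sketch of that semantic, using a hand-rolled stand-in (mock_find_next_bit is not the kernel implementation, only a mock with the same [offset, size) contract):

#include <stdio.h>

static unsigned long mock_find_next_bit(const unsigned long *map,
					unsigned long size, unsigned long offset)
{
	unsigned long i;

	/* Scan [offset, size) for the first set bit; return 'size' if none. */
	for (i = offset; i < size; i++)
		if (map[i / (8 * sizeof(long))] & (1UL << (i % (8 * sizeof(long)))))
			return i;
	return size;
}

int main(void)
{
	unsigned long zeromap[1] = { 0 };
	unsigned long start = 8, nr = 4;

	zeromap[0] |= 0xfUL << start;	/* mark offsets 8..11 as zero-filled */

	/* Correct usage: the limit is the absolute end bit, start + nr. */
	printf("%lu\n", mock_find_next_bit(zeromap, start + nr, start));	/* prints 8 */

	/* Passing nr as the limit scans [8, 4), an empty range, and "finds" nothing. */
	printf("%lu\n", mock_find_next_bit(zeromap, nr, start));		/* prints 4 */
	return 0;
}

Either way, a helper that keeps taking nr has to compute start + nr internally before calling the bitmap primitives, which is what the snippets quoted in this thread do.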
On 05/09/2024 11:42, Barry Song wrote: > On Thu, Sep 5, 2024 at 10:37 PM Usama Arif <usamaarif642@gmail.com> wrote: >> >> >> >> On 05/09/2024 11:10, Barry Song wrote: >>> On Thu, Sep 5, 2024 at 8:49 PM Barry Song <21cnbao@gmail.com> wrote: >>>> >>>> On Thu, Sep 5, 2024 at 7:55 PM Yosry Ahmed <yosryahmed@google.com> wrote: >>>>> >>>>> On Thu, Sep 5, 2024 at 12:03 AM Barry Song <21cnbao@gmail.com> wrote: >>>>>> >>>>>> On Thu, Sep 5, 2024 at 5:41 AM Yosry Ahmed <yosryahmed@google.com> wrote: >>>>>>> >>>>>>> [..] >>>>>>>>> I understand the point of doing this to unblock the synchronous large >>>>>>>>> folio swapin support work, but at some point we're gonna have to >>>>>>>>> actually handle the cases where a large folio being swapped in is >>>>>>>>> partially in the swap cache, zswap, the zeromap, etc. >>>>>>>>> >>>>>>>>> All these cases will need similar-ish handling, and I suspect we won't >>>>>>>>> just skip swapping in large folios in all these cases. >>>>>>>> >>>>>>>> I agree that this is definitely the goal. `swap_read_folio()` should be a >>>>>>>> dependable API that always returns reliable data, regardless of whether >>>>>>>> `zeromap` or `zswap` is involved. Despite these issues, mTHP swap-in shouldn't >>>>>>>> be held back. Significant efforts are underway to support large folios in >>>>>>>> `zswap`, and progress is being made. Not to mention we've already allowed >>>>>>>> `zeromap` to proceed, even though it doesn't support large folios. >>>>>>>> >>>>>>>> It's genuinely unfair to let the lack of mTHP support in `zeromap` and >>>>>>>> `zswap` hold swap-in hostage. >>>>>>> >>>>>> >>>>>> Hi Yosry, >>>>>> >>>>>>> Well, two points here: >>>>>>> >>>>>>> 1. I did not say that we should block the synchronous mTHP swapin work >>>>>>> for this :) I said the next item on the TODO list for mTHP swapin >>>>>>> support should be handling these cases. >>>>>> >>>>>> Thanks for your clarification! >>>>>> >>>>>>> >>>>>>> 2. I think two things are getting conflated here. Zswap needs to >>>>>>> support mTHP swapin*. Zeromap already supports mTHPs AFAICT. What is >>>>>>> truly, and is outside the scope of zswap/zeromap, is being able to >>>>>>> support hybrid mTHP swapin. >>>>>>> >>>>>>> When swapping in an mTHP, the swapped entries can be on disk, in the >>>>>>> swapcache, in zswap, or in the zeromap. Even if all these things >>>>>>> support mTHPs individually, we essentially need support to form an >>>>>>> mTHP from swap entries in different backends. That's what I meant. >>>>>>> Actually if we have that, we may not really need mTHP swapin support >>>>>>> in zswap, because we can just form the large folio in the swap layer >>>>>>> from multiple zswap entries. >>>>>>> >>>>>> >>>>>> After further consideration, I've actually started to disagree with the idea >>>>>> of supporting hybrid swapin (forming an mTHP from swap entries in different >>>>>> backends). My reasoning is as follows: >>>>> >>>>> I do not have any data about this, so you could very well be right >>>>> here. Handling hybrid swapin could be simply falling back to the >>>>> smallest order we can swapin from a single backend. We can at least >>>>> start with this, and collect data about how many mTHP swapins fallback >>>>> due to hybrid backends. This way we only take the complexity if >>>>> needed. >>>>> >>>>> I did imagine though that it's possible for two virtually contiguous >>>>> folios to be swapped out to contiguous swap entries and end up in >>>>> different media (e.g. if only one of them is zero-filled). 
I am not >>>>> sure how rare it would be in practice. >>>>> >>>>>> >>>>>> 1. The scenario where an mTHP is partially zeromap, partially zswap, etc., >>>>>> would be an extremely rare case, as long as we're swapping out the mTHP as >>>>>> a whole and all the modules are handling it accordingly. It's highly >>>>>> unlikely to form this mix of zeromap, zswap, and swapcache unless the >>>>>> contiguous VMA virtual address happens to get some small folios with >>>>>> aligned and contiguous swap slots. Even then, they would need to be >>>>>> partially zeromap and partially non-zeromap, zswap, etc. >>>>> >>>>> As I mentioned, we can start simple and collect data for this. If it's >>>>> rare and we don't need to handle it, that's good. >>>>> >>>>>> >>>>>> As you mentioned, zeromap handles mTHP as a whole during swapping >>>>>> out, marking all subpages of the entire mTHP as zeromap rather than just >>>>>> a subset of them. >>>>>> >>>>>> And swap-in can also entirely map a swapcache which is a large folio based >>>>>> on our previous patchset which has been in mainline: >>>>>> "mm: swap: entirely map large folios found in swapcache" >>>>>> https://lore.kernel.org/all/20240529082824.150954-1-21cnbao@gmail.com/ >>>>>> >>>>>> It seems the only thing we're missing is zswap support for mTHP. >>>>> >>>>> It is still possible for two virtually contiguous folios to be swapped >>>>> out to contiguous swap entries. It is also possible that a large folio >>>>> is swapped out as a whole, then only a part of it is swapped in later >>>>> due to memory pressure. If that part is later reclaimed again and gets >>>>> added to the swapcache, we can run into the hybrid swapin situation. >>>>> There may be other scenarios as well, I did not think this through. >>>>> >>>>>> >>>>>> 2. Implementing hybrid swap-in would be extremely tricky and could disrupt >>>>>> several software layers. I can share some pseudo code below: >>>>> >>>>> Yeah it definitely would be complex, so we need proper justification for it. >>>>> >>>>>> >>>>>> swap_read_folio() >>>>>> { >>>>>> if (zeromap_full) >>>>>> folio_read_from_zeromap() >>>>>> else if (zswap_map_full) >>>>>> folio_read_from_zswap() >>>>>> else { >>>>>> folio_read_from_swapfile() >>>>>> if (zeromap_partial) >>>>>> folio_read_from_zeromap_fixup() /* fill zero >>>>>> for partially zeromap subpages */ >>>>>> if (zwap_partial) >>>>>> folio_read_from_zswap_fixup() /* zswap_load >>>>>> for partially zswap-mapped subpages */ >>>>>> >>>>>> folio_mark_uptodate() >>>>>> folio_unlock() >>>>>> } >>>>>> >>>>>> We'd also need to modify folio_read_from_swapfile() to skip >>>>>> folio_mark_uptodate() >>>>>> and folio_unlock() after completing the BIO. This approach seems to >>>>>> entirely disrupt >>>>>> the software layers. >>>>>> >>>>>> This could also lead to unnecessary IO operations for subpages that >>>>>> require fixup. >>>>>> Since such cases are quite rare, I believe the added complexity isn't worth it. >>>>>> >>>>>> My point is that we should simply check that all PTEs have consistent zeromap, >>>>>> zswap, and swapcache statuses before proceeding, otherwise fall back to the next >>>>>> lower order if needed. This approach improves performance and avoids complex >>>>>> corner cases. >>>>> >>>>> Agree that we should start with that, although we should probably >>>>> fallback to the largest order we can swapin from a single backend, >>>>> rather than the next lower order. 
>>>>> >>>>>> >>>>>> So once zswap mTHP is there, I would also expect an API similar to >>>>>> swap_zeromap_entries_check() >>>>>> for example: >>>>>> zswap_entries_check(entry, nr) which can return if we are having >>>>>> full, non, and partial zswap to replace the existing >>>>>> zswap_never_enabled(). >>>>> >>>>> I think a better API would be similar to what Usama had. Basically >>>>> take in (entry, nr) and return how much of it is in zswap starting at >>>>> entry, so that we can decide the swapin order. >>>>> >>>>> Maybe we can adjust your proposed swap_zeromap_entries_check() as well >>>>> to do that? Basically return the number of swap entries in the zeromap >>>>> starting at 'entry'. If 'entry' itself is not in the zeromap we return >>>>> 0 naturally. That would be a small adjustment/fix over what Usama had, >>>>> but implementing it with bitmap operations like you did would be >>>>> better. >>>> >>>> I assume you means the below >>>> >>>> /* >>>> * Return the number of contiguous zeromap entries started from entry >>>> */ >>>> static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) >>>> { >>>> struct swap_info_struct *sis = swp_swap_info(entry); >>>> unsigned long start = swp_offset(entry); >>>> unsigned long end = start + nr; >>>> unsigned long idx; >>>> >>>> idx = find_next_bit(sis->zeromap, end, start); >>>> if (idx != start) >>>> return 0; >>>> >>>> return find_next_zero_bit(sis->zeromap, end, start) - idx; >>>> } >>>> >>>> If yes, I really like this idea. >>>> >>>> It seems much better than using an enum, which would require adding a new >>>> data structure :-) Additionally, returning the number allows callers >>>> to fall back >>>> to the largest possible order, rather than trying next lower orders >>>> sequentially. >>> >>> No, returning 0 after only checking first entry would still reintroduce >>> the current bug, where the start entry is zeromap but other entries >>> might not be. We need another value to indicate whether the entries >>> are consistent if we want to avoid the enum: >>> >>> /* >>> * Return the number of contiguous zeromap entries started from entry; >>> * If all entries have consistent zeromap, *consistent will be true; >>> * otherwise, false; >>> */ >>> static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, >>> int nr, bool *consistent) >>> { >>> struct swap_info_struct *sis = swp_swap_info(entry); >>> unsigned long start = swp_offset(entry); >>> unsigned long end = start + nr; >>> unsigned long s_idx, c_idx; >>> >>> s_idx = find_next_bit(sis->zeromap, end, start); >> >> In all of the implementations you sent, you are using find_next_bit(..,end, start), but >> I believe it should be find_next_bit(..,nr, start)? > > I guess no, the tricky thing is that size means the size from the first bit of > bitmap but not from the "start" bit? > Ah ok, we should probably change the function prototype to end. Its ok then if thats the case. >> TBH, I liked the enum implementation you had in https://lore.kernel.org/all/20240905002926.1055-1-21cnbao@gmail.com/ >> Its the easiest to review and understand, and least likely to introduce any bugs. >> But it could be a personal preference. >> The likelihood of having contiguous zeromap entries *that* is less than nr is very low right? >> If so we could go with the enum implementation? > > what about the bool impementation i sent in the last email, it seems the > simplest code. > Looking now. 
>> >> >>> if (s_idx == end) { >>> *consistent = true; >>> return 0; >>> } >>> >>> c_idx = find_next_zero_bit(sis->zeromap, end, start); >>> if (c_idx == end) { >>> *consistent = true; >>> return nr; >>> } >>> >>> *consistent = false; >>> if (s_idx == start) >>> return 0; >>> return c_idx - s_idx; >>> } >>> >>> I can actually switch the places of the "consistent" and returned >>> number if that looks >>> better. >>> >>>> >>>> Hi Usama, >>>> what is your take on this? >>>> >>>>> >>>>>> >>>>>> Though I am not sure how cheap zswap can implement it, >>>>>> swap_zeromap_entries_check() >>>>>> could be two simple bit operations: >>>>>> >>>>>> +static inline zeromap_stat_t swap_zeromap_entries_check(swp_entry_t >>>>>> entry, int nr) >>>>>> +{ >>>>>> + struct swap_info_struct *sis = swp_swap_info(entry); >>>>>> + unsigned long start = swp_offset(entry); >>>>>> + unsigned long end = start + nr; >>>>>> + >>>>>> + if (find_next_bit(sis->zeromap, end, start) == end) >>>>>> + return SWAP_ZEROMAP_NON; >>>>>> + if (find_next_zero_bit(sis->zeromap, end, start) == end) >>>>>> + return SWAP_ZEROMAP_FULL; >>>>>> + >>>>>> + return SWAP_ZEROMAP_PARTIAL; >>>>>> +} >>>>>> >>>>>> 3. swapcache is different from zeromap and zswap. Swapcache indicates >>>>>> that the memory >>>>>> is still available and should be re-mapped rather than allocating a >>>>>> new folio. Our previous >>>>>> patchset has implemented a full re-map of an mTHP in do_swap_page() as mentioned >>>>>> in 1. >>>>>> >>>>>> For the same reason as point 1, partial swapcache is a rare edge case. >>>>>> Not re-mapping it >>>>>> and instead allocating a new folio would add significant complexity. >>>>>> >>>>>>>> >>>>>>>> Nonetheless, `zeromap` and `zswap` are distinct cases. With `zeromap`, we >>>>>>>> permit almost all mTHP swap-ins, except for those rare situations where >>>>>>>> small folios that were swapped out happen to have contiguous and aligned >>>>>>>> swap slots. >>>>>>>> >>>>>>>> swapcache is another quite different story, since our user scenarios begin from >>>>>>>> the simplest sync io on mobile phones, we don't quite care about swapcache. >>>>>>> >>>>>>> Right. The reason I bring this up is as I mentioned above, there is a >>>>>>> common problem of forming large folios from different sources, which >>>>>>> includes the swap cache. The fact that synchronous swapin does not use >>>>>>> the swapcache was a happy coincidence for you, as you can add support >>>>>>> mTHP swapins without handling this case yet ;) >>>>>> >>>>>> As I mentioned above, I'd really rather filter out those corner cases >>>>>> than support >>>>>> them, not just for the current situation to unlock swap-in series :-) >>>>> >>>>> If they are indeed corner cases, then I definitely agree. >>>> > > Thanks > Barry
On 05/09/2024 11:33, Barry Song wrote: > On Thu, Sep 5, 2024 at 10:10 PM Barry Song <21cnbao@gmail.com> wrote: >> >> On Thu, Sep 5, 2024 at 8:49 PM Barry Song <21cnbao@gmail.com> wrote: >>> >>> On Thu, Sep 5, 2024 at 7:55 PM Yosry Ahmed <yosryahmed@google.com> wrote: >>>> >>>> On Thu, Sep 5, 2024 at 12:03 AM Barry Song <21cnbao@gmail.com> wrote: >>>>> >>>>> On Thu, Sep 5, 2024 at 5:41 AM Yosry Ahmed <yosryahmed@google.com> wrote: >>>>>> >>>>>> [..] >>>>>>>> I understand the point of doing this to unblock the synchronous large >>>>>>>> folio swapin support work, but at some point we're gonna have to >>>>>>>> actually handle the cases where a large folio being swapped in is >>>>>>>> partially in the swap cache, zswap, the zeromap, etc. >>>>>>>> >>>>>>>> All these cases will need similar-ish handling, and I suspect we won't >>>>>>>> just skip swapping in large folios in all these cases. >>>>>>> >>>>>>> I agree that this is definitely the goal. `swap_read_folio()` should be a >>>>>>> dependable API that always returns reliable data, regardless of whether >>>>>>> `zeromap` or `zswap` is involved. Despite these issues, mTHP swap-in shouldn't >>>>>>> be held back. Significant efforts are underway to support large folios in >>>>>>> `zswap`, and progress is being made. Not to mention we've already allowed >>>>>>> `zeromap` to proceed, even though it doesn't support large folios. >>>>>>> >>>>>>> It's genuinely unfair to let the lack of mTHP support in `zeromap` and >>>>>>> `zswap` hold swap-in hostage. >>>>>> >>>>> >>>>> Hi Yosry, >>>>> >>>>>> Well, two points here: >>>>>> >>>>>> 1. I did not say that we should block the synchronous mTHP swapin work >>>>>> for this :) I said the next item on the TODO list for mTHP swapin >>>>>> support should be handling these cases. >>>>> >>>>> Thanks for your clarification! >>>>> >>>>>> >>>>>> 2. I think two things are getting conflated here. Zswap needs to >>>>>> support mTHP swapin*. Zeromap already supports mTHPs AFAICT. What is >>>>>> truly, and is outside the scope of zswap/zeromap, is being able to >>>>>> support hybrid mTHP swapin. >>>>>> >>>>>> When swapping in an mTHP, the swapped entries can be on disk, in the >>>>>> swapcache, in zswap, or in the zeromap. Even if all these things >>>>>> support mTHPs individually, we essentially need support to form an >>>>>> mTHP from swap entries in different backends. That's what I meant. >>>>>> Actually if we have that, we may not really need mTHP swapin support >>>>>> in zswap, because we can just form the large folio in the swap layer >>>>>> from multiple zswap entries. >>>>>> >>>>> >>>>> After further consideration, I've actually started to disagree with the idea >>>>> of supporting hybrid swapin (forming an mTHP from swap entries in different >>>>> backends). My reasoning is as follows: >>>> >>>> I do not have any data about this, so you could very well be right >>>> here. Handling hybrid swapin could be simply falling back to the >>>> smallest order we can swapin from a single backend. We can at least >>>> start with this, and collect data about how many mTHP swapins fallback >>>> due to hybrid backends. This way we only take the complexity if >>>> needed. >>>> >>>> I did imagine though that it's possible for two virtually contiguous >>>> folios to be swapped out to contiguous swap entries and end up in >>>> different media (e.g. if only one of them is zero-filled). I am not >>>> sure how rare it would be in practice. >>>> >>>>> >>>>> 1. 
The scenario where an mTHP is partially zeromap, partially zswap, etc., >>>>> would be an extremely rare case, as long as we're swapping out the mTHP as >>>>> a whole and all the modules are handling it accordingly. It's highly >>>>> unlikely to form this mix of zeromap, zswap, and swapcache unless the >>>>> contiguous VMA virtual address happens to get some small folios with >>>>> aligned and contiguous swap slots. Even then, they would need to be >>>>> partially zeromap and partially non-zeromap, zswap, etc. >>>> >>>> As I mentioned, we can start simple and collect data for this. If it's >>>> rare and we don't need to handle it, that's good. >>>> >>>>> >>>>> As you mentioned, zeromap handles mTHP as a whole during swapping >>>>> out, marking all subpages of the entire mTHP as zeromap rather than just >>>>> a subset of them. >>>>> >>>>> And swap-in can also entirely map a swapcache which is a large folio based >>>>> on our previous patchset which has been in mainline: >>>>> "mm: swap: entirely map large folios found in swapcache" >>>>> https://lore.kernel.org/all/20240529082824.150954-1-21cnbao@gmail.com/ >>>>> >>>>> It seems the only thing we're missing is zswap support for mTHP. >>>> >>>> It is still possible for two virtually contiguous folios to be swapped >>>> out to contiguous swap entries. It is also possible that a large folio >>>> is swapped out as a whole, then only a part of it is swapped in later >>>> due to memory pressure. If that part is later reclaimed again and gets >>>> added to the swapcache, we can run into the hybrid swapin situation. >>>> There may be other scenarios as well, I did not think this through. >>>> >>>>> >>>>> 2. Implementing hybrid swap-in would be extremely tricky and could disrupt >>>>> several software layers. I can share some pseudo code below: >>>> >>>> Yeah it definitely would be complex, so we need proper justification for it. >>>> >>>>> >>>>> swap_read_folio() >>>>> { >>>>> if (zeromap_full) >>>>> folio_read_from_zeromap() >>>>> else if (zswap_map_full) >>>>> folio_read_from_zswap() >>>>> else { >>>>> folio_read_from_swapfile() >>>>> if (zeromap_partial) >>>>> folio_read_from_zeromap_fixup() /* fill zero >>>>> for partially zeromap subpages */ >>>>> if (zwap_partial) >>>>> folio_read_from_zswap_fixup() /* zswap_load >>>>> for partially zswap-mapped subpages */ >>>>> >>>>> folio_mark_uptodate() >>>>> folio_unlock() >>>>> } >>>>> >>>>> We'd also need to modify folio_read_from_swapfile() to skip >>>>> folio_mark_uptodate() >>>>> and folio_unlock() after completing the BIO. This approach seems to >>>>> entirely disrupt >>>>> the software layers. >>>>> >>>>> This could also lead to unnecessary IO operations for subpages that >>>>> require fixup. >>>>> Since such cases are quite rare, I believe the added complexity isn't worth it. >>>>> >>>>> My point is that we should simply check that all PTEs have consistent zeromap, >>>>> zswap, and swapcache statuses before proceeding, otherwise fall back to the next >>>>> lower order if needed. This approach improves performance and avoids complex >>>>> corner cases. >>>> >>>> Agree that we should start with that, although we should probably >>>> fallback to the largest order we can swapin from a single backend, >>>> rather than the next lower order. 
>>>> >>>>> >>>>> So once zswap mTHP is there, I would also expect an API similar to >>>>> swap_zeromap_entries_check() >>>>> for example: >>>>> zswap_entries_check(entry, nr) which can return if we are having >>>>> full, non, and partial zswap to replace the existing >>>>> zswap_never_enabled(). >>>> >>>> I think a better API would be similar to what Usama had. Basically >>>> take in (entry, nr) and return how much of it is in zswap starting at >>>> entry, so that we can decide the swapin order. >>>> >>>> Maybe we can adjust your proposed swap_zeromap_entries_check() as well >>>> to do that? Basically return the number of swap entries in the zeromap >>>> starting at 'entry'. If 'entry' itself is not in the zeromap we return >>>> 0 naturally. That would be a small adjustment/fix over what Usama had, >>>> but implementing it with bitmap operations like you did would be >>>> better. >>> >>> I assume you means the below >>> >>> /* >>> * Return the number of contiguous zeromap entries started from entry >>> */ >>> static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) >>> { >>> struct swap_info_struct *sis = swp_swap_info(entry); >>> unsigned long start = swp_offset(entry); >>> unsigned long end = start + nr; >>> unsigned long idx; >>> >>> idx = find_next_bit(sis->zeromap, end, start); >>> if (idx != start) >>> return 0; >>> >>> return find_next_zero_bit(sis->zeromap, end, start) - idx; >>> } >>> >>> If yes, I really like this idea. >>> >>> It seems much better than using an enum, which would require adding a new >>> data structure :-) Additionally, returning the number allows callers >>> to fall back >>> to the largest possible order, rather than trying next lower orders >>> sequentially. >> >> No, returning 0 after only checking first entry would still reintroduce >> the current bug, where the start entry is zeromap but other entries >> might not be. We need another value to indicate whether the entries >> are consistent if we want to avoid the enum: >> >> /* >> * Return the number of contiguous zeromap entries started from entry; >> * If all entries have consistent zeromap, *consistent will be true; >> * otherwise, false; >> */ >> static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, >> int nr, bool *consistent) >> { >> struct swap_info_struct *sis = swp_swap_info(entry); >> unsigned long start = swp_offset(entry); >> unsigned long end = start + nr; >> unsigned long s_idx, c_idx; >> >> s_idx = find_next_bit(sis->zeromap, end, start); >> if (s_idx == end) { >> *consistent = true; >> return 0; >> } >> >> c_idx = find_next_zero_bit(sis->zeromap, end, start); >> if (c_idx == end) { >> *consistent = true; >> return nr; >> } >> >> *consistent = false; >> if (s_idx == start) >> return 0; >> return c_idx - s_idx; >> } >> >> I can actually switch the places of the "consistent" and returned >> number if that looks >> better. > > I'd rather make it simpler by: > > /* > * Check if all entries have consistent zeromap status, return true if > * all entries are zeromap or non-zeromap, else return false; > */ > static inline bool swap_zeromap_entries_check(swp_entry_t entry, int nr) > { > struct swap_info_struct *sis = swp_swap_info(entry); > unsigned long start = swp_offset(entry); > unsigned long end = start + *nr; > I guess you meant end= start + nr here? > if (find_next_bit(sis->zeromap, end, start) == end) > return true; > if (find_next_zero_bit(sis->zeromap, end, start) == end) > return true; > So if zeromap is all false, this still returns true. 
We cant use this function in swap_read_folio_zeromap, to check at time of swapin if all were zeros, right? > return false; > } > > mm/page_io.c can combine this with reading the zeromap of first entry to > decide if it will read folio from zeromap; mm/memory.c only needs the bool > to fallback to the largest possible order. > > static inline unsigned long thp_swap_suitable_orders(...) > { > int order, nr; > > order = highest_order(orders); > > while (orders) { > nr = 1 << order; > if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr && > swap_zeromap_entries_check(entry, nr)) > break; > order = next_order(&orders, order); > } > > return orders; > } > >> >>> >>> Hi Usama, >>> what is your take on this? >>> >>>> >>>>> >>>>> Though I am not sure how cheap zswap can implement it, >>>>> swap_zeromap_entries_check() >>>>> could be two simple bit operations: >>>>> >>>>> +static inline zeromap_stat_t swap_zeromap_entries_check(swp_entry_t >>>>> entry, int nr) >>>>> +{ >>>>> + struct swap_info_struct *sis = swp_swap_info(entry); >>>>> + unsigned long start = swp_offset(entry); >>>>> + unsigned long end = start + nr; >>>>> + >>>>> + if (find_next_bit(sis->zeromap, end, start) == end) >>>>> + return SWAP_ZEROMAP_NON; >>>>> + if (find_next_zero_bit(sis->zeromap, end, start) == end) >>>>> + return SWAP_ZEROMAP_FULL; >>>>> + >>>>> + return SWAP_ZEROMAP_PARTIAL; >>>>> +} >>>>> >>>>> 3. swapcache is different from zeromap and zswap. Swapcache indicates >>>>> that the memory >>>>> is still available and should be re-mapped rather than allocating a >>>>> new folio. Our previous >>>>> patchset has implemented a full re-map of an mTHP in do_swap_page() as mentioned >>>>> in 1. >>>>> >>>>> For the same reason as point 1, partial swapcache is a rare edge case. >>>>> Not re-mapping it >>>>> and instead allocating a new folio would add significant complexity. >>>>> >>>>>>> >>>>>>> Nonetheless, `zeromap` and `zswap` are distinct cases. With `zeromap`, we >>>>>>> permit almost all mTHP swap-ins, except for those rare situations where >>>>>>> small folios that were swapped out happen to have contiguous and aligned >>>>>>> swap slots. >>>>>>> >>>>>>> swapcache is another quite different story, since our user scenarios begin from >>>>>>> the simplest sync io on mobile phones, we don't quite care about swapcache. >>>>>> >>>>>> Right. The reason I bring this up is as I mentioned above, there is a >>>>>> common problem of forming large folios from different sources, which >>>>>> includes the swap cache. The fact that synchronous swapin does not use >>>>>> the swapcache was a happy coincidence for you, as you can add support >>>>>> mTHP swapins without handling this case yet ;) >>>>> >>>>> As I mentioned above, I'd really rather filter out those corner cases >>>>> than support >>>>> them, not just for the current situation to unlock swap-in series :-) >>>> >>>> If they are indeed corner cases, then I definitely agree. >>> > > Thanks > Barry
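To illustrate the point raised above: a purely boolean consistency check answers "is the range uniform?" but not "uniformly what?", so all-clear and all-set both come back true and the swap-in path still has to look at the first bit. A small userspace sketch, with a mock helper standing in for the bitmap API (none of this is the proposed kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Mock consistency check over a single bitmap word: true when every bit in
 * [start, start + nr) has the same value, whatever that value is.
 */
static bool range_uniform(unsigned long map, unsigned long start, unsigned long nr)
{
	unsigned long first = (map >> start) & 1UL;
	unsigned long i;

	for (i = 1; i < nr; i++)
		if (((map >> (start + i)) & 1UL) != first)
			return false;
	return true;
}

int main(void)
{
	unsigned long start = 0, nr = 4;

	printf("all zero-filled : %d\n", range_uniform(0xf, start, nr));	/* 1 */
	printf("none zero-filled: %d\n", range_uniform(0x0, start, nr));	/* 1 */
	printf("mixed           : %d\n", range_uniform(0x5, start, nr));	/* 0 */

	/* The two uniform cases are told apart by the first bit, which is what
	 * the swap_read_folio_zeromap() sketch in the next mail does with test_bit().
	 */
	printf("first bit of 0xf: %lu\n", 0xfUL & 1UL);
	return 0;
}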
On Thu, Sep 5, 2024 at 10:53 PM Usama Arif <usamaarif642@gmail.com> wrote: > > > > On 05/09/2024 11:33, Barry Song wrote: > > On Thu, Sep 5, 2024 at 10:10 PM Barry Song <21cnbao@gmail.com> wrote: > >> > >> On Thu, Sep 5, 2024 at 8:49 PM Barry Song <21cnbao@gmail.com> wrote: > >>> > >>> On Thu, Sep 5, 2024 at 7:55 PM Yosry Ahmed <yosryahmed@google.com> wrote: > >>>> > >>>> On Thu, Sep 5, 2024 at 12:03 AM Barry Song <21cnbao@gmail.com> wrote: > >>>>> > >>>>> On Thu, Sep 5, 2024 at 5:41 AM Yosry Ahmed <yosryahmed@google.com> wrote: > >>>>>> > >>>>>> [..] > >>>>>>>> I understand the point of doing this to unblock the synchronous large > >>>>>>>> folio swapin support work, but at some point we're gonna have to > >>>>>>>> actually handle the cases where a large folio being swapped in is > >>>>>>>> partially in the swap cache, zswap, the zeromap, etc. > >>>>>>>> > >>>>>>>> All these cases will need similar-ish handling, and I suspect we won't > >>>>>>>> just skip swapping in large folios in all these cases. > >>>>>>> > >>>>>>> I agree that this is definitely the goal. `swap_read_folio()` should be a > >>>>>>> dependable API that always returns reliable data, regardless of whether > >>>>>>> `zeromap` or `zswap` is involved. Despite these issues, mTHP swap-in shouldn't > >>>>>>> be held back. Significant efforts are underway to support large folios in > >>>>>>> `zswap`, and progress is being made. Not to mention we've already allowed > >>>>>>> `zeromap` to proceed, even though it doesn't support large folios. > >>>>>>> > >>>>>>> It's genuinely unfair to let the lack of mTHP support in `zeromap` and > >>>>>>> `zswap` hold swap-in hostage. > >>>>>> > >>>>> > >>>>> Hi Yosry, > >>>>> > >>>>>> Well, two points here: > >>>>>> > >>>>>> 1. I did not say that we should block the synchronous mTHP swapin work > >>>>>> for this :) I said the next item on the TODO list for mTHP swapin > >>>>>> support should be handling these cases. > >>>>> > >>>>> Thanks for your clarification! > >>>>> > >>>>>> > >>>>>> 2. I think two things are getting conflated here. Zswap needs to > >>>>>> support mTHP swapin*. Zeromap already supports mTHPs AFAICT. What is > >>>>>> truly, and is outside the scope of zswap/zeromap, is being able to > >>>>>> support hybrid mTHP swapin. > >>>>>> > >>>>>> When swapping in an mTHP, the swapped entries can be on disk, in the > >>>>>> swapcache, in zswap, or in the zeromap. Even if all these things > >>>>>> support mTHPs individually, we essentially need support to form an > >>>>>> mTHP from swap entries in different backends. That's what I meant. > >>>>>> Actually if we have that, we may not really need mTHP swapin support > >>>>>> in zswap, because we can just form the large folio in the swap layer > >>>>>> from multiple zswap entries. > >>>>>> > >>>>> > >>>>> After further consideration, I've actually started to disagree with the idea > >>>>> of supporting hybrid swapin (forming an mTHP from swap entries in different > >>>>> backends). My reasoning is as follows: > >>>> > >>>> I do not have any data about this, so you could very well be right > >>>> here. Handling hybrid swapin could be simply falling back to the > >>>> smallest order we can swapin from a single backend. We can at least > >>>> start with this, and collect data about how many mTHP swapins fallback > >>>> due to hybrid backends. This way we only take the complexity if > >>>> needed. 
> >>>> > >>>> I did imagine though that it's possible for two virtually contiguous > >>>> folios to be swapped out to contiguous swap entries and end up in > >>>> different media (e.g. if only one of them is zero-filled). I am not > >>>> sure how rare it would be in practice. > >>>> > >>>>> > >>>>> 1. The scenario where an mTHP is partially zeromap, partially zswap, etc., > >>>>> would be an extremely rare case, as long as we're swapping out the mTHP as > >>>>> a whole and all the modules are handling it accordingly. It's highly > >>>>> unlikely to form this mix of zeromap, zswap, and swapcache unless the > >>>>> contiguous VMA virtual address happens to get some small folios with > >>>>> aligned and contiguous swap slots. Even then, they would need to be > >>>>> partially zeromap and partially non-zeromap, zswap, etc. > >>>> > >>>> As I mentioned, we can start simple and collect data for this. If it's > >>>> rare and we don't need to handle it, that's good. > >>>> > >>>>> > >>>>> As you mentioned, zeromap handles mTHP as a whole during swapping > >>>>> out, marking all subpages of the entire mTHP as zeromap rather than just > >>>>> a subset of them. > >>>>> > >>>>> And swap-in can also entirely map a swapcache which is a large folio based > >>>>> on our previous patchset which has been in mainline: > >>>>> "mm: swap: entirely map large folios found in swapcache" > >>>>> https://lore.kernel.org/all/20240529082824.150954-1-21cnbao@gmail.com/ > >>>>> > >>>>> It seems the only thing we're missing is zswap support for mTHP. > >>>> > >>>> It is still possible for two virtually contiguous folios to be swapped > >>>> out to contiguous swap entries. It is also possible that a large folio > >>>> is swapped out as a whole, then only a part of it is swapped in later > >>>> due to memory pressure. If that part is later reclaimed again and gets > >>>> added to the swapcache, we can run into the hybrid swapin situation. > >>>> There may be other scenarios as well, I did not think this through. > >>>> > >>>>> > >>>>> 2. Implementing hybrid swap-in would be extremely tricky and could disrupt > >>>>> several software layers. I can share some pseudo code below: > >>>> > >>>> Yeah it definitely would be complex, so we need proper justification for it. > >>>> > >>>>> > >>>>> swap_read_folio() > >>>>> { > >>>>> if (zeromap_full) > >>>>> folio_read_from_zeromap() > >>>>> else if (zswap_map_full) > >>>>> folio_read_from_zswap() > >>>>> else { > >>>>> folio_read_from_swapfile() > >>>>> if (zeromap_partial) > >>>>> folio_read_from_zeromap_fixup() /* fill zero > >>>>> for partially zeromap subpages */ > >>>>> if (zwap_partial) > >>>>> folio_read_from_zswap_fixup() /* zswap_load > >>>>> for partially zswap-mapped subpages */ > >>>>> > >>>>> folio_mark_uptodate() > >>>>> folio_unlock() > >>>>> } > >>>>> > >>>>> We'd also need to modify folio_read_from_swapfile() to skip > >>>>> folio_mark_uptodate() > >>>>> and folio_unlock() after completing the BIO. This approach seems to > >>>>> entirely disrupt > >>>>> the software layers. > >>>>> > >>>>> This could also lead to unnecessary IO operations for subpages that > >>>>> require fixup. > >>>>> Since such cases are quite rare, I believe the added complexity isn't worth it. > >>>>> > >>>>> My point is that we should simply check that all PTEs have consistent zeromap, > >>>>> zswap, and swapcache statuses before proceeding, otherwise fall back to the next > >>>>> lower order if needed. This approach improves performance and avoids complex > >>>>> corner cases. 
> >>>> > >>>> Agree that we should start with that, although we should probably > >>>> fallback to the largest order we can swapin from a single backend, > >>>> rather than the next lower order. > >>>> > >>>>> > >>>>> So once zswap mTHP is there, I would also expect an API similar to > >>>>> swap_zeromap_entries_check() > >>>>> for example: > >>>>> zswap_entries_check(entry, nr) which can return if we are having > >>>>> full, non, and partial zswap to replace the existing > >>>>> zswap_never_enabled(). > >>>> > >>>> I think a better API would be similar to what Usama had. Basically > >>>> take in (entry, nr) and return how much of it is in zswap starting at > >>>> entry, so that we can decide the swapin order. > >>>> > >>>> Maybe we can adjust your proposed swap_zeromap_entries_check() as well > >>>> to do that? Basically return the number of swap entries in the zeromap > >>>> starting at 'entry'. If 'entry' itself is not in the zeromap we return > >>>> 0 naturally. That would be a small adjustment/fix over what Usama had, > >>>> but implementing it with bitmap operations like you did would be > >>>> better. > >>> > >>> I assume you means the below > >>> > >>> /* > >>> * Return the number of contiguous zeromap entries started from entry > >>> */ > >>> static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr) > >>> { > >>> struct swap_info_struct *sis = swp_swap_info(entry); > >>> unsigned long start = swp_offset(entry); > >>> unsigned long end = start + nr; > >>> unsigned long idx; > >>> > >>> idx = find_next_bit(sis->zeromap, end, start); > >>> if (idx != start) > >>> return 0; > >>> > >>> return find_next_zero_bit(sis->zeromap, end, start) - idx; > >>> } > >>> > >>> If yes, I really like this idea. > >>> > >>> It seems much better than using an enum, which would require adding a new > >>> data structure :-) Additionally, returning the number allows callers > >>> to fall back > >>> to the largest possible order, rather than trying next lower orders > >>> sequentially. > >> > >> No, returning 0 after only checking first entry would still reintroduce > >> the current bug, where the start entry is zeromap but other entries > >> might not be. We need another value to indicate whether the entries > >> are consistent if we want to avoid the enum: > >> > >> /* > >> * Return the number of contiguous zeromap entries started from entry; > >> * If all entries have consistent zeromap, *consistent will be true; > >> * otherwise, false; > >> */ > >> static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, > >> int nr, bool *consistent) > >> { > >> struct swap_info_struct *sis = swp_swap_info(entry); > >> unsigned long start = swp_offset(entry); > >> unsigned long end = start + nr; > >> unsigned long s_idx, c_idx; > >> > >> s_idx = find_next_bit(sis->zeromap, end, start); > >> if (s_idx == end) { > >> *consistent = true; > >> return 0; > >> } > >> > >> c_idx = find_next_zero_bit(sis->zeromap, end, start); > >> if (c_idx == end) { > >> *consistent = true; > >> return nr; > >> } > >> > >> *consistent = false; > >> if (s_idx == start) > >> return 0; > >> return c_idx - s_idx; > >> } > >> > >> I can actually switch the places of the "consistent" and returned > >> number if that looks > >> better. 
> > > > I'd rather make it simpler by: > > > > /* > > * Check if all entries have consistent zeromap status, return true if > > * all entries are zeromap or non-zeromap, else return false; > > */ > > static inline bool swap_zeromap_entries_check(swp_entry_t entry, int nr) > > { > > struct swap_info_struct *sis = swp_swap_info(entry); > > unsigned long start = swp_offset(entry); > > unsigned long end = start + *nr; > > > I guess you meant end= start + nr here? right. > > > if (find_next_bit(sis->zeromap, end, start) == end) > > return true; > > if (find_next_zero_bit(sis->zeromap, end, start) == end) > > return true; > > > So if zeromap is all false, this still returns true. We cant use this function in swap_read_folio_zeromap, > to check at time of swapin if all were zeros, right? We can, my point is that swap_read_folio_zeromap() is the only function that actually needs the real value of zeromap; the others only care about consistency. So if we can avoid introducing a new enum across modules, we avoid it :-) static bool swap_read_folio_zeromap(struct folio *folio) { struct swap_info_struct *sis = swp_swap_info(folio->swap) unsigned int nr_pages = folio_nr_pages(folio); swp_entry_t entry = folio->swap; /* * Swapping in a large folio that is partially in the zeromap is not * currently handled. Return true without marking the folio uptodate so * that an IO error is emitted (e.g. do_swap_page() will sigbus). */ if (WARN_ON_ONCE(!swap_zeromap_entries_check(entry, nr_pages))) return true; if (!test_bit(swp_offset(entry), sis->zeromap)) return false; folio_zero_range(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); return true; } mm/memory.c only needs true or false, it doesn't care about the real value. > > > > return false; > > } > > > > mm/page_io.c can combine this with reading the zeromap of first entry to > > decide if it will read folio from zeromap; mm/memory.c only needs the bool > > to fallback to the largest possible order. > > > > static inline unsigned long thp_swap_suitable_orders(...) > > { > > int order, nr; > > > > order = highest_order(orders); > > > > while (orders) { > > nr = 1 << order; > > if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr && > > swap_zeromap_entries_check(entry, nr)) > > break; > > order = next_order(&orders, order); > > } > > > > return orders; > > } > > > >> > >>> > >>> Hi Usama, > >>> what is your take on this? > >>> > >>>> > >>>>> > >>>>> Though I am not sure how cheap zswap can implement it, > >>>>> swap_zeromap_entries_check() > >>>>> could be two simple bit operations: > >>>>> > >>>>> +static inline zeromap_stat_t swap_zeromap_entries_check(swp_entry_t > >>>>> entry, int nr) > >>>>> +{ > >>>>> + struct swap_info_struct *sis = swp_swap_info(entry); > >>>>> + unsigned long start = swp_offset(entry); > >>>>> + unsigned long end = start + nr; > >>>>> + > >>>>> + if (find_next_bit(sis->zeromap, end, start) == end) > >>>>> + return SWAP_ZEROMAP_NON; > >>>>> + if (find_next_zero_bit(sis->zeromap, end, start) == end) > >>>>> + return SWAP_ZEROMAP_FULL; > >>>>> + > >>>>> + return SWAP_ZEROMAP_PARTIAL; > >>>>> +} > >>>>> > >>>>> 3. swapcache is different from zeromap and zswap. Swapcache indicates > >>>>> that the memory > >>>>> is still available and should be re-mapped rather than allocating a > >>>>> new folio. Our previous > >>>>> patchset has implemented a full re-map of an mTHP in do_swap_page() as mentioned > >>>>> in 1. > >>>>> > >>>>> For the same reason as point 1, partial swapcache is a rare edge case. 
> >>>>> Not re-mapping it > >>>>> and instead allocating a new folio would add significant complexity. > >>>>> > >>>>>>> > >>>>>>> Nonetheless, `zeromap` and `zswap` are distinct cases. With `zeromap`, we > >>>>>>> permit almost all mTHP swap-ins, except for those rare situations where > >>>>>>> small folios that were swapped out happen to have contiguous and aligned > >>>>>>> swap slots. > >>>>>>> > >>>>>>> swapcache is another quite different story, since our user scenarios begin from > >>>>>>> the simplest sync io on mobile phones, we don't quite care about swapcache. > >>>>>> > >>>>>> Right. The reason I bring this up is as I mentioned above, there is a > >>>>>> common problem of forming large folios from different sources, which > >>>>>> includes the swap cache. The fact that synchronous swapin does not use > >>>>>> the swapcache was a happy coincidence for you, as you can add support > >>>>>> mTHP swapins without handling this case yet ;) > >>>>> > >>>>> As I mentioned above, I'd really rather filter out those corner cases > >>>>> than support > >>>>> them, not just for the current situation to unlock swap-in series :-) > >>>> > >>>> If they are indeed corner cases, then I definitely agree. > >>> > > Thanks Barry
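The thp_swap_suitable_orders() idea quoted in the mail above boils down to walking the candidate orders from largest to smallest and stopping at the first one whose swap offset is suitably aligned and whose zeromap bits are uniform. A rough userspace sketch of that fallback loop, reusing the same mock uniformity check as in the earlier sketch and ignoring everything else the real helper has to consider (the VMA address alignment term, other backends); the names here are illustrative only:

#include <stdbool.h>
#include <stdio.h>

static bool range_uniform(unsigned long map, unsigned long start, unsigned long nr)
{
	unsigned long first = (map >> start) & 1UL;
	unsigned long i;

	for (i = 1; i < nr; i++)
		if (((map >> (start + i)) & 1UL) != first)
			return false;
	return true;
}

/* Largest-first fallback: try order, then order - 1, ... until the range is
 * naturally aligned and uniform in the (mock) zeromap. Illustrative only.
 */
static int pick_swapin_order(unsigned long zeromap, unsigned long offset, int max_order)
{
	int order;

	for (order = max_order; order > 0; order--) {
		unsigned long nr = 1UL << order;

		if (offset % nr == 0 && range_uniform(zeromap, offset, nr))
			return order;
	}
	return 0;	/* fall back to a single page */
}

int main(void)
{
	/* Offsets 0-1 zero-filled, 2-3 not: order 2 is mixed, order 1 is uniform. */
	printf("chosen order: %d\n", pick_swapin_order(0x3, 0, 2));	/* 1 */
	return 0;
}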
[..] > > > > /* > > * Return the number of contiguous zeromap entries started from entry; > > * If all entries have consistent zeromap, *consistent will be true; > > * otherwise, false; > > */ > > static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, > > int nr, bool *consistent) > > { > > struct swap_info_struct *sis = swp_swap_info(entry); > > unsigned long start = swp_offset(entry); > > unsigned long end = start + nr; > > unsigned long s_idx, c_idx; > > > > s_idx = find_next_bit(sis->zeromap, end, start); > > if (s_idx == end) { > > *consistent = true; > > return 0; > > } > > > > c_idx = find_next_zero_bit(sis->zeromap, end, start); > > if (c_idx == end) { > > *consistent = true; > > return nr; > > } > > > > *consistent = false; > > if (s_idx == start) > > return 0; > > return c_idx - s_idx; > > } > > > > I can actually switch the places of the "consistent" and returned > > number if that looks > > better. > > I'd rather make it simpler by: > > /* > * Check if all entries have consistent zeromap status, return true if > * all entries are zeromap or non-zeromap, else return false; > */ > static inline bool swap_zeromap_entries_check(swp_entry_t entry, int nr) > { > struct swap_info_struct *sis = swp_swap_info(entry); > unsigned long start = swp_offset(entry); > unsigned long end = start + *nr; > > if (find_next_bit(sis->zeromap, end, start) == end) > return true; > if (find_next_zero_bit(sis->zeromap, end, start) == end) > return true; > > return false; > } We can start with a simple version like this, and when the time comes to implement the logic below we can decide if it's worth the complexity to return an exact number/order rather than a boolean to decide the swapin order. I think it will also depend on whether we can do the same for other backends (e.g. swapcache, zswap, etc). We can note that in the commit log or something. > > mm/page_io.c can combine this with reading the zeromap of first entry to > decide if it will read folio from zeromap; mm/memory.c only needs the bool > to fallback to the largest possible order. > > static inline unsigned long thp_swap_suitable_orders(...) > { > int order, nr; > > order = highest_order(orders); > > while (orders) { > nr = 1 << order; > if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr && > swap_zeromap_entries_check(entry, nr)) > break; > order = next_order(&orders, order); > } > > return orders; > } >
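On the "exact number/order rather than a boolean" point: a count-returning helper in the spirit of the swap_zeromap_entries_count() variants discussed earlier would let the caller derive the largest usable order in one step instead of probing order by order. A toy userspace sketch (mock bitmap word, alignment ignored for brevity, helper names hypothetical):

#include <stdio.h>

/* Length of the run of bits in [start, start + nr) that match bit 'start'.
 * A count-returning zeromap helper would report something like this.
 */
static unsigned long uniform_run_len(unsigned long map, unsigned long start,
				     unsigned long nr)
{
	unsigned long first = (map >> start) & 1UL;
	unsigned long i;

	for (i = 1; i < nr; i++)
		if (((map >> (start + i)) & 1UL) != first)
			break;
	return i;
}

/* Largest order such that (1 << order) <= run; alignment is ignored here. */
static int order_from_run(unsigned long run)
{
	int order = 0;

	while ((2UL << order) <= run)
		order++;
	return order;
}

int main(void)
{
	/* Offsets 0-5 zero-filled, 6-7 not: a run of 6 allows order 2 (4 pages). */
	unsigned long run = uniform_run_len(0x3f, 0, 8);

	printf("run %lu -> order %d\n", run, order_from_run(run));
	return 0;
}

Whether that extra precision is worth carrying across backends, versus the simple boolean check, is exactly the call being deferred above.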
On 05/09/2024 12:00, Barry Song wrote: > On Thu, Sep 5, 2024 at 10:53 PM Usama Arif <usamaarif642@gmail.com> wrote: >> >> >> >> On 05/09/2024 11:33, Barry Song wrote: >>> On Thu, Sep 5, 2024 at 10:10 PM Barry Song <21cnbao@gmail.com> wrote: >>>> >>>> On Thu, Sep 5, 2024 at 8:49 PM Barry Song <21cnbao@gmail.com> wrote: >>>>> >>>>> On Thu, Sep 5, 2024 at 7:55 PM Yosry Ahmed <yosryahmed@google.com> wrote: >>>>>> >>>>>> On Thu, Sep 5, 2024 at 12:03 AM Barry Song <21cnbao@gmail.com> wrote: >>>>>>> >>>>>>> On Thu, Sep 5, 2024 at 5:41 AM Yosry Ahmed <yosryahmed@google.com> wrote: >>>>>>>> >>>>>>>> [..] >>>>>>>>>> I understand the point of doing this to unblock the synchronous large >>>>>>>>>> folio swapin support work, but at some point we're gonna have to >>>>>>>>>> actually handle the cases where a large folio being swapped in is >>>>>>>>>> partially in the swap cache, zswap, the zeromap, etc. >>>>>>>>>> >>>>>>>>>> All these cases will need similar-ish handling, and I suspect we won't >>>>>>>>>> just skip swapping in large folios in all these cases. >>>>>>>>> >>>>>>>>> I agree that this is definitely the goal. `swap_read_folio()` should be a >>>>>>>>> dependable API that always returns reliable data, regardless of whether >>>>>>>>> `zeromap` or `zswap` is involved. Despite these issues, mTHP swap-in shouldn't >>>>>>>>> be held back. Significant efforts are underway to support large folios in >>>>>>>>> `zswap`, and progress is being made. Not to mention we've already allowed >>>>>>>>> `zeromap` to proceed, even though it doesn't support large folios. >>>>>>>>> >>>>>>>>> It's genuinely unfair to let the lack of mTHP support in `zeromap` and >>>>>>>>> `zswap` hold swap-in hostage. >>>>>>>> >>>>>>> >>>>>>> Hi Yosry, >>>>>>> >>>>>>>> Well, two points here: >>>>>>>> >>>>>>>> 1. I did not say that we should block the synchronous mTHP swapin work >>>>>>>> for this :) I said the next item on the TODO list for mTHP swapin >>>>>>>> support should be handling these cases. >>>>>>> >>>>>>> Thanks for your clarification! >>>>>>> >>>>>>>> >>>>>>>> 2. I think two things are getting conflated here. Zswap needs to >>>>>>>> support mTHP swapin*. Zeromap already supports mTHPs AFAICT. What is >>>>>>>> truly, and is outside the scope of zswap/zeromap, is being able to >>>>>>>> support hybrid mTHP swapin. >>>>>>>> >>>>>>>> When swapping in an mTHP, the swapped entries can be on disk, in the >>>>>>>> swapcache, in zswap, or in the zeromap. Even if all these things >>>>>>>> support mTHPs individually, we essentially need support to form an >>>>>>>> mTHP from swap entries in different backends. That's what I meant. >>>>>>>> Actually if we have that, we may not really need mTHP swapin support >>>>>>>> in zswap, because we can just form the large folio in the swap layer >>>>>>>> from multiple zswap entries. >>>>>>>> >>>>>>> >>>>>>> After further consideration, I've actually started to disagree with the idea >>>>>>> of supporting hybrid swapin (forming an mTHP from swap entries in different >>>>>>> backends). My reasoning is as follows: >>>>>> >>>>>> I do not have any data about this, so you could very well be right >>>>>> here. Handling hybrid swapin could be simply falling back to the >>>>>> smallest order we can swapin from a single backend. We can at least >>>>>> start with this, and collect data about how many mTHP swapins fallback >>>>>> due to hybrid backends. This way we only take the complexity if >>>>>> needed. 
>>>>>> >>>>>> I did imagine though that it's possible for two virtually contiguous >>>>>> folios to be swapped out to contiguous swap entries and end up in >>>>>> different media (e.g. if only one of them is zero-filled). I am not >>>>>> sure how rare it would be in practice. >>>>>> >>>>>>> >>>>>>> 1. The scenario where an mTHP is partially zeromap, partially zswap, etc., >>>>>>> would be an extremely rare case, as long as we're swapping out the mTHP as >>>>>>> a whole and all the modules are handling it accordingly. It's highly >>>>>>> unlikely to form this mix of zeromap, zswap, and swapcache unless the >>>>>>> contiguous VMA virtual address happens to get some small folios with >>>>>>> aligned and contiguous swap slots. Even then, they would need to be >>>>>>> partially zeromap and partially non-zeromap, zswap, etc. >>>>>> >>>>>> As I mentioned, we can start simple and collect data for this. If it's >>>>>> rare and we don't need to handle it, that's good. >>>>>> >>>>>>> >>>>>>> As you mentioned, zeromap handles mTHP as a whole during swapping >>>>>>> out, marking all subpages of the entire mTHP as zeromap rather than just >>>>>>> a subset of them. >>>>>>> >>>>>>> And swap-in can also entirely map a swapcache which is a large folio based >>>>>>> on our previous patchset which has been in mainline: >>>>>>> "mm: swap: entirely map large folios found in swapcache" >>>>>>> https://lore.kernel.org/all/20240529082824.150954-1-21cnbao@gmail.com/ >>>>>>> >>>>>>> It seems the only thing we're missing is zswap support for mTHP. >>>>>> >>>>>> It is still possible for two virtually contiguous folios to be swapped >>>>>> out to contiguous swap entries. It is also possible that a large folio >>>>>> is swapped out as a whole, then only a part of it is swapped in later >>>>>> due to memory pressure. If that part is later reclaimed again and gets >>>>>> added to the swapcache, we can run into the hybrid swapin situation. >>>>>> There may be other scenarios as well, I did not think this through. >>>>>> >>>>>>> >>>>>>> 2. Implementing hybrid swap-in would be extremely tricky and could disrupt >>>>>>> several software layers. I can share some pseudo code below: >>>>>> >>>>>> Yeah it definitely would be complex, so we need proper justification for it. >>>>>> >>>>>>> >>>>>>> swap_read_folio() >>>>>>> { >>>>>>> if (zeromap_full) >>>>>>> folio_read_from_zeromap() >>>>>>> else if (zswap_map_full) >>>>>>> folio_read_from_zswap() >>>>>>> else { >>>>>>> folio_read_from_swapfile() >>>>>>> if (zeromap_partial) >>>>>>> folio_read_from_zeromap_fixup() /* fill zero >>>>>>> for partially zeromap subpages */ >>>>>>> if (zwap_partial) >>>>>>> folio_read_from_zswap_fixup() /* zswap_load >>>>>>> for partially zswap-mapped subpages */ >>>>>>> >>>>>>> folio_mark_uptodate() >>>>>>> folio_unlock() >>>>>>> } >>>>>>> >>>>>>> We'd also need to modify folio_read_from_swapfile() to skip >>>>>>> folio_mark_uptodate() >>>>>>> and folio_unlock() after completing the BIO. This approach seems to >>>>>>> entirely disrupt >>>>>>> the software layers. >>>>>>> >>>>>>> This could also lead to unnecessary IO operations for subpages that >>>>>>> require fixup. >>>>>>> Since such cases are quite rare, I believe the added complexity isn't worth it. >>>>>>> >>>>>>> My point is that we should simply check that all PTEs have consistent zeromap, >>>>>>> zswap, and swapcache statuses before proceeding, otherwise fall back to the next >>>>>>> lower order if needed. This approach improves performance and avoids complex >>>>>>> corner cases. 
>>>>>>
>>>>>> Agree that we should start with that, although we should probably fall back to the largest order we can swap in from a single backend, rather than the next lower order.
>>>>>>
>>>>>>> So once zswap mTHP is there, I would also expect an API similar to swap_zeromap_entries_check(), for example zswap_entries_check(entry, nr), which can return whether we have full, none, or partial zswap, to replace the existing zswap_never_enabled().
>>>>>>
>>>>>> I think a better API would be similar to what Usama had. Basically take in (entry, nr) and return how much of it is in zswap starting at entry, so that we can decide the swapin order.
>>>>>>
>>>>>> Maybe we can adjust your proposed swap_zeromap_entries_check() as well to do that? Basically return the number of swap entries in the zeromap starting at 'entry'. If 'entry' itself is not in the zeromap we return 0 naturally. That would be a small adjustment/fix over what Usama had, but implementing it with bitmap operations like you did would be better.
>>>>>
>>>>> I assume you mean the below:
>>>>>
>>>>> /*
>>>>>  * Return the number of contiguous zeromap entries starting from entry
>>>>>  */
>>>>> static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry, int nr)
>>>>> {
>>>>>         struct swap_info_struct *sis = swp_swap_info(entry);
>>>>>         unsigned long start = swp_offset(entry);
>>>>>         unsigned long end = start + nr;
>>>>>         unsigned long idx;
>>>>>
>>>>>         idx = find_next_bit(sis->zeromap, end, start);
>>>>>         if (idx != start)
>>>>>                 return 0;
>>>>>
>>>>>         return find_next_zero_bit(sis->zeromap, end, start) - idx;
>>>>> }
>>>>>
>>>>> If yes, I really like this idea.
>>>>>
>>>>> It seems much better than using an enum, which would require adding a new data structure :-) Additionally, returning the number allows callers to fall back to the largest possible order, rather than trying next lower orders sequentially.
>>>>
>>>> No, returning 0 after only checking the first entry would still reintroduce the current bug, where the start entry is zeromap but other entries might not be. We need another value to indicate whether the entries are consistent if we want to avoid the enum:
>>>>
>>>> /*
>>>>  * Return the number of contiguous zeromap entries starting from entry;
>>>>  * if all entries have consistent zeromap, *consistent will be true,
>>>>  * otherwise false.
>>>>  */
>>>> static inline unsigned int swap_zeromap_entries_count(swp_entry_t entry,
>>>>                 int nr, bool *consistent)
>>>> {
>>>>         struct swap_info_struct *sis = swp_swap_info(entry);
>>>>         unsigned long start = swp_offset(entry);
>>>>         unsigned long end = start + nr;
>>>>         unsigned long s_idx, c_idx;
>>>>
>>>>         s_idx = find_next_bit(sis->zeromap, end, start);
>>>>         if (s_idx == end) {
>>>>                 *consistent = true;
>>>>                 return 0;
>>>>         }
>>>>
>>>>         c_idx = find_next_zero_bit(sis->zeromap, end, start);
>>>>         if (c_idx == end) {
>>>>                 *consistent = true;
>>>>                 return nr;
>>>>         }
>>>>
>>>>         *consistent = false;
>>>>         if (s_idx == start)
>>>>                 return 0;
>>>>         return c_idx - s_idx;
>>>> }
>>>>
>>>> I can actually switch the places of the "consistent" and returned number if that looks better.
>>>
>>> I'd rather make it simpler by:
>>>
>>> /*
>>>  * Check if all entries have consistent zeromap status, return true if
>>>  * all entries are zeromap or non-zeromap, else return false;
>>>  */
>>> static inline bool swap_zeromap_entries_check(swp_entry_t entry, int nr)
>>> {
>>>         struct swap_info_struct *sis = swp_swap_info(entry);
>>>         unsigned long start = swp_offset(entry);
>>>         unsigned long end = start + *nr;
>>>
>> I guess you meant end = start + nr here?
>
> Right.
>
>>
>>>         if (find_next_bit(sis->zeromap, end, start) == end)
>>>                 return true;
>>>         if (find_next_zero_bit(sis->zeromap, end, start) == end)
>>>                 return true;
>>>
>> So if zeromap is all false, this still returns true. We can't use this function in swap_read_folio_zeromap() to check at swapin time whether all entries were zero, right?
>
> We can, my point is that swap_read_folio_zeromap() is the only function that actually needs the real value of zeromap; the others only care about consistency. So if we can avoid introducing a new enum across modules, we avoid it :-)
>
> static bool swap_read_folio_zeromap(struct folio *folio)
> {
>         struct swap_info_struct *sis = swp_swap_info(folio->swap);
>         unsigned int nr_pages = folio_nr_pages(folio);
>         swp_entry_t entry = folio->swap;
>
>         /*
>          * Swapping in a large folio that is partially in the zeromap is not
>          * currently handled. Return true without marking the folio uptodate so
>          * that an IO error is emitted (e.g. do_swap_page() will sigbus).
>          */
>         if (WARN_ON_ONCE(!swap_zeromap_entries_check(entry, nr_pages)))
>                 return true;
>
>         if (!test_bit(swp_offset(entry), sis->zeromap))
>                 return false;
>

LGTM with this swap_read_folio_zeromap. Thanks!

>         folio_zero_range(folio, 0, folio_size(folio));
>         folio_mark_uptodate(folio);
>         return true;
> }
>
> mm/memory.c only needs true or false, it doesn't care about the real value.
>
>>
>>>         return false;
>>> }
>>>
>>> mm/page_io.c can combine this with reading the zeromap of the first entry to decide if it will read the folio from the zeromap; mm/memory.c only needs the bool to fall back to the largest possible order.
>>>
>>> static inline unsigned long thp_swap_suitable_orders(...)
>>> {
>>>         int order, nr;
>>>
>>>         order = highest_order(orders);
>>>
>>>         while (orders) {
>>>                 nr = 1 << order;
>>>                 if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr &&
>>>                     swap_zeromap_entries_check(entry, nr))
>>>                         break;
>>>                 order = next_order(&orders, order);
>>>         }
>>>
>>>         return orders;
>>> }
>>>
>>>>
>>>>> Hi Usama,
>>>>> what is your take on this?
>>>>>
>>>>>>
>>>>>>> Though I am not sure how cheaply zswap can implement it, swap_zeromap_entries_check() could be two simple bit operations:
>>>>>>>
>>>>>>> +static inline zeromap_stat_t swap_zeromap_entries_check(swp_entry_t entry, int nr)
>>>>>>> +{
>>>>>>> +       struct swap_info_struct *sis = swp_swap_info(entry);
>>>>>>> +       unsigned long start = swp_offset(entry);
>>>>>>> +       unsigned long end = start + nr;
>>>>>>> +
>>>>>>> +       if (find_next_bit(sis->zeromap, end, start) == end)
>>>>>>> +               return SWAP_ZEROMAP_NON;
>>>>>>> +       if (find_next_zero_bit(sis->zeromap, end, start) == end)
>>>>>>> +               return SWAP_ZEROMAP_FULL;
>>>>>>> +
>>>>>>> +       return SWAP_ZEROMAP_PARTIAL;
>>>>>>> +}
>>>>>>>
>>>>>>> 3. swapcache is different from zeromap and zswap. Swapcache indicates that the memory is still available and should be re-mapped rather than allocating a new folio. Our previous patchset has implemented a full re-map of an mTHP in do_swap_page() as mentioned in 1.
>>>>>>>
>>>>>>> For the same reason as point 1, partial swapcache is a rare edge case. Not re-mapping it and instead allocating a new folio would add significant complexity.
>>>>>>>
>>>>>>>>> Nonetheless, `zeromap` and `zswap` are distinct cases. With `zeromap`, we permit almost all mTHP swap-ins, except for those rare situations where small folios that were swapped out happen to have contiguous and aligned swap slots.
>>>>>>>>>
>>>>>>>>> swapcache is another quite different story; since our user scenarios begin from the simplest sync io on mobile phones, we don't quite care about swapcache.
>>>>>>>>
>>>>>>>> Right. The reason I bring this up is, as I mentioned above, there is a common problem of forming large folios from different sources, which includes the swap cache. The fact that synchronous swapin does not use the swapcache was a happy coincidence for you, as you can add support for mTHP swapins without handling this case yet ;)
>>>>>>>
>>>>>>> As I mentioned above, I'd really rather filter out those corner cases than support them, not just for the current situation to unlock the swap-in series :-)
>>>>>>
>>>>>> If they are indeed corner cases, then I definitely agree.
>>>>>
>>>
>
> Thanks
> Barry
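For readers following the bitmap discussion above, here is a minimal userspace sketch of the all-or-nothing check being proposed. The toy_find_next_bit()/toy_find_next_zero_bit() helpers are stand-ins for the kernel's find_next_bit()/find_next_zero_bit(), and zeromap_entries_same() mirrors the shape of the swap_zeromap_entries_check() draft quoted above; none of this is kernel code.

#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Toy stand-ins for the kernel's find_next_bit()/find_next_zero_bit(). */
static unsigned long toy_find_next_bit(const unsigned long *map,
				       unsigned long end, unsigned long start)
{
	for (; start < end; start++)
		if (map[start / BITS_PER_LONG] & (1UL << (start % BITS_PER_LONG)))
			return start;
	return end;
}

static unsigned long toy_find_next_zero_bit(const unsigned long *map,
					    unsigned long end, unsigned long start)
{
	for (; start < end; start++)
		if (!(map[start / BITS_PER_LONG] & (1UL << (start % BITS_PER_LONG))))
			return start;
	return end;
}

/* Same shape as the swap_zeromap_entries_check() draft above. */
static bool zeromap_entries_same(const unsigned long *zeromap,
				 unsigned long start, int nr)
{
	unsigned long end = start + nr;

	if (toy_find_next_bit(zeromap, end, start) == end)
		return true;	/* no entry is in the zeromap */
	if (toy_find_next_zero_bit(zeromap, end, start) == end)
		return true;	/* every entry is in the zeromap */
	return false;		/* mixed: caller must fall back */
}

int main(void)
{
	unsigned long zeromap[1] = { 0 };

	zeromap[0] |= 1UL << 2;	/* mark only slot 2 as zero-filled */

	printf("slots 0-3 same? %d\n", zeromap_entries_same(zeromap, 0, 4)); /* 0 */
	printf("slots 4-7 same? %d\n", zeromap_entries_same(zeromap, 4, 4)); /* 1 */
	return 0;
}

As in the thread, a "mixed" result is the case a caller would have to treat as unsupported (or fall back for), while the two consistent cases can be decided by reading the first bit alone.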
[..]
> /*
>  * Check if all entries have consistent zeromap status, return true if
>  * all entries are zeromap or non-zeromap, else return false;
>  */
> static inline bool swap_zeromap_entries_check(swp_entry_t entry, int nr)

Let's also rename this now to swap_zeromap_entries_same(), "check" is a little vague.

> {
>         struct swap_info_struct *sis = swp_swap_info(entry);
>         unsigned long start = swp_offset(entry);
>         unsigned long end = start + *nr;
>
>         if (find_next_bit(sis->zeromap, end, start) == end)
>                 return true;
>         if (find_next_zero_bit(sis->zeromap, end, start) == end)
>                 return true;
>
>         return false;
> }
>
On Fri, Sep 6, 2024 at 7:28 AM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> [..]
> > /*
> >  * Check if all entries have consistent zeromap status, return true if
> >  * all entries are zeromap or non-zeromap, else return false;
> >  */
> > static inline bool swap_zeromap_entries_check(swp_entry_t entry, int nr)
>
> Let's also rename this now to swap_zeromap_entries_same(), "check" is
> a little vague.

Hi Yosry, Usama,

Thanks very much for your comments. After further consideration, I have adopted a different approach that offers more flexibility than returning a boolean value and also has an equally low implementation cost:
https://lore.kernel.org/linux-mm/20240906001047.1245-2-21cnbao@gmail.com/

This is somewhat similar to Yosry's previous idea but does not reintroduce the existing bug.

>
> > {
> >         struct swap_info_struct *sis = swp_swap_info(entry);
> >         unsigned long start = swp_offset(entry);
> >         unsigned long end = start + *nr;
> >
> >         if (find_next_bit(sis->zeromap, end, start) == end)
> >                 return true;
> >         if (find_next_zero_bit(sis->zeromap, end, start) == end)
> >                 return true;
> >
> >         return false;
> > }
> >

Thanks
Barry
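To illustrate the counting idea discussed in the thread (this is not the interface in the patch linked above, just a userspace toy built from the swap_zeromap_entries_count() draft quoted earlier), the sketch below counts how many leading entries share the first entry's zeromap status and lets a hypothetical caller pick the largest order that fits inside that run:

#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

static bool toy_test_bit(const unsigned long *map, unsigned long i)
{
	return map[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG));
}

/*
 * Count how many entries starting at 'start' share the zeromap status of
 * the first entry.  A caller can then fall back to the largest order that
 * fits entirely inside this run, instead of probing orders one by one.
 */
static unsigned int zeromap_same_run(const unsigned long *zeromap,
				     unsigned long start, unsigned int nr)
{
	bool first = toy_test_bit(zeromap, start);
	unsigned int i;

	for (i = 1; i < nr; i++)
		if (toy_test_bit(zeromap, start + i) != first)
			break;
	return i;
}

int main(void)
{
	unsigned long zeromap[1] = { 0 };
	unsigned int run, order;

	zeromap[0] |= 1UL << 6;			/* slot 6 is zero-filled, the rest are not */

	run = zeromap_same_run(zeromap, 0, 16);	/* 6: slots 0-5 are consistent */
	for (order = 4; (1u << order) > run; order--)
		;
	printf("run=%u -> swap in order-%u (%u pages)\n", run, order, 1u << order);
	return 0;
}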
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a11c75e897ec..e88563978441 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -299,6 +299,7 @@ struct swap_info_struct {
 	signed char	type;		/* strange name for an index */
 	unsigned int	max;		/* extent of the swap_map */
 	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
+	unsigned long *zeromap;		/* vmalloc'ed bitmap to track zero pages */
 	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
 	struct swap_cluster_list free_clusters; /* free clusters list */
 	unsigned int lowest_bit;	/* index of first free in swap_map */
diff --git a/mm/page_io.c b/mm/page_io.c
index a360857cf75d..39fc3919ce15 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -172,6 +172,88 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
 	goto out;
 }
 
+static bool is_folio_page_zero_filled(struct folio *folio, int i)
+{
+	unsigned long *data;
+	unsigned int pos, last_pos = PAGE_SIZE / sizeof(*data) - 1;
+	bool ret = false;
+
+	data = kmap_local_folio(folio, i * PAGE_SIZE);
+	if (data[last_pos])
+		goto out;
+	for (pos = 0; pos < PAGE_SIZE / sizeof(*data); pos++) {
+		if (data[pos])
+			goto out;
+	}
+	ret = true;
+out:
+	kunmap_local(data);
+	return ret;
+}
+
+static bool is_folio_zero_filled(struct folio *folio)
+{
+	unsigned int i;
+
+	for (i = 0; i < folio_nr_pages(folio); i++) {
+		if (!is_folio_page_zero_filled(folio, i))
+			return false;
+	}
+	return true;
+}
+
+static void folio_zero_fill(struct folio *folio)
+{
+	unsigned int i;
+
+	for (i = 0; i < folio_nr_pages(folio); i++)
+		clear_highpage(folio_page(folio, i));
+}
+
+static void swap_zeromap_folio_set(struct folio *folio)
+{
+	struct swap_info_struct *sis = swp_swap_info(folio->swap);
+	swp_entry_t entry;
+	unsigned int i;
+
+	for (i = 0; i < folio_nr_pages(folio); i++) {
+		entry = page_swap_entry(folio_page(folio, i));
+		set_bit(swp_offset(entry), sis->zeromap);
+	}
+}
+
+static void swap_zeromap_folio_clear(struct folio *folio)
+{
+	struct swap_info_struct *sis = swp_swap_info(folio->swap);
+	swp_entry_t entry;
+	unsigned int i;
+
+	for (i = 0; i < folio_nr_pages(folio); i++) {
+		entry = page_swap_entry(folio_page(folio, i));
+		clear_bit(swp_offset(entry), sis->zeromap);
+	}
+}
+
+/*
+ * Return the index of the first subpage which is not zero-filled
+ * according to swap_info_struct->zeromap.
+ * If all pages are zero-filled according to zeromap, it will return
+ * folio_nr_pages(folio).
+ */
+static unsigned int swap_zeromap_folio_test(struct folio *folio)
+{
+	struct swap_info_struct *sis = swp_swap_info(folio->swap);
+	swp_entry_t entry;
+	unsigned int i;
+
+	for (i = 0; i < folio_nr_pages(folio); i++) {
+		entry = page_swap_entry(folio_page(folio, i));
+		if (!test_bit(swp_offset(entry), sis->zeromap))
+			return i;
+	}
+	return i;
+}
+
 /*
  * We may have stale swap cache pages in memory: notice
  * them here and get rid of the unnecessary final write.
@@ -195,6 +277,13 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		folio_unlock(folio);
 		return ret;
 	}
+
+	if (is_folio_zero_filled(folio)) {
+		swap_zeromap_folio_set(folio);
+		folio_unlock(folio);
+		return 0;
+	}
+	swap_zeromap_folio_clear(folio);
 	if (zswap_store(folio)) {
 		folio_start_writeback(folio);
 		folio_unlock(folio);
@@ -426,6 +515,26 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
 	mempool_free(sio, sio_pool);
 }
 
+static bool swap_read_folio_zeromap(struct folio *folio)
+{
+	unsigned int idx = swap_zeromap_folio_test(folio);
+
+	if (idx == 0)
+		return false;
+
+	/*
+	 * Swapping in a large folio that is partially in the zeromap is not
+	 * currently handled. Return true without marking the folio uptodate so
+	 * that an IO error is emitted (e.g. do_swap_page() will sigbus).
+	 */
+	if (WARN_ON_ONCE(idx < folio_nr_pages(folio)))
+		return true;
+
+	folio_zero_fill(folio);
+	folio_mark_uptodate(folio);
+	return true;
+}
+
 static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
 {
 	struct swap_info_struct *sis = swp_swap_info(folio->swap);
@@ -515,8 +624,9 @@ void swap_read_folio(struct folio *folio, bool synchronous,
 		psi_memstall_enter(&pflags);
 	}
 	delayacct_swapin_start();
-
-	if (zswap_load(folio)) {
+	if (swap_read_folio_zeromap(folio)) {
+		folio_unlock(folio);
+	} else if (zswap_load(folio)) {
 		folio_mark_uptodate(folio);
 		folio_unlock(folio);
 	} else if (data_race(sis->flags & SWP_FS_OPS)) {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f1e559e216bd..48d8dca0b94b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -453,6 +453,8 @@ static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
 static void swap_cluster_schedule_discard(struct swap_info_struct *si,
 		unsigned int idx)
 {
+	unsigned int i;
+
 	/*
 	 * If scan_swap_map_slots() can't find a free cluster, it will check
 	 * si->swap_map directly. To make sure the discarding cluster isn't
@@ -461,6 +463,13 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
 	 */
 	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 			SWAP_MAP_BAD, SWAPFILE_CLUSTER);
+	/*
+	 * zeromap can see updates from concurrent swap_writepage() and swap_read_folio()
+	 * call on other slots, hence use atomic clear_bit for zeromap instead of the
+	 * non-atomic bitmap_clear.
+	 */
+	for (i = 0; i < SWAPFILE_CLUSTER; i++)
+		clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap);
 
 	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
 
@@ -482,7 +491,7 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
 static void swap_do_scheduled_discard(struct swap_info_struct *si)
 {
 	struct swap_cluster_info *info, *ci;
-	unsigned int idx;
+	unsigned int idx, i;
 
 	info = si->cluster_info;
 
@@ -498,6 +507,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)
 		__free_cluster(si, idx);
 		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 				0, SWAPFILE_CLUSTER);
+		for (i = 0; i < SWAPFILE_CLUSTER; i++)
+			clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap);
 		unlock_cluster(ci);
 	}
 }
@@ -1059,9 +1070,12 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
 {
 	unsigned long offset = idx * SWAPFILE_CLUSTER;
 	struct swap_cluster_info *ci;
+	unsigned int i;
 
 	ci = lock_cluster(si, offset);
 	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
+	for (i = 0; i < SWAPFILE_CLUSTER; i++)
+		clear_bit(offset + i, si->zeromap);
 	cluster_set_count_flag(ci, 0, 0);
 	free_cluster(si, idx);
 	unlock_cluster(ci);
@@ -1336,6 +1350,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
 	count = p->swap_map[offset];
 	VM_BUG_ON(count != SWAP_HAS_CACHE);
 	p->swap_map[offset] = 0;
+	clear_bit(offset, p->zeromap);
 	dec_cluster_info_page(p, p->cluster_info, offset);
 	unlock_cluster(ci);
 
@@ -2597,6 +2612,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	free_percpu(p->cluster_next_cpu);
 	p->cluster_next_cpu = NULL;
 	vfree(swap_map);
+	bitmap_free(p->zeromap);
 	kvfree(cluster_info);
 	/* Destroy swap account information */
 	swap_cgroup_swapoff(p->type);
@@ -3123,6 +3139,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		goto bad_swap_unlock_inode;
 	}
 
+	p->zeromap = bitmap_zalloc(maxpages, GFP_KERNEL);
+	if (!p->zeromap) {
+		error = -ENOMEM;
+		goto bad_swap_unlock_inode;
+	}
+
 	if (p->bdev && bdev_stable_writes(p->bdev))
 		p->flags |= SWP_STABLE_WRITES;
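As a side note on the write path above, is_folio_page_zero_filled() checks the last word of the page before scanning the whole page, presumably as a cheap early reject for pages whose non-zero data sits near the end. A minimal userspace analogue of that check (page_is_zero_filled() is an illustrative stand-in, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define TOY_PAGE_SIZE 4096

/*
 * Userspace analogue of is_folio_page_zero_filled(): probe the last word
 * first, then scan the whole page word by word.
 */
static bool page_is_zero_filled(const void *page)
{
	const unsigned long *data = page;
	size_t nwords = TOY_PAGE_SIZE / sizeof(*data);
	size_t pos;

	if (data[nwords - 1])
		return false;
	for (pos = 0; pos < nwords; pos++)
		if (data[pos])
			return false;
	return true;
}

int main(void)
{
	static unsigned long page[TOY_PAGE_SIZE / sizeof(unsigned long)];

	printf("zero page:  %d\n", page_is_zero_filled(page));	/* 1 */
	page[10] = 0xdeadbeef;
	printf("dirty page: %d\n", page_is_zero_filled(page));	/* 0 */
	return 0;
}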
Approximately 10-20% of pages to be swapped out are zero pages [1]. Rather than reading/writing these pages to flash resulting in increased I/O and flash wear, a bitmap can be used to mark these pages as zero at write time, and the pages can be filled at read time if the bit corresponding to the page is set.

With this patch, NVMe writes in Meta server fleet decreased by almost 10% with conventional swap setup (zswap disabled).

[1] https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/

Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
 include/linux/swap.h |   1 +
 mm/page_io.c         | 114 ++++++++++++++++++++++++++++++++++++++++++-
 mm/swapfile.c        |  24 ++++++++-
 3 files changed, 136 insertions(+), 3 deletions(-)
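To make the mechanism described above concrete, here is a small userspace sketch of the write/read round trip: zero pages only set a bit in a per-slot bitmap and skip the backing store, while the read side fills the page with zeros whenever that bit is set. The toy_swap_write()/toy_swap_read() names and the in-memory backing store are illustrative assumptions, not the kernel entry points.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define TOY_PAGE_SIZE	4096
#define TOY_NR_SLOTS	64
#define BITS_PER_LONG	(8 * sizeof(unsigned long))

/* One bit per swap slot, analogous to swap_info_struct->zeromap. */
static unsigned long toy_zeromap[(TOY_NR_SLOTS + BITS_PER_LONG - 1) / BITS_PER_LONG];
static char toy_backing_store[TOY_NR_SLOTS][TOY_PAGE_SIZE];

static bool page_all_zero(const char *page)
{
	static const char zeropage[TOY_PAGE_SIZE];
	return memcmp(page, zeropage, TOY_PAGE_SIZE) == 0;
}

/* "swap out": mark zero pages in the bitmap instead of writing them. */
static void toy_swap_write(unsigned long slot, const char *page)
{
	if (page_all_zero(page)) {
		toy_zeromap[slot / BITS_PER_LONG] |= 1UL << (slot % BITS_PER_LONG);
		return;				/* no I/O issued */
	}
	toy_zeromap[slot / BITS_PER_LONG] &= ~(1UL << (slot % BITS_PER_LONG));
	memcpy(toy_backing_store[slot], page, TOY_PAGE_SIZE);
}

/* "swap in": fill from zeros when the slot was recorded as a zero page. */
static void toy_swap_read(unsigned long slot, char *page)
{
	if (toy_zeromap[slot / BITS_PER_LONG] & (1UL << (slot % BITS_PER_LONG))) {
		memset(page, 0, TOY_PAGE_SIZE);	/* no I/O issued */
		return;
	}
	memcpy(page, toy_backing_store[slot], TOY_PAGE_SIZE);
}

int main(void)
{
	char page[TOY_PAGE_SIZE] = { 0 };
	char back[TOY_PAGE_SIZE];

	toy_swap_write(3, page);		/* zero page: only the bit is set */
	toy_swap_read(3, back);
	printf("zero page round trip ok: %d\n", memcmp(page, back, TOY_PAGE_SIZE) == 0);

	page[0] = 'x';
	toy_swap_write(4, page);		/* non-zero page: really "written" */
	toy_swap_read(4, back);
	printf("data page round trip ok: %d\n", memcmp(page, back, TOY_PAGE_SIZE) == 0);
	return 0;
}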