Message ID | 20250113175732.48099-13-ryncsn@gmail.com (mailing list archive) |
---|---|
State | New |
Series | mm, swap: rework of swap allocator locks |
On Tue, Jan 14, 2025 at 2:00 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> Non-rotational devices (SSD / ZRAM) can tolerate fragmentation, so the
> goal of the SWAP allocator is to avoid contention for clusters. It uses
> a per-CPU cluster design, and each CPU will use a different cluster as
> much as possible.
>
> However, HDDs are very sensitive to fragmentation, and contention is
> trivial in comparison. Therefore, we use one global cluster instead.
> This ensures that each order will be written to the same cluster as
> much as possible, which helps make the I/O more continuous.
>
> This ensures that the performance of the cluster allocator is as good
> as that of the old allocator. Test results after this commit, compared
> to before this series:
>
> make -j32 with tinyconfig, using 1G memcg limit and HDD swap:
>
> Before this series:
> 114.44user 29.11system 39:42.90elapsed 6%CPU (0avgtext+0avgdata 157284maxresident)k
> 2901232inputs+0outputs (238877major+4227640minor)pagefaults
>
> After this commit:
> 113.90user 23.81system 38:11.77elapsed 6%CPU (0avgtext+0avgdata 157260maxresident)k
> 2548728inputs+0outputs (235471major+4238110minor)pagefaults
>
> Suggested-by: Chris Li <chrisl@kernel.org>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>  include/linux/swap.h |  2 ++
>  mm/swapfile.c        | 51 ++++++++++++++++++++++++++++++++------------
>  2 files changed, 39 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 4c1d2e69689f..b13b72645db3 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -318,6 +318,8 @@ struct swap_info_struct {
>  	unsigned int pages;		/* total of usable pages of swap */
>  	atomic_long_t inuse_pages;	/* number of those currently in use */
>  	struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
> +	struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */
> +	spinlock_t global_cluster_lock;	/* Serialize usage of global cluster */
>  	struct rb_root swap_extent_root;/* root of the swap extent rbtree */
>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>  	struct file *swap_file;		/* seldom referenced */
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 37d540fa0310..793b2fd1a2a8 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -820,7 +820,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
>  out:
>  	relocate_cluster(si, ci);
>  	unlock_cluster(ci);
> -	__this_cpu_write(si->percpu_cluster->next[order], next);
> +	if (si->flags & SWP_SOLIDSTATE)
> +		__this_cpu_write(si->percpu_cluster->next[order], next);
> +	else
> +		si->global_cluster->next[order] = next;
>  	return found;
>  }
>
> @@ -881,9 +884,16 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>  	struct swap_cluster_info *ci;
>  	unsigned int offset, found = 0;
>
> -	/* Fast path using per CPU cluster */
> -	local_lock(&si->percpu_cluster->lock);
> -	offset = __this_cpu_read(si->percpu_cluster->next[order]);
> +	if (si->flags & SWP_SOLIDSTATE) {
> +		/* Fast path using per CPU cluster */
> +		local_lock(&si->percpu_cluster->lock);
> +		offset = __this_cpu_read(si->percpu_cluster->next[order]);
> +	} else {
> +		/* Serialize HDD SWAP allocation for each device. */
> +		spin_lock(&si->global_cluster_lock);
> +		offset = si->global_cluster->next[order];
> +	}
> +
>  	if (offset) {
>  		ci = lock_cluster(si, offset);
>  		/* Cluster could have been used by another order */
> @@ -975,8 +985,10 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>  		}
>  	}
>  done:
> -	local_unlock(&si->percpu_cluster->lock);
> -
> +	if (si->flags & SWP_SOLIDSTATE)
> +		local_unlock(&si->percpu_cluster->lock);
> +	else
> +		spin_unlock(&si->global_cluster_lock);
>  	return found;
>  }
>
> @@ -2784,6 +2796,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  	mutex_unlock(&swapon_mutex);
>  	free_percpu(p->percpu_cluster);
>  	p->percpu_cluster = NULL;
> +	kfree(p->global_cluster);
> +	p->global_cluster = NULL;
>  	vfree(swap_map);
>  	kvfree(zeromap);
>  	kvfree(cluster_info);
> @@ -3189,17 +3203,24 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
>  	for (i = 0; i < nr_clusters; i++)
>  		spin_lock_init(&cluster_info[i].lock);
>
> -	si->percpu_cluster = alloc_percpu(struct percpu_cluster);
> -	if (!si->percpu_cluster)
> -		goto err_free;
> +	if (si->flags & SWP_SOLIDSTATE) {
> +		si->percpu_cluster = alloc_percpu(struct percpu_cluster);
> +		if (!si->percpu_cluster)
> +			goto err_free;
>
> -	for_each_possible_cpu(cpu) {
> -		struct percpu_cluster *cluster;
> +		for_each_possible_cpu(cpu) {
> +			struct percpu_cluster *cluster;
>
> -		cluster = per_cpu_ptr(si->percpu_cluster, cpu);
> +			cluster = per_cpu_ptr(si->percpu_cluster, cpu);
> +			for (i = 0; i < SWAP_NR_ORDERS; i++)
> +				cluster->next[i] = SWAP_ENTRY_INVALID;
> +			local_lock_init(&cluster->lock);
> +		}
> +	} else {
> +		si->global_cluster = kmalloc(sizeof(*si->global_cluster), GFP_KERNEL);

Hi Andrew,

Sorry, I just found I missed this tiny fix here. Can you help fold it
into the current unstable tree?

diff --git a/mm/swapfile.c b/mm/swapfile.c
index e57e5453a25b..559b8e62ff71 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3212,6 +3212,8 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
 		}
 	} else {
 		si->global_cluster = kmalloc(sizeof(*si->global_cluster), GFP_KERNEL);
+		if (!si->global_cluster)
+			goto err_free;
 		for (i = 0; i < SWAP_NR_ORDERS; i++)
 			si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
 		spin_lock_init(&si->global_cluster_lock);
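The design the commit message describes comes down to one branch in the allocator: on SSD/ZRAM each CPU keeps its own "next cluster" hint (no cross-CPU contention), while on a rotating disk every allocation for the device goes through a single shared hint under one lock, so the written offsets stay sequential. Below is a minimal userspace sketch of that idea, not the kernel code; `swap_dev`, `cluster_hint`, and `alloc_offset` are illustrative stand-ins for `swap_info_struct`, `percpu_cluster`, and the local-lock/spinlock pair used in the patch.

```c
/*
 * Userspace sketch of the patch's locking scheme (illustrative only):
 * SSD-like devices use a per-thread hint, HDD-like devices funnel all
 * allocations through one shared hint guarded by a single lock.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_ORDERS 4

struct cluster_hint {
	unsigned int next[NR_ORDERS];	/* next offset to try, per allocation order */
};

struct swap_dev {
	bool solid_state;		/* SSD/ZRAM vs. rotating disk */
	struct cluster_hint global;	/* shared hint for the HDD case */
	pthread_mutex_t global_lock;	/* serializes HDD allocations per device */
};

/* Each thread stands in for one CPU's percpu_cluster slot. */
static __thread struct cluster_hint percpu_hint;

static unsigned int alloc_offset(struct swap_dev *dev, int order)
{
	struct cluster_hint *hint;
	unsigned int offset;

	if (dev->solid_state) {
		/* Per-thread state: no lock needed in this sketch. */
		hint = &percpu_hint;
	} else {
		/* One writer position per device keeps the I/O sequential. */
		pthread_mutex_lock(&dev->global_lock);
		hint = &dev->global;
	}

	offset = hint->next[order]++;	/* hand out the next slot for this order */

	if (!dev->solid_state)
		pthread_mutex_unlock(&dev->global_lock);
	return offset;
}

int main(void)
{
	struct swap_dev hdd = {
		.solid_state = false,
		.global_lock = PTHREAD_MUTEX_INITIALIZER,
	};

	for (int i = 0; i < 4; i++)
		printf("hdd order-0 offset: %u\n", alloc_offset(&hdd, 0));
	return 0;
}
```

The trade-off behind the HDD path: a seek costs far more than taking one lock, so serializing allocations per device is cheap compared to the I/O continuity it buys.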
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4c1d2e69689f..b13b72645db3 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -318,6 +318,8 @@ struct swap_info_struct {
 	unsigned int pages;		/* total of usable pages of swap */
 	atomic_long_t inuse_pages;	/* number of those currently in use */
 	struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
+	struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */
+	spinlock_t global_cluster_lock;	/* Serialize usage of global cluster */
 	struct rb_root swap_extent_root;/* root of the swap extent rbtree */
 	struct block_device *bdev;	/* swap device or bdev of swap file */
 	struct file *swap_file;		/* seldom referenced */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 37d540fa0310..793b2fd1a2a8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -820,7 +820,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
 out:
 	relocate_cluster(si, ci);
 	unlock_cluster(ci);
-	__this_cpu_write(si->percpu_cluster->next[order], next);
+	if (si->flags & SWP_SOLIDSTATE)
+		__this_cpu_write(si->percpu_cluster->next[order], next);
+	else
+		si->global_cluster->next[order] = next;
 	return found;
 }

@@ -881,9 +884,16 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 	struct swap_cluster_info *ci;
 	unsigned int offset, found = 0;

-	/* Fast path using per CPU cluster */
-	local_lock(&si->percpu_cluster->lock);
-	offset = __this_cpu_read(si->percpu_cluster->next[order]);
+	if (si->flags & SWP_SOLIDSTATE) {
+		/* Fast path using per CPU cluster */
+		local_lock(&si->percpu_cluster->lock);
+		offset = __this_cpu_read(si->percpu_cluster->next[order]);
+	} else {
+		/* Serialize HDD SWAP allocation for each device. */
+		spin_lock(&si->global_cluster_lock);
+		offset = si->global_cluster->next[order];
+	}
+
 	if (offset) {
 		ci = lock_cluster(si, offset);
 		/* Cluster could have been used by another order */
@@ -975,8 +985,10 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 		}
 	}
 done:
-	local_unlock(&si->percpu_cluster->lock);
-
+	if (si->flags & SWP_SOLIDSTATE)
+		local_unlock(&si->percpu_cluster->lock);
+	else
+		spin_unlock(&si->global_cluster_lock);
 	return found;
 }

@@ -2784,6 +2796,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	mutex_unlock(&swapon_mutex);
 	free_percpu(p->percpu_cluster);
 	p->percpu_cluster = NULL;
+	kfree(p->global_cluster);
+	p->global_cluster = NULL;
 	vfree(swap_map);
 	kvfree(zeromap);
 	kvfree(cluster_info);
@@ -3189,17 +3203,24 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
 	for (i = 0; i < nr_clusters; i++)
 		spin_lock_init(&cluster_info[i].lock);

-	si->percpu_cluster = alloc_percpu(struct percpu_cluster);
-	if (!si->percpu_cluster)
-		goto err_free;
+	if (si->flags & SWP_SOLIDSTATE) {
+		si->percpu_cluster = alloc_percpu(struct percpu_cluster);
+		if (!si->percpu_cluster)
+			goto err_free;

-	for_each_possible_cpu(cpu) {
-		struct percpu_cluster *cluster;
+		for_each_possible_cpu(cpu) {
+			struct percpu_cluster *cluster;

-		cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+			cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+			for (i = 0; i < SWAP_NR_ORDERS; i++)
+				cluster->next[i] = SWAP_ENTRY_INVALID;
+			local_lock_init(&cluster->lock);
+		}
+	} else {
+		si->global_cluster = kmalloc(sizeof(*si->global_cluster), GFP_KERNEL);
 		for (i = 0; i < SWAP_NR_ORDERS; i++)
-			cluster->next[i] = SWAP_ENTRY_INVALID;
-		local_lock_init(&cluster->lock);
+			si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
+		spin_lock_init(&si->global_cluster_lock);
 	}

 	/*
@@ -3473,6 +3494,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 bad_swap:
 	free_percpu(si->percpu_cluster);
 	si->percpu_cluster = NULL;
+	kfree(si->global_cluster);
+	si->global_cluster = NULL;
 	inode = NULL;
 	destroy_swap_extents(si);
 	swap_cgroup_swapoff(si->type);