diff mbox series

[v2,1/2] mm: swap: swap cluster switch to double link list

Message ID 20240614-swap-allocator-v2-1-2a513b4a7f2f@kernel.org (mailing list archive)
State New
Headers show
Series mm: swap: mTHP swap allocator base on swap cluster order | expand

Commit Message

Chris Li June 14, 2024, 11:48 p.m. UTC
Previously, the swap cluster used a cluster index as a pointer
to construct a custom singly linked list type "swap_cluster_list".
The next cluster pointer shares its storage with cluster->count,
which prevents putting a non-free cluster on a list.
Change the cluster to use the standard doubly linked list instead.
This allows tracking the nonfull clusters in the follow-up patch.

Remove the cluster getter/setter for accessing the cluster
struct member.

The list operation is protected by the swap_info_struct->lock.

Change the cluster code to use "struct swap_cluster_info *" to
reference a cluster rather than using an index. That is more
consistent with the list manipulation, and it avoids repeatedly
adding the index to cluster_info. The code is easier to understand.

Remove the "cluster next pointer is NULL" flag; the doubly linked
list can handle the empty list pretty well.

The "swap_cluster_info" struct is two pointers bigger. Because
512 swap entries share one swap cluster struct, it has very little
impact on the average memory usage per swap entry. For a 1TB swapfile,
the swap cluster data structure increases from 8MB to 24MB.

Other than the list conversion, there is no real functional change
in this patch.

Signed-off-by: Chris Li <chrisl@kernel.org>
---
 include/linux/swap.h |  28 +++----
 mm/swapfile.c        | 227 +++++++++++++--------------------------------------
 2 files changed, 70 insertions(+), 185 deletions(-)

Comments

Huang, Ying June 17, 2024, 6:19 a.m. UTC | #1
Hi, Chris,

Chris Li <chrisl@kernel.org> writes:

> Previously, the swap cluster used a cluster index as a pointer
> to construct a custom single link list type "swap_cluster_list".
> The next cluster pointer is shared with the cluster->count.
> It prevents puting the non free cluster into a list.
> Change the cluster to use the standard double link list instead.
> This allows tracing the nonfull cluster in the follow up patch.
>
> Remove the cluster getter/setter for accessing the cluster
> struct member.
>
> The list operation is protected by the swap_info_struct->lock.
>
> Change cluster code to use "struct swap_cluster_info *" to
> reference the cluster rather than by using index. That is more
> consistent with the list manipulation. It avoids the repeat
> adding index to the cluser_info. The code is easier to understand.
>
> Remove the cluster next pointer is NULL flag, the double link
> list can handle the empty list pretty well.

The above is more about "what" instead of "why".  We can identify "what"
from the patch itself.  I expect more "why".  I guess that we can reduce
swap_map[] scanning if we have lists of non-full/non-free clusters.

> The "swap_cluster_info" struct is two pointer bigger, because
> 512 swap entries share one swap struct, it has very little impact
> on the average memory usage per swap entry. For 1TB swapfile, the
> swap cluster data structure increases from 8MB to 24MB.
>
> Other than the list conversion, there is no real function change
> in this patch.
>
> Signed-off-by: Chris Li <chrisl@kernel.org>
> ---
>  include/linux/swap.h |  28 +++----
>  mm/swapfile.c        | 227 +++++++++++++--------------------------------------
>  2 files changed, 70 insertions(+), 185 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 3df75d62a835..cd9154a3e934 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -242,23 +242,22 @@ enum {
>   * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All
>   * free clusters are organized into a list. We fetch an entry from the list to
>   * get a free cluster.
> - *
> - * The data field stores next cluster if the cluster is free or cluster usage
> - * counter otherwise. The flags field determines if a cluster is free. This is
> - * protected by swap_info_struct.lock.
>   */
>  struct swap_cluster_info {
>  	spinlock_t lock;	/*
> -				 * Protect swap_cluster_info fields
> -				 * and swap_info_struct->swap_map
> +				 * Protect swap_cluster_info count and state

Protect swap_cluster_info fields except 'list' ?

> +				 * field and swap_info_struct->swap_map
>  				 * elements correspond to the swap
>  				 * cluster
>  				 */
> -	unsigned int data:24;
> -	unsigned int flags:8;
> +	unsigned int count:12;
> +	unsigned int state:3;

I still prefer normal data type over bit fields.  How about

        u16 usage;
        u8  state;

And, how about using 'usage' instead of 'count'?  Personally I think that
it is clearer.  But I don't have strong opinions on this.

> +	struct list_head list;	/* Protected by swap_info_struct->lock */
>  };
> -#define CLUSTER_FLAG_FREE 1 /* This cluster is free */
> -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
> +
> +#define CLUSTER_STATE_FREE	1 /* This cluster is free */

Can we use swap_cluster_info->count == 0?

> +#define CLUSTER_STATE_PER_CPU	2 /* This cluster on per_cpu_cluster  */
> +

There are no users of this state in this patch.  IMHO, it's better to
introduce a symbol together with its users; otherwise, it's hard to
understand why we need it and how to use it.  And, IIUC, the state isn't
maintained properly: it should be changed when we move the cluster off
the per-cpu cluster.

>  /*
>   * The first page in the swap file is the swap header, which is always marked
> @@ -283,11 +282,6 @@ struct percpu_cluster {
>  	unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
>  };
>  
> -struct swap_cluster_list {
> -	struct swap_cluster_info head;
> -	struct swap_cluster_info tail;
> -};
> -
>  /*
>   * The in-memory structure used to track swap areas.
>   */
> @@ -300,7 +294,7 @@ struct swap_info_struct {
>  	unsigned int	max;		/* extent of the swap_map */
>  	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
>  	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
> -	struct swap_cluster_list free_clusters; /* free clusters list */
> +	struct list_head free_clusters; /* free clusters list */
>  	unsigned int lowest_bit;	/* index of first free in swap_map */
>  	unsigned int highest_bit;	/* index of last free in swap_map */
>  	unsigned int pages;		/* total of usable pages of swap */
> @@ -331,7 +325,7 @@ struct swap_info_struct {
>  					 * list.
>  					 */
>  	struct work_struct discard_work; /* discard worker */
> -	struct swap_cluster_list discard_clusters; /* discard clusters list */
> +	struct list_head discard_clusters; /* discard clusters list */
>  	struct plist_node avail_lists[]; /*
>  					   * entries in swap_avail_heads, one
>  					   * entry per node.
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 9c6d8e557c0f..2f878b374349 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -290,62 +290,9 @@ static void discard_swap_cluster(struct swap_info_struct *si,
>  #endif
>  #define LATENCY_LIMIT		256
>  
> -static inline void cluster_set_flag(struct swap_cluster_info *info,
> -	unsigned int flag)
> -{
> -	info->flags = flag;
> -}
> -
> -static inline unsigned int cluster_count(struct swap_cluster_info *info)
> -{
> -	return info->data;
> -}
> -
> -static inline void cluster_set_count(struct swap_cluster_info *info,
> -				     unsigned int c)
> -{
> -	info->data = c;
> -}
> -
> -static inline void cluster_set_count_flag(struct swap_cluster_info *info,
> -					 unsigned int c, unsigned int f)
> -{
> -	info->flags = f;
> -	info->data = c;
> -}
> -
> -static inline unsigned int cluster_next(struct swap_cluster_info *info)
> -{
> -	return info->data;
> -}
> -
> -static inline void cluster_set_next(struct swap_cluster_info *info,
> -				    unsigned int n)
> -{
> -	info->data = n;
> -}
> -
> -static inline void cluster_set_next_flag(struct swap_cluster_info *info,
> -					 unsigned int n, unsigned int f)
> -{
> -	info->flags = f;
> -	info->data = n;
> -}
> -
>  static inline bool cluster_is_free(struct swap_cluster_info *info)
>  {
> -	return info->flags & CLUSTER_FLAG_FREE;
> -}
> -
> -static inline bool cluster_is_null(struct swap_cluster_info *info)
> -{
> -	return info->flags & CLUSTER_FLAG_NEXT_NULL;
> -}
> -
> -static inline void cluster_set_null(struct swap_cluster_info *info)
> -{
> -	info->flags = CLUSTER_FLAG_NEXT_NULL;
> -	info->data = 0;
> +	return info->state == CLUSTER_STATE_FREE;
>  }
>  
>  static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
> @@ -394,65 +341,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
>  		spin_unlock(&si->lock);
>  }
>  
> -static inline bool cluster_list_empty(struct swap_cluster_list *list)
> -{
> -	return cluster_is_null(&list->head);
> -}
> -
> -static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
> -{
> -	return cluster_next(&list->head);
> -}
> -
> -static void cluster_list_init(struct swap_cluster_list *list)
> -{
> -	cluster_set_null(&list->head);
> -	cluster_set_null(&list->tail);
> -}
> -
> -static void cluster_list_add_tail(struct swap_cluster_list *list,
> -				  struct swap_cluster_info *ci,
> -				  unsigned int idx)
> -{
> -	if (cluster_list_empty(list)) {
> -		cluster_set_next_flag(&list->head, idx, 0);
> -		cluster_set_next_flag(&list->tail, idx, 0);
> -	} else {
> -		struct swap_cluster_info *ci_tail;
> -		unsigned int tail = cluster_next(&list->tail);
> -
> -		/*
> -		 * Nested cluster lock, but both cluster locks are
> -		 * only acquired when we held swap_info_struct->lock
> -		 */
> -		ci_tail = ci + tail;
> -		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
> -		cluster_set_next(ci_tail, idx);
> -		spin_unlock(&ci_tail->lock);
> -		cluster_set_next_flag(&list->tail, idx, 0);
> -	}
> -}
> -
> -static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
> -					   struct swap_cluster_info *ci)
> -{
> -	unsigned int idx;
> -
> -	idx = cluster_next(&list->head);
> -	if (cluster_next(&list->tail) == idx) {
> -		cluster_set_null(&list->head);
> -		cluster_set_null(&list->tail);
> -	} else
> -		cluster_set_next_flag(&list->head,
> -				      cluster_next(&ci[idx]), 0);
> -
> -	return idx;
> -}
> -
>  /* Add a cluster to discard list and schedule it to do discard */
>  static void swap_cluster_schedule_discard(struct swap_info_struct *si,
> -		unsigned int idx)
> +		struct swap_cluster_info *ci)
>  {
> +	unsigned int idx = ci - si->cluster_info;

I see this multiple times in the patch, can we define a helper for this?

>  	/*
>  	 * If scan_swap_map_slots() can't find a free cluster, it will check
>  	 * si->swap_map directly. To make sure the discarding cluster isn't
> @@ -462,17 +355,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
>  	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
>  			SWAP_MAP_BAD, SWAPFILE_CLUSTER);
>  
> -	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
> -
> +	list_add_tail(&ci->list, &si->discard_clusters);
>  	schedule_work(&si->discard_work);
>  }
>  
> -static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
> +static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
>  {
> -	struct swap_cluster_info *ci = si->cluster_info;
> -
> -	cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
> -	cluster_list_add_tail(&si->free_clusters, ci, idx);
> +	ci->state = CLUSTER_STATE_FREE;
> +	list_add_tail(&ci->list, &si->free_clusters);
>  }
>  
>  /*
> @@ -481,21 +371,22 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
>  */
>  static void swap_do_scheduled_discard(struct swap_info_struct *si)
>  {
> -	struct swap_cluster_info *info, *ci;
> +	struct swap_cluster_info *ci;
>  	unsigned int idx;
>  
> -	info = si->cluster_info;
> -
> -	while (!cluster_list_empty(&si->discard_clusters)) {
> -		idx = cluster_list_del_first(&si->discard_clusters, info);
> +	while (!list_empty(&si->discard_clusters)) {
> +		ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
> +		list_del(&ci->list);
> +		idx = ci - si->cluster_info;
>  		spin_unlock(&si->lock);
>  
>  		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
>  				SWAPFILE_CLUSTER);
>  
>  		spin_lock(&si->lock);
> -		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
> -		__free_cluster(si, idx);
> +
> +		spin_lock(&ci->lock);

Personally, I still prefer to use lock_cluster(), which is more readable
and matches unlock_cluster() below.

> +		__free_cluster(si, ci);
>  		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
>  				0, SWAPFILE_CLUSTER);
>  		unlock_cluster(ci);
> @@ -521,20 +412,19 @@ static void swap_users_ref_free(struct percpu_ref *ref)
>  	complete(&si->comp);
>  }
>  
> -static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
> +static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>  {
> -	struct swap_cluster_info *ci = si->cluster_info;
> +	struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
>  
> -	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
> -	cluster_list_del_first(&si->free_clusters, ci);
> -	cluster_set_count_flag(ci + idx, 0, 0);
> +	VM_BUG_ON(ci - si->cluster_info != idx);
> +	list_del(&ci->list);
> +	ci->count = 0;

Do we need this now?  If we keep CLUSTER_STATE_FREE, we need to change
it here.

> +	return ci;
>  }
>  
> -static void free_cluster(struct swap_info_struct *si, unsigned long idx)
> +static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
>  {
> -	struct swap_cluster_info *ci = si->cluster_info + idx;
> -
> -	VM_BUG_ON(cluster_count(ci) != 0);
> +	VM_BUG_ON(ci->count != 0);
>  	/*
>  	 * If the swap is discardable, prepare discard the cluster
>  	 * instead of free it immediately. The cluster will be freed
> @@ -542,11 +432,11 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx)
>  	 */
>  	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
>  	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
> -		swap_cluster_schedule_discard(si, idx);
> +		swap_cluster_schedule_discard(si, ci);
>  		return;
>  	}
>  
> -	__free_cluster(si, idx);
> +	__free_cluster(si, ci);
>  }
>  
>  /*
> @@ -559,15 +449,15 @@ static void add_cluster_info_page(struct swap_info_struct *p,
>  	unsigned long count)
>  {
>  	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
> +	struct swap_cluster_info *ci = cluster_info + idx;
>  
>  	if (!cluster_info)
>  		return;
> -	if (cluster_is_free(&cluster_info[idx]))
> +	if (cluster_is_free(ci))
>  		alloc_cluster(p, idx);
>  
> -	VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER);
> -	cluster_set_count(&cluster_info[idx],
> -		cluster_count(&cluster_info[idx]) + count);
> +	VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER);
> +	ci->count += count;
>  }
>  
>  /*
> @@ -581,24 +471,20 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
>  }
>  
>  /*
> - * The cluster corresponding to page_nr decreases one usage. If the usage
> - * counter becomes 0, which means no page in the cluster is in using, we can
> - * optionally discard the cluster and add it to free cluster list.
> + * The cluster ci decreases one usage. If the usage counter becomes 0,
> + * which means no page in the cluster is in using, we can optionally discard
> + * the cluster and add it to free cluster list.
>   */
> -static void dec_cluster_info_page(struct swap_info_struct *p,
> -	struct swap_cluster_info *cluster_info, unsigned long page_nr)
> +static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci)
>  {
> -	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
> -
> -	if (!cluster_info)
> +	if (!p->cluster_info)
>  		return;
>  
> -	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
> -	cluster_set_count(&cluster_info[idx],
> -		cluster_count(&cluster_info[idx]) - 1);
> +	VM_BUG_ON(ci->count == 0);
> +	ci->count--;
>  
> -	if (cluster_count(&cluster_info[idx]) == 0)
> -		free_cluster(p, idx);
> +	if (!ci->count)
> +		free_cluster(p, ci);
>  }
>  
>  /*
> @@ -611,10 +497,10 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
>  {
>  	struct percpu_cluster *percpu_cluster;
>  	bool conflict;
> -

Usually we use one blank line after local variable declaration.

> +	struct swap_cluster_info *first = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
>  	offset /= SWAPFILE_CLUSTER;
> -	conflict = !cluster_list_empty(&si->free_clusters) &&
> -		offset != cluster_list_first(&si->free_clusters) &&
> +	conflict = !list_empty(&si->free_clusters) &&
> +		offset !=  first - si->cluster_info &&
>  		cluster_is_free(&si->cluster_info[offset]);
>  
>  	if (!conflict)
> @@ -655,10 +541,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
>  	cluster = this_cpu_ptr(si->percpu_cluster);
>  	tmp = cluster->next[order];
>  	if (tmp == SWAP_NEXT_INVALID) {
> -		if (!cluster_list_empty(&si->free_clusters)) {
> -			tmp = cluster_next(&si->free_clusters.head) *
> -					SWAPFILE_CLUSTER;
> -		} else if (!cluster_list_empty(&si->discard_clusters)) {
> +		if (!list_empty(&si->free_clusters)) {
> +			ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
> +			list_del(&ci->list);

The free cluster is deleted from si->free_clusters now.  But later you
will call scan_swap_map_ssd_cluster_conflict() and may abandon the
cluster.  And in alloc_cluster() later, it may be deleted again.

> +			spin_lock(&ci->lock);
> +			ci->state = CLUSTER_STATE_PER_CPU;

Need to change ci->state when moving a cluster off the percpu_cluster.

> +			spin_unlock(&ci->lock);
> +			tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
> +		} else if (!list_empty(&si->discard_clusters)) {
>  			/*
>  			 * we don't have free cluster but have some clusters in
>  			 * discarding, do discard now and reclaim them, then
> @@ -1062,8 +952,8 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
>  
>  	ci = lock_cluster(si, offset);
>  	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
> -	cluster_set_count_flag(ci, 0, 0);
> -	free_cluster(si, idx);
> +	ci->count = 0;
> +	free_cluster(si, ci);
>  	unlock_cluster(ci);
>  	swap_range_free(si, offset, SWAPFILE_CLUSTER);
>  }
> @@ -1336,7 +1226,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
>  	count = p->swap_map[offset];
>  	VM_BUG_ON(count != SWAP_HAS_CACHE);
>  	p->swap_map[offset] = 0;
> -	dec_cluster_info_page(p, p->cluster_info, offset);
> +	dec_cluster_info_page(p, ci);
>  	unlock_cluster(ci);
>  
>  	mem_cgroup_uncharge_swap(entry, 1);
> @@ -3003,8 +2893,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
>  
>  	nr_good_pages = maxpages - 1;	/* omit header page */
>  
> -	cluster_list_init(&p->free_clusters);
> -	cluster_list_init(&p->discard_clusters);
> +	INIT_LIST_HEAD(&p->free_clusters);
> +	INIT_LIST_HEAD(&p->discard_clusters);
>  
>  	for (i = 0; i < swap_header->info.nr_badpages; i++) {
>  		unsigned int page_nr = swap_header->info.badpages[i];
> @@ -3055,14 +2945,15 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
>  	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
>  		j = (k + col) % SWAP_CLUSTER_COLS;
>  		for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
> +			struct swap_cluster_info *ci;
>  			idx = i * SWAP_CLUSTER_COLS + j;
> +			ci = cluster_info + idx;
>  			if (idx >= nr_clusters)
>  				continue;
> -			if (cluster_count(&cluster_info[idx]))
> +			if (ci->count)
>  				continue;
> -			cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
> -			cluster_list_add_tail(&p->free_clusters, cluster_info,
> -					      idx);
> +			ci->state = CLUSTER_STATE_FREE;
> +			list_add_tail(&ci->list, &p->free_clusters);
>  		}
>  	}
>  	return nr_extents;

--
Best Regards,
Huang, Ying
Chris Li June 18, 2024, 5:06 a.m. UTC | #2
On Sun, Jun 16, 2024 at 11:21 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Hi, Chris,
>
> Chris Li <chrisl@kernel.org> writes:
>
> > Previously, the swap cluster used a cluster index as a pointer
> > to construct a custom single link list type "swap_cluster_list".
> > The next cluster pointer is shared with the cluster->count.
> > It prevents puting the non free cluster into a list.
> > Change the cluster to use the standard double link list instead.
> > This allows tracing the nonfull cluster in the follow up patch.
> >
> > Remove the cluster getter/setter for accessing the cluster
> > struct member.
> >
> > The list operation is protected by the swap_info_struct->lock.
> >
> > Change cluster code to use "struct swap_cluster_info *" to
> > reference the cluster rather than by using index. That is more
> > consistent with the list manipulation. It avoids the repeat
> > adding index to the cluser_info. The code is easier to understand.
> >
> > Remove the cluster next pointer is NULL flag, the double link
> > list can handle the empty list pretty well.
>
> The above is more about "what" instead of "why".  We can identify "what"
> from the patch itself.  I expect more "why".  I guess that we can reduce
> swap_map[] scanning if we have lists of non-full/non-free clusters.

In my mind, the "why" is captured by "This allows tracing the nonfull
cluster in the follow up patch."
If you are asking why we want the "nonfull cluster list": it is to
get to a suitable candidate cluster of that order quicker than
scanning swap_map[].

>
> > The "swap_cluster_info" struct is two pointer bigger, because
> > 512 swap entries share one swap struct, it has very little impact
> > on the average memory usage per swap entry. For 1TB swapfile, the
> > swap cluster data structure increases from 8MB to 24MB.
> >
> > Other than the list conversion, there is no real function change
> > in this patch.
> >
> > Signed-off-by: Chris Li <chrisl@kernel.org>
> > ---
> >  include/linux/swap.h |  28 +++----
> >  mm/swapfile.c        | 227 +++++++++++++--------------------------------------
> >  2 files changed, 70 insertions(+), 185 deletions(-)
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 3df75d62a835..cd9154a3e934 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -242,23 +242,22 @@ enum {
> >   * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All
> >   * free clusters are organized into a list. We fetch an entry from the list to
> >   * get a free cluster.
> > - *
> > - * The data field stores next cluster if the cluster is free or cluster usage
> > - * counter otherwise. The flags field determines if a cluster is free. This is
> > - * protected by swap_info_struct.lock.
> >   */
> >  struct swap_cluster_info {
> >       spinlock_t lock;        /*
> > -                              * Protect swap_cluster_info fields
> > -                              * and swap_info_struct->swap_map
> > +                              * Protect swap_cluster_info count and state
>
> Protect swap_cluster_info fields except 'list' ?

I change it to protect the swap_cluster_info bitfields in the second patch.
>
> > +                              * field and swap_info_struct->swap_map
> >                                * elements correspond to the swap
> >                                * cluster
> >                                */
> > -     unsigned int data:24;
> > -     unsigned int flags:8;
> > +     unsigned int count:12;
> > +     unsigned int state:3;
>
> I still prefer normal data type over bit fields.  How about
>
>         u16 usage;
>         u8  state;

I don't mind renaming "count" to "usage". That is probably a better
name. However, I have another patch intended to add more bit fields to
the cluster info struct. The second patch adds "order" and a later
patch will add more. That is why I chose bit fields: they pack the
bits more compactly.

>
> And, how about use 'usage' instead of 'count'?  Personally I think that
> it is more clear.  But I don't have strong opinions on this.
>
> > +     struct list_head list;  /* Protected by swap_info_struct->lock */
> >  };
> > -#define CLUSTER_FLAG_FREE 1 /* This cluster is free */
> > -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
> > +
> > +#define CLUSTER_STATE_FREE   1 /* This cluster is free */
>
> Can we use swap_cluster_info->count == 0?

It is not as good considering the second patch starts to track the
state of the cluster of per cpu struct. We will be comparing both the
cluster->count and cluster->state.

>
> > +#define CLUSTER_STATE_PER_CPU        2 /* This cluster on per_cpu_cluster  */
> > +
>
> There's no users of this state in this patch.  IMHO, it's better to

Yes, this state is used in this patch, in the sense that if you
remove the state definition, the code no longer compiles because of
the assignment of CLUSTER_STATE_PER_CPU.
There is code that tests whether a cluster state is not the free state,
which excludes "CLUSTER_STATE_PER_CPU".

> introduce a symbol with its users, otherwise, it's hard to understand
> why do we need it and how to use it.  And, IIUC, the state isn't
> maintained properly, it should be changed when we move the cluster off
> the per-cpu cluster.

I am actually following the same usage principle as you suggested
here. Only the second patch starts to use the off per cpu state
(SCANNED). That is why I introduce it there.

>
> >  /*
> >   * The first page in the swap file is the swap header, which is always marked
> > @@ -283,11 +282,6 @@ struct percpu_cluster {
> >       unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
> >  };
> >
> > -struct swap_cluster_list {
> > -     struct swap_cluster_info head;
> > -     struct swap_cluster_info tail;
> > -};
> > -
> >  /*
> >   * The in-memory structure used to track swap areas.
> >   */
> > @@ -300,7 +294,7 @@ struct swap_info_struct {
> >       unsigned int    max;            /* extent of the swap_map */
> >       unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
> >       struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
> > -     struct swap_cluster_list free_clusters; /* free clusters list */
> > +     struct list_head free_clusters; /* free clusters list */
> >       unsigned int lowest_bit;        /* index of first free in swap_map */
> >       unsigned int highest_bit;       /* index of last free in swap_map */
> >       unsigned int pages;             /* total of usable pages of swap */
> > @@ -331,7 +325,7 @@ struct swap_info_struct {
> >                                        * list.
> >                                        */
> >       struct work_struct discard_work; /* discard worker */
> > -     struct swap_cluster_list discard_clusters; /* discard clusters list */
> > +     struct list_head discard_clusters; /* discard clusters list */
> >       struct plist_node avail_lists[]; /*
> >                                          * entries in swap_avail_heads, one
> >                                          * entry per node.
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 9c6d8e557c0f..2f878b374349 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -290,62 +290,9 @@ static void discard_swap_cluster(struct swap_info_struct *si,
> >  #endif
> >  #define LATENCY_LIMIT                256
> >
> > -static inline void cluster_set_flag(struct swap_cluster_info *info,
> > -     unsigned int flag)
> > -{
> > -     info->flags = flag;
> > -}
> > -
> > -static inline unsigned int cluster_count(struct swap_cluster_info *info)
> > -{
> > -     return info->data;
> > -}
> > -
> > -static inline void cluster_set_count(struct swap_cluster_info *info,
> > -                                  unsigned int c)
> > -{
> > -     info->data = c;
> > -}
> > -
> > -static inline void cluster_set_count_flag(struct swap_cluster_info *info,
> > -                                      unsigned int c, unsigned int f)
> > -{
> > -     info->flags = f;
> > -     info->data = c;
> > -}
> > -
> > -static inline unsigned int cluster_next(struct swap_cluster_info *info)
> > -{
> > -     return info->data;
> > -}
> > -
> > -static inline void cluster_set_next(struct swap_cluster_info *info,
> > -                                 unsigned int n)
> > -{
> > -     info->data = n;
> > -}
> > -
> > -static inline void cluster_set_next_flag(struct swap_cluster_info *info,
> > -                                      unsigned int n, unsigned int f)
> > -{
> > -     info->flags = f;
> > -     info->data = n;
> > -}
> > -
> >  static inline bool cluster_is_free(struct swap_cluster_info *info)
> >  {
> > -     return info->flags & CLUSTER_FLAG_FREE;
> > -}
> > -
> > -static inline bool cluster_is_null(struct swap_cluster_info *info)
> > -{
> > -     return info->flags & CLUSTER_FLAG_NEXT_NULL;
> > -}
> > -
> > -static inline void cluster_set_null(struct swap_cluster_info *info)
> > -{
> > -     info->flags = CLUSTER_FLAG_NEXT_NULL;
> > -     info->data = 0;
> > +     return info->state == CLUSTER_STATE_FREE;
> >  }
> >
> >  static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
> > @@ -394,65 +341,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
> >               spin_unlock(&si->lock);
> >  }
> >
> > -static inline bool cluster_list_empty(struct swap_cluster_list *list)
> > -{
> > -     return cluster_is_null(&list->head);
> > -}
> > -
> > -static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
> > -{
> > -     return cluster_next(&list->head);
> > -}
> > -
> > -static void cluster_list_init(struct swap_cluster_list *list)
> > -{
> > -     cluster_set_null(&list->head);
> > -     cluster_set_null(&list->tail);
> > -}
> > -
> > -static void cluster_list_add_tail(struct swap_cluster_list *list,
> > -                               struct swap_cluster_info *ci,
> > -                               unsigned int idx)
> > -{
> > -     if (cluster_list_empty(list)) {
> > -             cluster_set_next_flag(&list->head, idx, 0);
> > -             cluster_set_next_flag(&list->tail, idx, 0);
> > -     } else {
> > -             struct swap_cluster_info *ci_tail;
> > -             unsigned int tail = cluster_next(&list->tail);
> > -
> > -             /*
> > -              * Nested cluster lock, but both cluster locks are
> > -              * only acquired when we held swap_info_struct->lock
> > -              */
> > -             ci_tail = ci + tail;
> > -             spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
> > -             cluster_set_next(ci_tail, idx);
> > -             spin_unlock(&ci_tail->lock);
> > -             cluster_set_next_flag(&list->tail, idx, 0);
> > -     }
> > -}
> > -
> > -static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
> > -                                        struct swap_cluster_info *ci)
> > -{
> > -     unsigned int idx;
> > -
> > -     idx = cluster_next(&list->head);
> > -     if (cluster_next(&list->tail) == idx) {
> > -             cluster_set_null(&list->head);
> > -             cluster_set_null(&list->tail);
> > -     } else
> > -             cluster_set_next_flag(&list->head,
> > -                                   cluster_next(&ci[idx]), 0);
> > -
> > -     return idx;
> > -}
> > -
> >  /* Add a cluster to discard list and schedule it to do discard */
> >  static void swap_cluster_schedule_discard(struct swap_info_struct *si,
> > -             unsigned int idx)
> > +             struct swap_cluster_info *ci)
> >  {
> > +     unsigned int idx = ci - si->cluster_info;
>
> I see this multiple times in the patch, can we define a helper for this?
Ack.

>
> >       /*
> >        * If scan_swap_map_slots() can't find a free cluster, it will check
> >        * si->swap_map directly. To make sure the discarding cluster isn't
> > @@ -462,17 +355,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
> >       memset(si->swap_map + idx * SWAPFILE_CLUSTER,
> >                       SWAP_MAP_BAD, SWAPFILE_CLUSTER);
> >
> > -     cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
> > -
> > +     list_add_tail(&ci->list, &si->discard_clusters);
> >       schedule_work(&si->discard_work);
> >  }
> >
> > -static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
> > +static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
> >  {
> > -     struct swap_cluster_info *ci = si->cluster_info;
> > -
> > -     cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
> > -     cluster_list_add_tail(&si->free_clusters, ci, idx);
> > +     ci->state = CLUSTER_STATE_FREE;
> > +     list_add_tail(&ci->list, &si->free_clusters);
> >  }
> >
> >  /*
> > @@ -481,21 +371,22 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
> >  */
> >  static void swap_do_scheduled_discard(struct swap_info_struct *si)
> >  {
> > -     struct swap_cluster_info *info, *ci;
> > +     struct swap_cluster_info *ci;
> >       unsigned int idx;
> >
> > -     info = si->cluster_info;
> > -
> > -     while (!cluster_list_empty(&si->discard_clusters)) {
> > -             idx = cluster_list_del_first(&si->discard_clusters, info);
> > +     while (!list_empty(&si->discard_clusters)) {
> > +             ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
> > +             list_del(&ci->list);
> > +             idx = ci - si->cluster_info;
> >               spin_unlock(&si->lock);
> >
> >               discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
> >                               SWAPFILE_CLUSTER);
> >
> >               spin_lock(&si->lock);
> > -             ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
> > -             __free_cluster(si, idx);
> > +
> > +             spin_lock(&ci->lock);
>
> Personally, I still prefer to use lock_cluster(), which is more readable
> and matches unlock_cluster() below.

lock_cluster() uses an index, which does not match unlock_cluster(),
which uses a pointer to the cluster.
When you get the cluster from the list, you already have a cluster
pointer. I feel it is unnecessary to convert it to an index and then
convert it back to a cluster pointer inside lock_cluster(). I actually
feel that using indexes to refer to the cluster is error prone because
we also have offsets.


>
> > +             __free_cluster(si, ci);
> >               memset(si->swap_map + idx * SWAPFILE_CLUSTER,
> >                               0, SWAPFILE_CLUSTER);
> >               unlock_cluster(ci);
> > @@ -521,20 +412,19 @@ static void swap_users_ref_free(struct percpu_ref *ref)
> >       complete(&si->comp);
> >  }
> >
> > -static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
> > +static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx)
> >  {
> > -     struct swap_cluster_info *ci = si->cluster_info;
> > +     struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
> >
> > -     VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
> > -     cluster_list_del_first(&si->free_clusters, ci);
> > -     cluster_set_count_flag(ci + idx, 0, 0);
> > +     VM_BUG_ON(ci - si->cluster_info != idx);
> > +     list_del(&ci->list);
> > +     ci->count = 0;
>
> Do we need this now?  If we keep CLUSTER_STATE_FREE, we need to change
> it here.

Good catch, thanks for catching that. Now I realize this is actually
problematic and tricky to get right. Let me work on that.

>
> > +     return ci;
> >  }
> >
> > -static void free_cluster(struct swap_info_struct *si, unsigned long idx)
> > +static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
> >  {
> > -     struct swap_cluster_info *ci = si->cluster_info + idx;
> > -
> > -     VM_BUG_ON(cluster_count(ci) != 0);
> > +     VM_BUG_ON(ci->count != 0);
> >       /*
> >        * If the swap is discardable, prepare discard the cluster
> >        * instead of free it immediately. The cluster will be freed
> > @@ -542,11 +432,11 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx)
> >        */
> >       if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
> >           (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
> > -             swap_cluster_schedule_discard(si, idx);
> > +             swap_cluster_schedule_discard(si, ci);
> >               return;
> >       }
> >
> > -     __free_cluster(si, idx);
> > +     __free_cluster(si, ci);
> >  }
> >
> >  /*
> > @@ -559,15 +449,15 @@ static void add_cluster_info_page(struct swap_info_struct *p,
> >       unsigned long count)
> >  {
> >       unsigned long idx = page_nr / SWAPFILE_CLUSTER;
> > +     struct swap_cluster_info *ci = cluster_info + idx;
> >
> >       if (!cluster_info)
> >               return;
> > -     if (cluster_is_free(&cluster_info[idx]))
> > +     if (cluster_is_free(ci))
> >               alloc_cluster(p, idx);
> >
> > -     VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER);
> > -     cluster_set_count(&cluster_info[idx],
> > -             cluster_count(&cluster_info[idx]) + count);
> > +     VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER);
> > +     ci->count += count;
> >  }
> >
> >  /*
> > @@ -581,24 +471,20 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
> >  }
> >
> >  /*
> > - * The cluster corresponding to page_nr decreases one usage. If the usage
> > - * counter becomes 0, which means no page in the cluster is in using, we can
> > - * optionally discard the cluster and add it to free cluster list.
> > + * The cluster ci decreases one usage. If the usage counter becomes 0,
> > + * which means no page in the cluster is in using, we can optionally discard
> > + * the cluster and add it to free cluster list.
> >   */
> > -static void dec_cluster_info_page(struct swap_info_struct *p,
> > -     struct swap_cluster_info *cluster_info, unsigned long page_nr)
> > +static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci)
> >  {
> > -     unsigned long idx = page_nr / SWAPFILE_CLUSTER;
> > -
> > -     if (!cluster_info)
> > +     if (!p->cluster_info)
> >               return;
> >
> > -     VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
> > -     cluster_set_count(&cluster_info[idx],
> > -             cluster_count(&cluster_info[idx]) - 1);
> > +     VM_BUG_ON(ci->count == 0);
> > +     ci->count--;
> >
> > -     if (cluster_count(&cluster_info[idx]) == 0)
> > -             free_cluster(p, idx);
> > +     if (!ci->count)
> > +             free_cluster(p, ci);
> >  }
> >
> >  /*
> > @@ -611,10 +497,10 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
> >  {
> >       struct percpu_cluster *percpu_cluster;
> >       bool conflict;
> > -
>
> Usually we use one blank line after local variable declaration.
Ack.

>
> > +     struct swap_cluster_info *first = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
> >       offset /= SWAPFILE_CLUSTER;
> > -     conflict = !cluster_list_empty(&si->free_clusters) &&
> > -             offset != cluster_list_first(&si->free_clusters) &&
> > +     conflict = !list_empty(&si->free_clusters) &&
> > +             offset !=  first - si->cluster_info &&
> >               cluster_is_free(&si->cluster_info[offset]);
> >
> >       if (!conflict)
> > @@ -655,10 +541,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
> >       cluster = this_cpu_ptr(si->percpu_cluster);
> >       tmp = cluster->next[order];
> >       if (tmp == SWAP_NEXT_INVALID) {
> > -             if (!cluster_list_empty(&si->free_clusters)) {
> > -                     tmp = cluster_next(&si->free_clusters.head) *
> > -                                     SWAPFILE_CLUSTER;
> > -             } else if (!cluster_list_empty(&si->discard_clusters)) {
> > +             if (!list_empty(&si->free_clusters)) {
> > +                     ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
> > +                     list_del(&ci->list);
>
> The free cluster is deleted from si->free_clusters now.  But later you
> will call scan_swap_map_ssd_cluster_conflict() and may abandon the
> cluster.  And in alloc_cluster() later, it may be deleted again.

Yes, that is a bug. Thanks for catching that.

>
> > +                     spin_lock(&ci->lock);
> > +                     ci->state = CLUSTER_STATE_PER_CPU;
>
> Need to change ci->state when move a cluster off the percpu_cluster.

In the next patch. This patch does not use the off state yet.

>
> > +                     spin_unlock(&ci->lock);
> > +                     tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
> > +             } else if (!list_empty(&si->discard_clusters)) {
> >                       /*
> >                        * we don't have free cluster but have some clusters in
> >                        * discarding, do discard now and reclaim them, then
> > @@ -1062,8 +952,8 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
> >
> >       ci = lock_cluster(si, offset);
> >       memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
> > -     cluster_set_count_flag(ci, 0, 0);
> > -     free_cluster(si, idx);
> > +     ci->count = 0;
> > +     free_cluster(si, ci);
> >       unlock_cluster(ci);
> >       swap_range_free(si, offset, SWAPFILE_CLUSTER);
> >  }
> > @@ -1336,7 +1226,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
> >       count = p->swap_map[offset];
> >       VM_BUG_ON(count != SWAP_HAS_CACHE);
> >       p->swap_map[offset] = 0;
> > -     dec_cluster_info_page(p, p->cluster_info, offset);
> > +     dec_cluster_info_page(p, ci);
> >       unlock_cluster(ci);
> >
> >       mem_cgroup_uncharge_swap(entry, 1);
> > @@ -3003,8 +2893,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
> >
> >       nr_good_pages = maxpages - 1;   /* omit header page */
> >
> > -     cluster_list_init(&p->free_clusters);
> > -     cluster_list_init(&p->discard_clusters);
> > +     INIT_LIST_HEAD(&p->free_clusters);
> > +     INIT_LIST_HEAD(&p->discard_clusters);
> >
> >       for (i = 0; i < swap_header->info.nr_badpages; i++) {
> >               unsigned int page_nr = swap_header->info.badpages[i];
> > @@ -3055,14 +2945,15 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
> >       for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
> >               j = (k + col) % SWAP_CLUSTER_COLS;
> >               for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
> > +                     struct swap_cluster_info *ci;
> >                       idx = i * SWAP_CLUSTER_COLS + j;
> > +                     ci = cluster_info + idx;
> >                       if (idx >= nr_clusters)
> >                               continue;
> > -                     if (cluster_count(&cluster_info[idx]))
> > +                     if (ci->count)
> >                               continue;
> > -                     cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
> > -                     cluster_list_add_tail(&p->free_clusters, cluster_info,
> > -                                           idx);
> > +                     ci->state = CLUSTER_STATE_FREE;
> > +                     list_add_tail(&ci->list, &p->free_clusters);
> >               }
> >       }
> >       return nr_extents;

Thank you for the review and spotting the bug.

Chris
Huang, Ying June 18, 2024, 7:54 a.m. UTC | #3
Chris Li <chrisl@kernel.org> writes:

> On Sun, Jun 16, 2024 at 11:21 PM Huang, Ying <ying.huang@intel.com> wrote:
>>
>> Hi, Chris,
>>
>> Chris Li <chrisl@kernel.org> writes:
>>
>> > Previously, the swap cluster used a cluster index as a pointer
>> > to construct a custom single link list type "swap_cluster_list".
>> > The next cluster pointer is shared with the cluster->count.
>> > It prevents puting the non free cluster into a list.
>> > Change the cluster to use the standard double link list instead.
>> > This allows tracing the nonfull cluster in the follow up patch.
>> >
>> > Remove the cluster getter/setter for accessing the cluster
>> > struct member.
>> >
>> > The list operation is protected by the swap_info_struct->lock.
>> >
>> > Change cluster code to use "struct swap_cluster_info *" to
>> > reference the cluster rather than by using index. That is more
>> > consistent with the list manipulation. It avoids the repeat
>> > adding index to the cluser_info. The code is easier to understand.
>> >
>> > Remove the cluster next pointer is NULL flag, the double link
>> > list can handle the empty list pretty well.
>>
>> The above is more about "what" instead of "why".  We can identify "what"
>> from the patch itself.  I expect more "why".  I guess that we can reduce
>> swap_map[] scanning if we have lists of non-full/non-free clusters.
>
> In my mind, the "why" is captured by " This allows tracing the nonfull
> cluster in the follow up patch.".
> If you want to ask "why" we want the "nonfull cluster list". It is to
> get to the suitable candidate cluster with that order quicker than
> scanning swap_map[].

Good.  Please add that into the patch description.  And I think that we
can reduce the description about "what" too.

>>
>> > The "swap_cluster_info" struct is two pointer bigger, because
>> > 512 swap entries share one swap struct, it has very little impact
>> > on the average memory usage per swap entry. For 1TB swapfile, the
>> > swap cluster data structure increases from 8MB to 24MB.
>> >
>> > Other than the list conversion, there is no real function change
>> > in this patch.
>> >
>> > Signed-off-by: Chris Li <chrisl@kernel.org>
>> > ---
>> >  include/linux/swap.h |  28 +++----
>> >  mm/swapfile.c        | 227 +++++++++++++--------------------------------------
>> >  2 files changed, 70 insertions(+), 185 deletions(-)
>> >
>> > diff --git a/include/linux/swap.h b/include/linux/swap.h
>> > index 3df75d62a835..cd9154a3e934 100644
>> > --- a/include/linux/swap.h
>> > +++ b/include/linux/swap.h
>> > @@ -242,23 +242,22 @@ enum {
>> >   * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All
>> >   * free clusters are organized into a list. We fetch an entry from the list to
>> >   * get a free cluster.
>> > - *
>> > - * The data field stores next cluster if the cluster is free or cluster usage
>> > - * counter otherwise. The flags field determines if a cluster is free. This is
>> > - * protected by swap_info_struct.lock.
>> >   */
>> >  struct swap_cluster_info {
>> >       spinlock_t lock;        /*
>> > -                              * Protect swap_cluster_info fields
>> > -                              * and swap_info_struct->swap_map
>> > +                              * Protect swap_cluster_info count and state
>>
>> Protect swap_cluster_info fields except 'list' ?
>
> I change it to protect the swap_cluster_info bitfields in the second patch.

Although I still prefer my version, I will not insist on that.

>>
>> > +                              * field and swap_info_struct->swap_map
>> >                                * elements correspond to the swap
>> >                                * cluster
>> >                                */
>> > -     unsigned int data:24;
>> > -     unsigned int flags:8;
>> > +     unsigned int count:12;
>> > +     unsigned int state:3;
>>
>> I still prefer normal data type over bit fields.  How about
>>
>>         u16 usage;
>>         u8  state;
>
> I don't mind the "count" rename to "usage". That is probably a better
> name. However I have another patch intended to add more bit fields in
> the cluster info struct. The second patch adds "order" and the later
> patch will add more. That is why I choose bitfield to be more condense
> with bits.

We still have space for another "u8" for "order".  It appears trivial to
change it to bit fields when necessary in the future.

>>
>> And, how about use 'usage' instead of 'count'?  Personally I think that
>> it is more clear.  But I don't have strong opinions on this.
>>
>> > +     struct list_head list;  /* Protected by swap_info_struct->lock */
>> >  };
>> > -#define CLUSTER_FLAG_FREE 1 /* This cluster is free */
>> > -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
>> > +
>> > +#define CLUSTER_STATE_FREE   1 /* This cluster is free */
>>
>> Can we use swap_cluster_info->count == 0?
>
> It is not as good considering the second patch starts to track the
> state of the cluster of per cpu struct. We will be comparing both the
> cluster->count and cluster->state.
>
>>
>> > +#define CLUSTER_STATE_PER_CPU        2 /* This cluster on per_cpu_cluster  */
>> > +
>>
>> There's no users of this state in this patch.  IMHO, it's better to
>
> Yes, there is usage of this state in this patch in the sense that, if
> you remove that state definition,
> the code can't compile due to assignment of CLUSTER_STATE_PER_CPU.

Sorry, my words were confusing.  We can move both the assignment and the
state itself to the second patch.

> There is a code test if a cluster state is not a free state, which
> excludes "CLUSTER_STATE_PER_CPU".

You mean the functionality that is equivalent to the original
cluster_set_count_flag(0, 0) and cluster_is_free()?  I think
CLUSTER_STATE_PER_CPU cannot catch all cases.  If so, I suggest that you
keep swap_cluster_info.flags and CLUSTER_FLAG_FREE in this patch and
change them in the 2nd patch.  That will make this patch more focused
and easier to review.

In general, please try to keep this patch as simple as possible to help
reviewers, because it's quite long.  For example, just convert the list
implementation and keep the other stuff unchanged as much as possible.

>> introduce a symbol with its users, otherwise, it's hard to understand
>> why do we need it and how to use it.  And, IIUC, the state isn't
>> maintained properly, it should be changed when we move the cluster off
>> the per-cpu cluster.
>
> I am actually following the same usage principle as you suggested
> here. Only the second patch starts to use the off per cpu state
> (SCANNED). That is why I introduce it there.
>
>>
>> >  /*
>> >   * The first page in the swap file is the swap header, which is always marked
>> > @@ -283,11 +282,6 @@ struct percpu_cluster {
>> >       unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
>> >  };
>> >
>> > -struct swap_cluster_list {
>> > -     struct swap_cluster_info head;
>> > -     struct swap_cluster_info tail;
>> > -};
>> > -
>> >  /*
>> >   * The in-memory structure used to track swap areas.
>> >   */
>> > @@ -300,7 +294,7 @@ struct swap_info_struct {
>> >       unsigned int    max;            /* extent of the swap_map */
>> >       unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
>> >       struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
>> > -     struct swap_cluster_list free_clusters; /* free clusters list */
>> > +     struct list_head free_clusters; /* free clusters list */
>> >       unsigned int lowest_bit;        /* index of first free in swap_map */
>> >       unsigned int highest_bit;       /* index of last free in swap_map */
>> >       unsigned int pages;             /* total of usable pages of swap */
>> > @@ -331,7 +325,7 @@ struct swap_info_struct {
>> >                                        * list.
>> >                                        */
>> >       struct work_struct discard_work; /* discard worker */
>> > -     struct swap_cluster_list discard_clusters; /* discard clusters list */
>> > +     struct list_head discard_clusters; /* discard clusters list */
>> >       struct plist_node avail_lists[]; /*
>> >                                          * entries in swap_avail_heads, one
>> >                                          * entry per node.
>> > diff --git a/mm/swapfile.c b/mm/swapfile.c
>> > index 9c6d8e557c0f..2f878b374349 100644
>> > --- a/mm/swapfile.c
>> > +++ b/mm/swapfile.c
>> > @@ -290,62 +290,9 @@ static void discard_swap_cluster(struct swap_info_struct *si,
>> >  #endif
>> >  #define LATENCY_LIMIT                256
>> >
>> > -static inline void cluster_set_flag(struct swap_cluster_info *info,
>> > -     unsigned int flag)
>> > -{
>> > -     info->flags = flag;
>> > -}
>> > -
>> > -static inline unsigned int cluster_count(struct swap_cluster_info *info)
>> > -{
>> > -     return info->data;
>> > -}
>> > -
>> > -static inline void cluster_set_count(struct swap_cluster_info *info,
>> > -                                  unsigned int c)
>> > -{
>> > -     info->data = c;
>> > -}
>> > -
>> > -static inline void cluster_set_count_flag(struct swap_cluster_info *info,
>> > -                                      unsigned int c, unsigned int f)
>> > -{
>> > -     info->flags = f;
>> > -     info->data = c;
>> > -}
>> > -
>> > -static inline unsigned int cluster_next(struct swap_cluster_info *info)
>> > -{
>> > -     return info->data;
>> > -}
>> > -
>> > -static inline void cluster_set_next(struct swap_cluster_info *info,
>> > -                                 unsigned int n)
>> > -{
>> > -     info->data = n;
>> > -}
>> > -
>> > -static inline void cluster_set_next_flag(struct swap_cluster_info *info,
>> > -                                      unsigned int n, unsigned int f)
>> > -{
>> > -     info->flags = f;
>> > -     info->data = n;
>> > -}
>> > -
>> >  static inline bool cluster_is_free(struct swap_cluster_info *info)
>> >  {
>> > -     return info->flags & CLUSTER_FLAG_FREE;
>> > -}
>> > -
>> > -static inline bool cluster_is_null(struct swap_cluster_info *info)
>> > -{
>> > -     return info->flags & CLUSTER_FLAG_NEXT_NULL;
>> > -}
>> > -
>> > -static inline void cluster_set_null(struct swap_cluster_info *info)
>> > -{
>> > -     info->flags = CLUSTER_FLAG_NEXT_NULL;
>> > -     info->data = 0;
>> > +     return info->state == CLUSTER_STATE_FREE;
>> >  }
>> >
>> >  static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
>> > @@ -394,65 +341,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
>> >               spin_unlock(&si->lock);
>> >  }
>> >
>> > -static inline bool cluster_list_empty(struct swap_cluster_list *list)
>> > -{
>> > -     return cluster_is_null(&list->head);
>> > -}
>> > -
>> > -static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
>> > -{
>> > -     return cluster_next(&list->head);
>> > -}
>> > -
>> > -static void cluster_list_init(struct swap_cluster_list *list)
>> > -{
>> > -     cluster_set_null(&list->head);
>> > -     cluster_set_null(&list->tail);
>> > -}
>> > -
>> > -static void cluster_list_add_tail(struct swap_cluster_list *list,
>> > -                               struct swap_cluster_info *ci,
>> > -                               unsigned int idx)
>> > -{
>> > -     if (cluster_list_empty(list)) {
>> > -             cluster_set_next_flag(&list->head, idx, 0);
>> > -             cluster_set_next_flag(&list->tail, idx, 0);
>> > -     } else {
>> > -             struct swap_cluster_info *ci_tail;
>> > -             unsigned int tail = cluster_next(&list->tail);
>> > -
>> > -             /*
>> > -              * Nested cluster lock, but both cluster locks are
>> > -              * only acquired when we held swap_info_struct->lock
>> > -              */
>> > -             ci_tail = ci + tail;
>> > -             spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
>> > -             cluster_set_next(ci_tail, idx);
>> > -             spin_unlock(&ci_tail->lock);
>> > -             cluster_set_next_flag(&list->tail, idx, 0);
>> > -     }
>> > -}
>> > -
>> > -static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
>> > -                                        struct swap_cluster_info *ci)
>> > -{
>> > -     unsigned int idx;
>> > -
>> > -     idx = cluster_next(&list->head);
>> > -     if (cluster_next(&list->tail) == idx) {
>> > -             cluster_set_null(&list->head);
>> > -             cluster_set_null(&list->tail);
>> > -     } else
>> > -             cluster_set_next_flag(&list->head,
>> > -                                   cluster_next(&ci[idx]), 0);
>> > -
>> > -     return idx;
>> > -}
>> > -
>> >  /* Add a cluster to discard list and schedule it to do discard */
>> >  static void swap_cluster_schedule_discard(struct swap_info_struct *si,
>> > -             unsigned int idx)
>> > +             struct swap_cluster_info *ci)
>> >  {
>> > +     unsigned int idx = ci - si->cluster_info;
>>
>> I see this multiple times in the patch, can we define a helper for this?
> Ack.
>
>>
>> >       /*
>> >        * If scan_swap_map_slots() can't find a free cluster, it will check
>> >        * si->swap_map directly. To make sure the discarding cluster isn't
>> > @@ -462,17 +355,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
>> >       memset(si->swap_map + idx * SWAPFILE_CLUSTER,
>> >                       SWAP_MAP_BAD, SWAPFILE_CLUSTER);
>> >
>> > -     cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
>> > -
>> > +     list_add_tail(&ci->list, &si->discard_clusters);
>> >       schedule_work(&si->discard_work);
>> >  }
>> >
>> > -static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
>> > +static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
>> >  {
>> > -     struct swap_cluster_info *ci = si->cluster_info;
>> > -
>> > -     cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
>> > -     cluster_list_add_tail(&si->free_clusters, ci, idx);
>> > +     ci->state = CLUSTER_STATE_FREE;
>> > +     list_add_tail(&ci->list, &si->free_clusters);
>> >  }
>> >
>> >  /*
>> > @@ -481,21 +371,22 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
>> >  */
>> >  static void swap_do_scheduled_discard(struct swap_info_struct *si)
>> >  {
>> > -     struct swap_cluster_info *info, *ci;
>> > +     struct swap_cluster_info *ci;
>> >       unsigned int idx;
>> >
>> > -     info = si->cluster_info;
>> > -
>> > -     while (!cluster_list_empty(&si->discard_clusters)) {
>> > -             idx = cluster_list_del_first(&si->discard_clusters, info);
>> > +     while (!list_empty(&si->discard_clusters)) {
>> > +             ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
>> > +             list_del(&ci->list);
>> > +             idx = ci - si->cluster_info;
>> >               spin_unlock(&si->lock);
>> >
>> >               discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
>> >                               SWAPFILE_CLUSTER);
>> >
>> >               spin_lock(&si->lock);
>> > -             ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
>> > -             __free_cluster(si, idx);
>> > +
>> > +             spin_lock(&ci->lock);
>>
>> Personally, I still prefer to use lock_cluster(), which is more readable
>> and matches unlock_cluster() below.
>
> lock_cluster() uses an index which is not matching unlock_cluster()
> which is using a pointer to cluster.

lock_cluster()/unlock_cluster() are a pair and fit the original design
well.  They use different parameters because the swap cluster is optional.

> When you get the cluster from the list, you have a cluster pointer. I
> feel it is unnecessary to convert to index then back convert to
> cluster pointer inside lock_cluster(). I actually feel using indexes
> to refer to the cluster is error prone because we also have offset.

I don't think so; it's common to use the swap offset.

>
>>
>> > +             __free_cluster(si, ci);
>> >               memset(si->swap_map + idx * SWAPFILE_CLUSTER,
>> >                               0, SWAPFILE_CLUSTER);
>> >               unlock_cluster(ci);
>> > @@ -521,20 +412,19 @@ static void swap_users_ref_free(struct percpu_ref *ref)
>> >       complete(&si->comp);
>> >  }
>> >
>> > -static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>> > +static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>> >  {
>> > -     struct swap_cluster_info *ci = si->cluster_info;
>> > +     struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
>> >
>> > -     VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
>> > -     cluster_list_del_first(&si->free_clusters, ci);
>> > -     cluster_set_count_flag(ci + idx, 0, 0);
>> > +     VM_BUG_ON(ci - si->cluster_info != idx);
>> > +     list_del(&ci->list);
>> > +     ci->count = 0;
>>
>> Do we need this now?  If we keep CLUSTER_STATE_FREE, we need to change
>> it here.
>
> Good catch, thanks for catching that. Now I realized this is actually
> problematic and tricky to get it right. Let me work on that.
>
>>
>> > +     return ci;
>> >  }
>> >
>> > -static void free_cluster(struct swap_info_struct *si, unsigned long idx)
>> > +static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
>> >  {
>> > -     struct swap_cluster_info *ci = si->cluster_info + idx;
>> > -
>> > -     VM_BUG_ON(cluster_count(ci) != 0);
>> > +     VM_BUG_ON(ci->count != 0);
>> >       /*
>> >        * If the swap is discardable, prepare discard the cluster
>> >        * instead of free it immediately. The cluster will be freed
>> > @@ -542,11 +432,11 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx)
>> >        */
>> >       if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
>> >           (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
>> > -             swap_cluster_schedule_discard(si, idx);
>> > +             swap_cluster_schedule_discard(si, ci);
>> >               return;
>> >       }
>> >
>> > -     __free_cluster(si, idx);
>> > +     __free_cluster(si, ci);
>> >  }
>> >
>> >  /*
>> > @@ -559,15 +449,15 @@ static void add_cluster_info_page(struct swap_info_struct *p,
>> >       unsigned long count)
>> >  {
>> >       unsigned long idx = page_nr / SWAPFILE_CLUSTER;
>> > +     struct swap_cluster_info *ci = cluster_info + idx;
>> >
>> >       if (!cluster_info)
>> >               return;
>> > -     if (cluster_is_free(&cluster_info[idx]))
>> > +     if (cluster_is_free(ci))
>> >               alloc_cluster(p, idx);
>> >
>> > -     VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER);
>> > -     cluster_set_count(&cluster_info[idx],
>> > -             cluster_count(&cluster_info[idx]) + count);
>> > +     VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER);
>> > +     ci->count += count;
>> >  }
>> >
>> >  /*
>> > @@ -581,24 +471,20 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
>> >  }
>> >
>> >  /*
>> > - * The cluster corresponding to page_nr decreases one usage. If the usage
>> > - * counter becomes 0, which means no page in the cluster is in using, we can
>> > - * optionally discard the cluster and add it to free cluster list.
>> > + * The cluster ci decreases one usage. If the usage counter becomes 0,
>> > + * which means no page in the cluster is in using, we can optionally discard
>> > + * the cluster and add it to free cluster list.
>> >   */
>> > -static void dec_cluster_info_page(struct swap_info_struct *p,
>> > -     struct swap_cluster_info *cluster_info, unsigned long page_nr)
>> > +static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci)
>> >  {
>> > -     unsigned long idx = page_nr / SWAPFILE_CLUSTER;
>> > -
>> > -     if (!cluster_info)
>> > +     if (!p->cluster_info)
>> >               return;
>> >
>> > -     VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
>> > -     cluster_set_count(&cluster_info[idx],
>> > -             cluster_count(&cluster_info[idx]) - 1);
>> > +     VM_BUG_ON(ci->count == 0);
>> > +     ci->count--;
>> >
>> > -     if (cluster_count(&cluster_info[idx]) == 0)
>> > -             free_cluster(p, idx);
>> > +     if (!ci->count)
>> > +             free_cluster(p, ci);
>> >  }
>> >
>> >  /*
>> > @@ -611,10 +497,10 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
>> >  {
>> >       struct percpu_cluster *percpu_cluster;
>> >       bool conflict;
>> > -
>>
>> Usually we use one blank line after local variable declaration.
> Ack.
>
>>
>> > +     struct swap_cluster_info *first = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
>> >       offset /= SWAPFILE_CLUSTER;
>> > -     conflict = !cluster_list_empty(&si->free_clusters) &&
>> > -             offset != cluster_list_first(&si->free_clusters) &&
>> > +     conflict = !list_empty(&si->free_clusters) &&
>> > +             offset !=  first - si->cluster_info &&
>> >               cluster_is_free(&si->cluster_info[offset]);
>> >
>> >       if (!conflict)
>> > @@ -655,10 +541,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
>> >       cluster = this_cpu_ptr(si->percpu_cluster);
>> >       tmp = cluster->next[order];
>> >       if (tmp == SWAP_NEXT_INVALID) {
>> > -             if (!cluster_list_empty(&si->free_clusters)) {
>> > -                     tmp = cluster_next(&si->free_clusters.head) *
>> > -                                     SWAPFILE_CLUSTER;
>> > -             } else if (!cluster_list_empty(&si->discard_clusters)) {
>> > +             if (!list_empty(&si->free_clusters)) {
>> > +                     ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
>> > +                     list_del(&ci->list);
>>
>> The free cluster is deleted from si->free_clusters now.  But later you
>> will call scan_swap_map_ssd_cluster_conflict() and may abandon the
>> cluster.  And in alloc_cluster() later, it may be deleted again.
>
> Yes, that is a bug. Thanks for catching that.
>
>>
>> > +                     spin_lock(&ci->lock);
>> > +                     ci->state = CLUSTER_STATE_PER_CPU;
>>
>> Need to change ci->state when move a cluster off the percpu_cluster.
>
> In the next patch. This patch does not use the off state yet.

But it is confusing to use the wrong state name; the real meaning is
something like CLUSTER_STATE_NON_FREE.  But as I suggested above, we can
keep swap_cluster_info.flags and CLUSTER_FLAG_FREE in this patch.

>>
>> > +                     spin_unlock(&ci->lock);
>> > +                     tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
>> > +             } else if (!list_empty(&si->discard_clusters)) {
>> >                       /*
>> >                        * we don't have free cluster but have some clusters in
>> >                        * discarding, do discard now and reclaim them, then
>> > @@ -1062,8 +952,8 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
>> >
>> >       ci = lock_cluster(si, offset);
>> >       memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
>> > -     cluster_set_count_flag(ci, 0, 0);
>> > -     free_cluster(si, idx);
>> > +     ci->count = 0;
>> > +     free_cluster(si, ci);
>> >       unlock_cluster(ci);
>> >       swap_range_free(si, offset, SWAPFILE_CLUSTER);
>> >  }
>> > @@ -1336,7 +1226,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
>> >       count = p->swap_map[offset];
>> >       VM_BUG_ON(count != SWAP_HAS_CACHE);
>> >       p->swap_map[offset] = 0;
>> > -     dec_cluster_info_page(p, p->cluster_info, offset);
>> > +     dec_cluster_info_page(p, ci);
>> >       unlock_cluster(ci);
>> >
>> >       mem_cgroup_uncharge_swap(entry, 1);
>> > @@ -3003,8 +2893,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
>> >
>> >       nr_good_pages = maxpages - 1;   /* omit header page */
>> >
>> > -     cluster_list_init(&p->free_clusters);
>> > -     cluster_list_init(&p->discard_clusters);
>> > +     INIT_LIST_HEAD(&p->free_clusters);
>> > +     INIT_LIST_HEAD(&p->discard_clusters);
>> >
>> >       for (i = 0; i < swap_header->info.nr_badpages; i++) {
>> >               unsigned int page_nr = swap_header->info.badpages[i];
>> > @@ -3055,14 +2945,15 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
>> >       for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
>> >               j = (k + col) % SWAP_CLUSTER_COLS;
>> >               for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
>> > +                     struct swap_cluster_info *ci;
>> >                       idx = i * SWAP_CLUSTER_COLS + j;
>> > +                     ci = cluster_info + idx;
>> >                       if (idx >= nr_clusters)
>> >                               continue;
>> > -                     if (cluster_count(&cluster_info[idx]))
>> > +                     if (ci->count)
>> >                               continue;
>> > -                     cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
>> > -                     cluster_list_add_tail(&p->free_clusters, cluster_info,
>> > -                                           idx);
>> > +                     ci->state = CLUSTER_STATE_FREE;
>> > +                     list_add_tail(&ci->list, &p->free_clusters);
>> >               }
>> >       }
>> >       return nr_extents;
>
> Thank you for the review and spotting the bug.

My pleasure!

--
Best Regards,
Huang, Ying
Chris Li June 18, 2024, 10:01 a.m. UTC | #4
On Tue, Jun 18, 2024 at 12:56 AM Huang, Ying <ying.huang@intel.com> wrote:
>
> Chris Li <chrisl@kernel.org> writes:
>
> > On Sun, Jun 16, 2024 at 11:21 PM Huang, Ying <ying.huang@intel.com> wrote:
> >>
> >> Hi, Chris,
> >>
> >> Chris Li <chrisl@kernel.org> writes:
> >>
> >> > Previously, the swap cluster used a cluster index as a pointer
> >> > to construct a custom single link list type "swap_cluster_list".
> >> > The next cluster pointer is shared with the cluster->count.
> >> > It prevents putting the non free cluster into a list.
> >> > Change the cluster to use the standard double link list instead.
> >> > This allows tracing the nonfull cluster in the follow up patch.
> >> >
> >> > Remove the cluster getter/setter for accessing the cluster
> >> > struct member.
> >> >
> >> > The list operation is protected by the swap_info_struct->lock.
> >> >
> >> > Change cluster code to use "struct swap_cluster_info *" to
> >> > reference the cluster rather than by using index. That is more
> >> > consistent with the list manipulation. It avoids the repeat
> >> > adding index to the cluster_info. The code is easier to understand.
> >> >
> >> > Remove the cluster next pointer is NULL flag, the double link
> >> > list can handle the empty list pretty well.
> >>
> >> The above is more about "what" instead of "why".  We can identify "what"
> >> from the patch itself.  I expect more "why".  I guess that we can reduce
> >> swap_map[] scanning if we have lists of non-full/non-free clusters.
> >
> > In my mind, the "why" is captured by " This allows tracing the nonfull
> > cluster in the follow up patch.".
> > If you want to ask "why" we want the "nonfull cluster list". It is to
> > get to the suitable candidate cluster with that order quicker than
> > scanning swap_map[].
>
> Good.  Please add that into the patch description.  And I think that we
> can reduce the description about "what" too.

Sure.

>
> >>
> >> > The "swap_cluster_info" struct is two pointer bigger, because
> >> > 512 swap entries share one swap struct, it has very little impact
> >> > on the average memory usage per swap entry. For 1TB swapfile, the
> >> > swap cluster data structure increases from 8MB to 24MB.
> >> >
> >> > Other than the list conversion, there is no real function change
> >> > in this patch.
> >> >
> >> > Signed-off-by: Chris Li <chrisl@kernel.org>
> >> > ---
> >> >  include/linux/swap.h |  28 +++----
> >> >  mm/swapfile.c        | 227 +++++++++++++--------------------------------------
> >> >  2 files changed, 70 insertions(+), 185 deletions(-)
> >> >
> >> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> >> > index 3df75d62a835..cd9154a3e934 100644
> >> > --- a/include/linux/swap.h
> >> > +++ b/include/linux/swap.h
> >> > @@ -242,23 +242,22 @@ enum {
> >> >   * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All
> >> >   * free clusters are organized into a list. We fetch an entry from the list to
> >> >   * get a free cluster.
> >> > - *
> >> > - * The data field stores next cluster if the cluster is free or cluster usage
> >> > - * counter otherwise. The flags field determines if a cluster is free. This is
> >> > - * protected by swap_info_struct.lock.
> >> >   */
> >> >  struct swap_cluster_info {
> >> >       spinlock_t lock;        /*
> >> > -                              * Protect swap_cluster_info fields
> >> > -                              * and swap_info_struct->swap_map
> >> > +                              * Protect swap_cluster_info count and state
> >>
> >> Protect swap_cluster_info fields except 'list' ?
> >
> > I change it to protect the swap_cluster_info bitfields in the second patch.
>
> Although I still prefer my version, I will not insist on that.

Sure, I actually don't have a strong preference about that. It is just a comment.

>
> >>
> >> > +                              * field and swap_info_struct->swap_map
> >> >                                * elements correspond to the swap
> >> >                                * cluster
> >> >                                */
> >> > -     unsigned int data:24;
> >> > -     unsigned int flags:8;
> >> > +     unsigned int count:12;
> >> > +     unsigned int state:3;
> >>
> >> I still prefer normal data type over bit fields.  How about
> >>
> >>         u16 usage;
> >>         u8  state;
> >
> > I don't mind the "count" rename to "usage". That is probably a better
> > name. However I have another patch intended to add more bit fields in
> > the cluster info struct. The second patch adds "order" and the later
> > patch will add more. That is why I choose bitfield to be more condense
> > with bits.
>
> We still have space for another "u8" for "order".  It appears trivial to
> change it to bit fields when necessary in the future.

We can, but I don't see the need to change from bit field to u8 and
back to bit field in the future. It is more of a personal preference
issue.

> >>
> >> And, how about use 'usage' instead of 'count'?  Personally I think that
> >> it is more clear.  But I don't have strong opinions on this.
> >>
> >> > +     struct list_head list;  /* Protected by swap_info_struct->lock */
> >> >  };
> >> > -#define CLUSTER_FLAG_FREE 1 /* This cluster is free */
> >> > -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
> >> > +
> >> > +#define CLUSTER_STATE_FREE   1 /* This cluster is free */
> >>
> >> Can we use swap_cluster_info->count == 0?
> >
> > It is not as good considering the second patch starts to track the
> > state of the cluster of per cpu struct. We will be comparing both the
> > cluster->count and cluster->state.
> >
> >>
> >> > +#define CLUSTER_STATE_PER_CPU        2 /* This cluster on per_cpu_cluster  */
> >> > +
> >>
> >> There's no users of this state in this patch.  IMHO, it's better to
> >
> > Yes, there is usage of this state in this patch in the sense that, if
> > you remove that state definition,
> > the code can't compile due to assignment of CLUSTER_STATE_PER_CPU.
>
> Sorry, my words were confusing, we can move both the assignment and the
> state itself to the second patch.
>
> > There is a code test if a cluster state is not a free state, which
> > excludes "CLUSTER_STATE_PER_CPU".
>
> You mean the functionality that is equivalent to original
> cluster_set_count_flag(0, 0) and cluster_is_free()?  I think
> CLUSTER_STATE_PER_CPU cannot catch all.  If so, I suggest you to keep
> swap_cluster_info.flags and CLUSTER_FLAG_FREE in this patch and change
> it in the 2nd patch.  That will make this patch more focused and easier
> to be reviewed.

That is one way to do it.

>
> In general, please try to keep this patch as simple as possible to help
> reviewers.  Because it's quite long.  For example, just convert the list
> implementation and keep other stuff as much as possible.
>
Let me think about it. Thanks.

> >> introduce a symbol with its users, otherwise, it's hard to understand
> >> why do we need it and how to use it.  And, IIUC, the state isn't
> >> maintained properly, it should be changed when we move the cluster off
> >> the per-cpu cluster.
> >
> > I am actually following the same usage principle as you suggested
> > here. Only the second patch starts to use the off per cpu state
> > (SCANNED). That is why I introduce it there.
> >
> >>
> >> >  /*
> >> >   * The first page in the swap file is the swap header, which is always marked
> >> > @@ -283,11 +282,6 @@ struct percpu_cluster {
> >> >       unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
> >> >  };
> >> >
> >> > -struct swap_cluster_list {
> >> > -     struct swap_cluster_info head;
> >> > -     struct swap_cluster_info tail;
> >> > -};
> >> > -
> >> >  /*
> >> >   * The in-memory structure used to track swap areas.
> >> >   */
> >> > @@ -300,7 +294,7 @@ struct swap_info_struct {
> >> >       unsigned int    max;            /* extent of the swap_map */
> >> >       unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
> >> >       struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
> >> > -     struct swap_cluster_list free_clusters; /* free clusters list */
> >> > +     struct list_head free_clusters; /* free clusters list */
> >> >       unsigned int lowest_bit;        /* index of first free in swap_map */
> >> >       unsigned int highest_bit;       /* index of last free in swap_map */
> >> >       unsigned int pages;             /* total of usable pages of swap */
> >> > @@ -331,7 +325,7 @@ struct swap_info_struct {
> >> >                                        * list.
> >> >                                        */
> >> >       struct work_struct discard_work; /* discard worker */
> >> > -     struct swap_cluster_list discard_clusters; /* discard clusters list */
> >> > +     struct list_head discard_clusters; /* discard clusters list */
> >> >       struct plist_node avail_lists[]; /*
> >> >                                          * entries in swap_avail_heads, one
> >> >                                          * entry per node.
> >> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> >> > index 9c6d8e557c0f..2f878b374349 100644
> >> > --- a/mm/swapfile.c
> >> > +++ b/mm/swapfile.c
> >> > @@ -290,62 +290,9 @@ static void discard_swap_cluster(struct swap_info_struct *si,
> >> >  #endif
> >> >  #define LATENCY_LIMIT                256
> >> >
> >> > -static inline void cluster_set_flag(struct swap_cluster_info *info,
> >> > -     unsigned int flag)
> >> > -{
> >> > -     info->flags = flag;
> >> > -}
> >> > -
> >> > -static inline unsigned int cluster_count(struct swap_cluster_info *info)
> >> > -{
> >> > -     return info->data;
> >> > -}
> >> > -
> >> > -static inline void cluster_set_count(struct swap_cluster_info *info,
> >> > -                                  unsigned int c)
> >> > -{
> >> > -     info->data = c;
> >> > -}
> >> > -
> >> > -static inline void cluster_set_count_flag(struct swap_cluster_info *info,
> >> > -                                      unsigned int c, unsigned int f)
> >> > -{
> >> > -     info->flags = f;
> >> > -     info->data = c;
> >> > -}
> >> > -
> >> > -static inline unsigned int cluster_next(struct swap_cluster_info *info)
> >> > -{
> >> > -     return info->data;
> >> > -}
> >> > -
> >> > -static inline void cluster_set_next(struct swap_cluster_info *info,
> >> > -                                 unsigned int n)
> >> > -{
> >> > -     info->data = n;
> >> > -}
> >> > -
> >> > -static inline void cluster_set_next_flag(struct swap_cluster_info *info,
> >> > -                                      unsigned int n, unsigned int f)
> >> > -{
> >> > -     info->flags = f;
> >> > -     info->data = n;
> >> > -}
> >> > -
> >> >  static inline bool cluster_is_free(struct swap_cluster_info *info)
> >> >  {
> >> > -     return info->flags & CLUSTER_FLAG_FREE;
> >> > -}
> >> > -
> >> > -static inline bool cluster_is_null(struct swap_cluster_info *info)
> >> > -{
> >> > -     return info->flags & CLUSTER_FLAG_NEXT_NULL;
> >> > -}
> >> > -
> >> > -static inline void cluster_set_null(struct swap_cluster_info *info)
> >> > -{
> >> > -     info->flags = CLUSTER_FLAG_NEXT_NULL;
> >> > -     info->data = 0;
> >> > +     return info->state == CLUSTER_STATE_FREE;
> >> >  }
> >> >
> >> >  static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
> >> > @@ -394,65 +341,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
> >> >               spin_unlock(&si->lock);
> >> >  }
> >> >
> >> > -static inline bool cluster_list_empty(struct swap_cluster_list *list)
> >> > -{
> >> > -     return cluster_is_null(&list->head);
> >> > -}
> >> > -
> >> > -static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
> >> > -{
> >> > -     return cluster_next(&list->head);
> >> > -}
> >> > -
> >> > -static void cluster_list_init(struct swap_cluster_list *list)
> >> > -{
> >> > -     cluster_set_null(&list->head);
> >> > -     cluster_set_null(&list->tail);
> >> > -}
> >> > -
> >> > -static void cluster_list_add_tail(struct swap_cluster_list *list,
> >> > -                               struct swap_cluster_info *ci,
> >> > -                               unsigned int idx)
> >> > -{
> >> > -     if (cluster_list_empty(list)) {
> >> > -             cluster_set_next_flag(&list->head, idx, 0);
> >> > -             cluster_set_next_flag(&list->tail, idx, 0);
> >> > -     } else {
> >> > -             struct swap_cluster_info *ci_tail;
> >> > -             unsigned int tail = cluster_next(&list->tail);
> >> > -
> >> > -             /*
> >> > -              * Nested cluster lock, but both cluster locks are
> >> > -              * only acquired when we held swap_info_struct->lock
> >> > -              */
> >> > -             ci_tail = ci + tail;
> >> > -             spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
> >> > -             cluster_set_next(ci_tail, idx);
> >> > -             spin_unlock(&ci_tail->lock);
> >> > -             cluster_set_next_flag(&list->tail, idx, 0);
> >> > -     }
> >> > -}
> >> > -
> >> > -static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
> >> > -                                        struct swap_cluster_info *ci)
> >> > -{
> >> > -     unsigned int idx;
> >> > -
> >> > -     idx = cluster_next(&list->head);
> >> > -     if (cluster_next(&list->tail) == idx) {
> >> > -             cluster_set_null(&list->head);
> >> > -             cluster_set_null(&list->tail);
> >> > -     } else
> >> > -             cluster_set_next_flag(&list->head,
> >> > -                                   cluster_next(&ci[idx]), 0);
> >> > -
> >> > -     return idx;
> >> > -}
> >> > -
> >> >  /* Add a cluster to discard list and schedule it to do discard */
> >> >  static void swap_cluster_schedule_discard(struct swap_info_struct *si,
> >> > -             unsigned int idx)
> >> > +             struct swap_cluster_info *ci)
> >> >  {
> >> > +     unsigned int idx = ci - si->cluster_info;
> >>
> >> I see this multiple times in the patch, can we define a helper for this?
> > Ack.
> >
> >>
> >> >       /*
> >> >        * If scan_swap_map_slots() can't find a free cluster, it will check
> >> >        * si->swap_map directly. To make sure the discarding cluster isn't
> >> > @@ -462,17 +355,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
> >> >       memset(si->swap_map + idx * SWAPFILE_CLUSTER,
> >> >                       SWAP_MAP_BAD, SWAPFILE_CLUSTER);
> >> >
> >> > -     cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
> >> > -
> >> > +     list_add_tail(&ci->list, &si->discard_clusters);
> >> >       schedule_work(&si->discard_work);
> >> >  }
> >> >
> >> > -static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
> >> > +static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
> >> >  {
> >> > -     struct swap_cluster_info *ci = si->cluster_info;
> >> > -
> >> > -     cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
> >> > -     cluster_list_add_tail(&si->free_clusters, ci, idx);
> >> > +     ci->state = CLUSTER_STATE_FREE;
> >> > +     list_add_tail(&ci->list, &si->free_clusters);
> >> >  }
> >> >
> >> >  /*
> >> > @@ -481,21 +371,22 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
> >> >  */
> >> >  static void swap_do_scheduled_discard(struct swap_info_struct *si)
> >> >  {
> >> > -     struct swap_cluster_info *info, *ci;
> >> > +     struct swap_cluster_info *ci;
> >> >       unsigned int idx;
> >> >
> >> > -     info = si->cluster_info;
> >> > -
> >> > -     while (!cluster_list_empty(&si->discard_clusters)) {
> >> > -             idx = cluster_list_del_first(&si->discard_clusters, info);
> >> > +     while (!list_empty(&si->discard_clusters)) {
> >> > +             ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
> >> > +             list_del(&ci->list);
> >> > +             idx = ci - si->cluster_info;
> >> >               spin_unlock(&si->lock);
> >> >
> >> >               discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
> >> >                               SWAPFILE_CLUSTER);
> >> >
> >> >               spin_lock(&si->lock);
> >> > -             ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
> >> > -             __free_cluster(si, idx);
> >> > +
> >> > +             spin_lock(&ci->lock);
> >>
> >> Personally, I still prefer to use lock_cluster(), which is more readable
> >> and matches unlock_cluster() below.
> >
> > lock_cluster() uses an index which is not matching unlock_cluster()
> > which is using a pointer to cluster.
>
> lock_cluster()/unlock_cluster() are a pair and fit the original design
> well.  They use different parameters because swap cluster is optional.
>
> > When you get the cluster from the list, you have a cluster pointer. I
> > feel it is unnecessary to convert to index then back convert to
> > cluster pointer inside lock_cluster(). I actually feel using indexes
> > to refer to the cluster is error prone because we also have offset.
>
> I don't think so, it's common to use swap offset.

Swap offset is not an issue, it is all over the place. The cluster
index (offset/512) is the one I try to avoid.
I have made some mistakes myself mixing up offset and index.

>
> >
> >>
> >> > +             __free_cluster(si, ci);
> >> >               memset(si->swap_map + idx * SWAPFILE_CLUSTER,
> >> >                               0, SWAPFILE_CLUSTER);
> >> >               unlock_cluster(ci);
> >> > @@ -521,20 +412,19 @@ static void swap_users_ref_free(struct percpu_ref *ref)
> >> >       complete(&si->comp);
> >> >  }
> >> >
> >> > -static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
> >> > +static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx)
> >> >  {
> >> > -     struct swap_cluster_info *ci = si->cluster_info;
> >> > +     struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
> >> >
> >> > -     VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
> >> > -     cluster_list_del_first(&si->free_clusters, ci);
> >> > -     cluster_set_count_flag(ci + idx, 0, 0);
> >> > +     VM_BUG_ON(ci - si->cluster_info != idx);
> >> > +     list_del(&ci->list);
> >> > +     ci->count = 0;
> >>
> >> Do we need this now?  If we keep CLUSTER_STATE_FREE, we need to change
> >> it here.
> >
> > Good catch, thanks for catching that. Now I realized this is actually
> > problematic and tricky to get it right. Let me work on that.
> >
> >>
> >> > +     return ci;
> >> >  }
> >> >
> >> > -static void free_cluster(struct swap_info_struct *si, unsigned long idx)
> >> > +static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
> >> >  {
> >> > -     struct swap_cluster_info *ci = si->cluster_info + idx;
> >> > -
> >> > -     VM_BUG_ON(cluster_count(ci) != 0);
> >> > +     VM_BUG_ON(ci->count != 0);
> >> >       /*
> >> >        * If the swap is discardable, prepare discard the cluster
> >> >        * instead of free it immediately. The cluster will be freed
> >> > @@ -542,11 +432,11 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx)
> >> >        */
> >> >       if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
> >> >           (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
> >> > -             swap_cluster_schedule_discard(si, idx);
> >> > +             swap_cluster_schedule_discard(si, ci);
> >> >               return;
> >> >       }
> >> >
> >> > -     __free_cluster(si, idx);
> >> > +     __free_cluster(si, ci);
> >> >  }
> >> >
> >> >  /*
> >> > @@ -559,15 +449,15 @@ static void add_cluster_info_page(struct swap_info_struct *p,
> >> >       unsigned long count)
> >> >  {
> >> >       unsigned long idx = page_nr / SWAPFILE_CLUSTER;
> >> > +     struct swap_cluster_info *ci = cluster_info + idx;
> >> >
> >> >       if (!cluster_info)
> >> >               return;
> >> > -     if (cluster_is_free(&cluster_info[idx]))
> >> > +     if (cluster_is_free(ci))
> >> >               alloc_cluster(p, idx);
> >> >
> >> > -     VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER);
> >> > -     cluster_set_count(&cluster_info[idx],
> >> > -             cluster_count(&cluster_info[idx]) + count);
> >> > +     VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER);
> >> > +     ci->count += count;
> >> >  }
> >> >
> >> >  /*
> >> > @@ -581,24 +471,20 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
> >> >  }
> >> >
> >> >  /*
> >> > - * The cluster corresponding to page_nr decreases one usage. If the usage
> >> > - * counter becomes 0, which means no page in the cluster is in using, we can
> >> > - * optionally discard the cluster and add it to free cluster list.
> >> > + * The cluster ci decreases one usage. If the usage counter becomes 0,
> >> > + * which means no page in the cluster is in using, we can optionally discard
> >> > + * the cluster and add it to free cluster list.
> >> >   */
> >> > -static void dec_cluster_info_page(struct swap_info_struct *p,
> >> > -     struct swap_cluster_info *cluster_info, unsigned long page_nr)
> >> > +static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci)
> >> >  {
> >> > -     unsigned long idx = page_nr / SWAPFILE_CLUSTER;
> >> > -
> >> > -     if (!cluster_info)
> >> > +     if (!p->cluster_info)
> >> >               return;
> >> >
> >> > -     VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
> >> > -     cluster_set_count(&cluster_info[idx],
> >> > -             cluster_count(&cluster_info[idx]) - 1);
> >> > +     VM_BUG_ON(ci->count == 0);
> >> > +     ci->count--;
> >> >
> >> > -     if (cluster_count(&cluster_info[idx]) == 0)
> >> > -             free_cluster(p, idx);
> >> > +     if (!ci->count)
> >> > +             free_cluster(p, ci);
> >> >  }
> >> >
> >> >  /*
> >> > @@ -611,10 +497,10 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
> >> >  {
> >> >       struct percpu_cluster *percpu_cluster;
> >> >       bool conflict;
> >> > -
> >>
> >> Usually we use one blank line after local variable declaration.
> > Ack.
> >
> >>
> >> > +     struct swap_cluster_info *first = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
> >> >       offset /= SWAPFILE_CLUSTER;
> >> > -     conflict = !cluster_list_empty(&si->free_clusters) &&
> >> > -             offset != cluster_list_first(&si->free_clusters) &&
> >> > +     conflict = !list_empty(&si->free_clusters) &&
> >> > +             offset !=  first - si->cluster_info &&
> >> >               cluster_is_free(&si->cluster_info[offset]);
> >> >
> >> >       if (!conflict)
> >> > @@ -655,10 +541,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
> >> >       cluster = this_cpu_ptr(si->percpu_cluster);
> >> >       tmp = cluster->next[order];
> >> >       if (tmp == SWAP_NEXT_INVALID) {
> >> > -             if (!cluster_list_empty(&si->free_clusters)) {
> >> > -                     tmp = cluster_next(&si->free_clusters.head) *
> >> > -                                     SWAPFILE_CLUSTER;
> >> > -             } else if (!cluster_list_empty(&si->discard_clusters)) {
> >> > +             if (!list_empty(&si->free_clusters)) {
> >> > +                     ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
> >> > +                     list_del(&ci->list);
> >>
> >> The free cluster is deleted from si->free_clusters now.  But later you
> >> will call scan_swap_map_ssd_cluster_conflict() and may abandon the
> >> cluster.  And in alloc_cluster() later, it may be deleted again.
> >
> > Yes, that is a bug. Thanks for catching that.
> >
> >>
> >> > +                     spin_lock(&ci->lock);
> >> > +                     ci->state = CLUSTER_STATE_PER_CPU;
> >>
> >> Need to change ci->state when move a cluster off the percpu_cluster.
> >
> > In the next patch. This patch does not use the off state yet.
>
> But that is confusing to use wrong state name, the really meaning is
> something like CLUSTER_STATE_NON_FREE.  But as I suggested above, we can

A cluster can be FREE and on the per-CPU pointer as well. That is the tricky part.
It can happen in the current code as well.

> keep swap_cluster_info.flags and CLUSTER_FLAG_FREE in this patch.

Maybe. Will consider that.

>
> >>
> >> > +                     spin_unlock(&ci->lock);
> >> > +                     tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
> >> > +             } else if (!list_empty(&si->discard_clusters)) {
> >> >                       /*
> >> >                        * we don't have free cluster but have some clusters in
> >> >                        * discarding, do discard now and reclaim them, then
> >> > @@ -1062,8 +952,8 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
> >> >
> >> >       ci = lock_cluster(si, offset);
> >> >       memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
> >> > -     cluster_set_count_flag(ci, 0, 0);
> >> > -     free_cluster(si, idx);
> >> > +     ci->count = 0;
> >> > +     free_cluster(si, ci);
> >> >       unlock_cluster(ci);
> >> >       swap_range_free(si, offset, SWAPFILE_CLUSTER);
> >> >  }
> >> > @@ -1336,7 +1226,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
> >> >       count = p->swap_map[offset];
> >> >       VM_BUG_ON(count != SWAP_HAS_CACHE);
> >> >       p->swap_map[offset] = 0;
> >> > -     dec_cluster_info_page(p, p->cluster_info, offset);
> >> > +     dec_cluster_info_page(p, ci);
> >> >       unlock_cluster(ci);
> >> >
> >> >       mem_cgroup_uncharge_swap(entry, 1);
> >> > @@ -3003,8 +2893,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
> >> >
> >> >       nr_good_pages = maxpages - 1;   /* omit header page */
> >> >
> >> > -     cluster_list_init(&p->free_clusters);
> >> > -     cluster_list_init(&p->discard_clusters);
> >> > +     INIT_LIST_HEAD(&p->free_clusters);
> >> > +     INIT_LIST_HEAD(&p->discard_clusters);
> >> >
> >> >       for (i = 0; i < swap_header->info.nr_badpages; i++) {
> >> >               unsigned int page_nr = swap_header->info.badpages[i];
> >> > @@ -3055,14 +2945,15 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
> >> >       for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
> >> >               j = (k + col) % SWAP_CLUSTER_COLS;
> >> >               for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
> >> > +                     struct swap_cluster_info *ci;
> >> >                       idx = i * SWAP_CLUSTER_COLS + j;
> >> > +                     ci = cluster_info + idx;
> >> >                       if (idx >= nr_clusters)
> >> >                               continue;
> >> > -                     if (cluster_count(&cluster_info[idx]))
> >> > +                     if (ci->count)
> >> >                               continue;
> >> > -                     cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
> >> > -                     cluster_list_add_tail(&p->free_clusters, cluster_info,
> >> > -                                           idx);
> >> > +                     ci->state = CLUSTER_STATE_FREE;
> >> > +                     list_add_tail(&ci->list, &p->free_clusters);
> >> >               }
> >> >       }
> >> >       return nr_extents;
> >
> > Thank you for the review and spotting the bug.
>
> My pleasure!

Thanks!

Chris
Huang, Ying June 19, 2024, 7:51 a.m. UTC | #5
Chris Li <chrisl@kernel.org> writes:

> On Tue, Jun 18, 2024 at 12:56 AM Huang, Ying <ying.huang@intel.com> wrote:
>>
>> Chris Li <chrisl@kernel.org> writes:
>>
>> > On Sun, Jun 16, 2024 at 11:21 PM Huang, Ying <ying.huang@intel.com> wrote:
>> >>
>> >> Hi, Chris,
>> >>
>> >> Chris Li <chrisl@kernel.org> writes:

[snip]

>> >> > diff --git a/include/linux/swap.h b/include/linux/swap.h
>> >> > index 3df75d62a835..cd9154a3e934 100644
>> >> > --- a/include/linux/swap.h
>> >> > +++ b/include/linux/swap.h
>> >> > @@ -242,23 +242,22 @@ enum {
>> >> >   * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All
>> >> >   * free clusters are organized into a list. We fetch an entry from the list to
>> >> >   * get a free cluster.
>> >> > - *
>> >> > - * The data field stores next cluster if the cluster is free or cluster usage
>> >> > - * counter otherwise. The flags field determines if a cluster is free. This is
>> >> > - * protected by swap_info_struct.lock.
>> >> >   */
>> >> >  struct swap_cluster_info {
>> >> >       spinlock_t lock;        /*
>> >> > -                              * Protect swap_cluster_info fields
>> >> > -                              * and swap_info_struct->swap_map
>> >> > +                              * Protect swap_cluster_info count and state
>> >>
>> >> Protect swap_cluster_info fields except 'list' ?
>> >
>> > I change it to protect the swap_cluster_info bitfields in the second patch.
>>
>> Although I still prefer my version, I will not insist on that.
>
> Sure, I actually don't have a strong preference about that. It is just comments.
>
>>
>> >>
>> >> > +                              * field and swap_info_struct->swap_map
>> >> >                                * elements correspond to the swap
>> >> >                                * cluster
>> >> >                                */
>> >> > -     unsigned int data:24;
>> >> > -     unsigned int flags:8;
>> >> > +     unsigned int count:12;
>> >> > +     unsigned int state:3;
>> >>
>> >> I still prefer normal data type over bit fields.  How about
>> >>
>> >>         u16 usage;
>> >>         u8  state;
>> >
>> > I don't mind the "count" rename to "usage". That is probably a better
>> > name. However I have another patch intended to add more bit fields in
>> > the cluster info struct. The second patch adds "order" and the later
>> > patch will add more. That is why I choose bitfield to be more condense
>> > with bits.
>>
>> We still have space for another "u8" for "order".  It appears trivial to
>> change it to bit fields when necessary in the future.
>
> We can, I don't see it necessary to change from bit field to u8 and
> back to bit field in the future. It is more of a personal preference
> issue.

I have to say that I don't think that it's just a personal preference.
IMO, if it's unnecessary, we shouldn't use bit fields.  You cannot
guarantee that your future changes will be merged in their current state.
So, I still think that it's better to avoid bit fields for now.

>> >>
>> >> And, how about use 'usage' instead of 'count'?  Personally I think that
>> >> it is more clear.  But I don't have strong opinions on this.
>> >>
>> >> > +     struct list_head list;  /* Protected by swap_info_struct->lock */
>> >> >  };
>> >> > -#define CLUSTER_FLAG_FREE 1 /* This cluster is free */
>> >> > -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
>> >> > +
>> >> > +#define CLUSTER_STATE_FREE   1 /* This cluster is free */
>> >>

[snip]

>> >> >  /*
>> >> > @@ -481,21 +371,22 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
>> >> >  */
>> >> >  static void swap_do_scheduled_discard(struct swap_info_struct *si)
>> >> >  {
>> >> > -     struct swap_cluster_info *info, *ci;
>> >> > +     struct swap_cluster_info *ci;
>> >> >       unsigned int idx;
>> >> >
>> >> > -     info = si->cluster_info;
>> >> > -
>> >> > -     while (!cluster_list_empty(&si->discard_clusters)) {
>> >> > -             idx = cluster_list_del_first(&si->discard_clusters, info);
>> >> > +     while (!list_empty(&si->discard_clusters)) {
>> >> > +             ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
>> >> > +             list_del(&ci->list);
>> >> > +             idx = ci - si->cluster_info;
>> >> >               spin_unlock(&si->lock);
>> >> >
>> >> >               discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
>> >> >                               SWAPFILE_CLUSTER);
>> >> >
>> >> >               spin_lock(&si->lock);
>> >> > -             ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
>> >> > -             __free_cluster(si, idx);
>> >> > +
>> >> > +             spin_lock(&ci->lock);
>> >>
>> >> Personally, I still prefer to use lock_cluster(), which is more readable
>> >> and matches unlock_cluster() below.
>> >
>> > lock_cluster() uses an index which is not matching unlock_cluster()
>> > which is using a pointer to cluster.
>>
>> lock_cluster()/unlock_cluster() are pair and fit original design
>> well.  They use different parameter because swap cluster is optional.
>>
>> > When you get the cluster from the list, you have a cluster pointer. I
>> > feel it is unnecessary to convert to index then back convert to
>> > cluster pointer inside lock_cluster(). I actually feel using indexes
>> > to refer to the cluster is error prone because we also have offset.
>>
>> I don't think so, it's common to use swap offset.
>
> Swap offset is not an issue, it is all over the place. The cluster
> index(offset/512) is the one I try to avoid.
> I have made some mistakes myself on offset vs index.

Yes.  That's not good, but it's hard to avoid too.  Can we make the
variable names more consistent?  index: cluster index, offset: swap
offset.

And, in fact, the parameter of lock_cluster() is the swap offset, not the
cluster index.

>> >
>> >>
>> >> > +             __free_cluster(si, ci);
>> >> >               memset(si->swap_map + idx * SWAPFILE_CLUSTER,
>> >> >                               0, SWAPFILE_CLUSTER);
>> >> >               unlock_cluster(ci);
>> >> > @@ -521,20 +412,19 @@ static void swap_users_ref_free(struct percpu_ref *ref)
>> >> >       complete(&si->comp);
>> >> >  }
>> >> >

[snip]

>> >> > @@ -611,10 +497,10 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
>> >> >  {
>> >> >       struct percpu_cluster *percpu_cluster;
>> >> >       bool conflict;
>> >> > -
>> >>
>> >> Usually we use one blank line after local variable declaration.
>> > Ack.
>> >
>> >>
>> >> > +     struct swap_cluster_info *first = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
>> >> >       offset /= SWAPFILE_CLUSTER;
>> >> > -     conflict = !cluster_list_empty(&si->free_clusters) &&
>> >> > -             offset != cluster_list_first(&si->free_clusters) &&
>> >> > +     conflict = !list_empty(&si->free_clusters) &&
>> >> > +             offset !=  first - si->cluster_info &&
>> >> >               cluster_is_free(&si->cluster_info[offset]);
>> >> >
>> >> >       if (!conflict)
>> >> > @@ -655,10 +541,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
>> >> >       cluster = this_cpu_ptr(si->percpu_cluster);
>> >> >       tmp = cluster->next[order];
>> >> >       if (tmp == SWAP_NEXT_INVALID) {
>> >> > -             if (!cluster_list_empty(&si->free_clusters)) {
>> >> > -                     tmp = cluster_next(&si->free_clusters.head) *
>> >> > -                                     SWAPFILE_CLUSTER;
>> >> > -             } else if (!cluster_list_empty(&si->discard_clusters)) {
>> >> > +             if (!list_empty(&si->free_clusters)) {
>> >> > +                     ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
>> >> > +                     list_del(&ci->list);
>> >>
>> >> The free cluster is deleted from si->free_clusters now.  But later you
>> >> will call scan_swap_map_ssd_cluster_conflict() and may abandon the
>> >> cluster.  And in alloc_cluster() later, it may be deleted again.
>> >
>> > Yes, that is a bug. Thanks for catching that.
>> >
>> >>
>> >> > +                     spin_lock(&ci->lock);
>> >> > +                     ci->state = CLUSTER_STATE_PER_CPU;
>> >>
>> >> Need to change ci->state when move a cluster off the percpu_cluster.
>> >
>> > In the next patch. This patch does not use the off state yet.
>>
>> But that is confusing to use wrong state name, the really meaning is
>> something like CLUSTER_STATE_NON_FREE.  But as I suggested above, we can
>
> It can be FREE and on the per cpu pointer as well. That is the tricky part.
> It can happen on the current code as well.

cluster_set_count_flag(0, 0) is called in alloc_cluster().  So, it's not
an issue in the current code.  If you need more, that shouldn't be done in
this patch.

>> keep swap_cluster_info.flags and CLUSTER_FLAG_FREE in this patch.
>
> Maybe. Will consider that.
>
>>
>> >>
>> >> > +                     spin_unlock(&ci->lock);
>> >> > +                     tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
>> >> > +             } else if (!list_empty(&si->discard_clusters)) {
>> >> >                       /*
>> >> >                        * we don't have free cluster but have some clusters in
>> >> >                        * discarding, do discard now and reclaim them, then
>> >> > @@ -1062,8 +952,8 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
>> >> >
>> >> >       ci = lock_cluster(si, offset);
>> >> >       memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
>> >> > -     cluster_set_count_flag(ci, 0, 0);
>> >> > -     free_cluster(si, idx);
>> >> > +     ci->count = 0;
>> >> > +     free_cluster(si, ci);
>> >> >       unlock_cluster(ci);
>> >> >       swap_range_free(si, offset, SWAPFILE_CLUSTER);
>> >> >  }

[snip]

--
Best Regards,
Huang, Ying
Chris Li June 19, 2024, 9:03 a.m. UTC | #6
On Wed, Jun 19, 2024 at 12:53 AM Huang, Ying <ying.huang@intel.com> wrote:
>
> Chris Li <chrisl@kernel.org> writes:
>
> > On Tue, Jun 18, 2024 at 12:56 AM Huang, Ying <ying.huang@intel.com> wrote:
> >>
> >> Chris Li <chrisl@kernel.org> writes:
> >>
> >> > On Sun, Jun 16, 2024 at 11:21 PM Huang, Ying <ying.huang@intel.com> wrote:
> >> >>
> >> >> Hi, Chris,
> >> >>
> >> >> Chris Li <chrisl@kernel.org> writes:
>
> [snip]
>
> >> >> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> >> >> > index 3df75d62a835..cd9154a3e934 100644
> >> >> > --- a/include/linux/swap.h
> >> >> > +++ b/include/linux/swap.h
> >> >> > @@ -242,23 +242,22 @@ enum {
> >> >> >   * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All
> >> >> >   * free clusters are organized into a list. We fetch an entry from the list to
> >> >> >   * get a free cluster.
> >> >> > - *
> >> >> > - * The data field stores next cluster if the cluster is free or cluster usage
> >> >> > - * counter otherwise. The flags field determines if a cluster is free. This is
> >> >> > - * protected by swap_info_struct.lock.
> >> >> >   */
> >> >> >  struct swap_cluster_info {
> >> >> >       spinlock_t lock;        /*
> >> >> > -                              * Protect swap_cluster_info fields
> >> >> > -                              * and swap_info_struct->swap_map
> >> >> > +                              * Protect swap_cluster_info count and state
> >> >>
> >> >> Protect swap_cluster_info fields except 'list' ?
> >> >
> >> > I change it to protect the swap_cluster_info bitfields in the second patch.
> >>
> >> Although I still prefer my version, I will not insist on that.
> >
> > Sure, I actually don't have a strong preference about that. It is just comments.
> >
> >>
> >> >>
> >> >> > +                              * field and swap_info_struct->swap_map
> >> >> >                                * elements correspond to the swap
> >> >> >                                * cluster
> >> >> >                                */
> >> >> > -     unsigned int data:24;
> >> >> > -     unsigned int flags:8;
> >> >> > +     unsigned int count:12;
> >> >> > +     unsigned int state:3;
> >> >>
> >> >> I still prefer normal data type over bit fields.  How about
> >> >>
> >> >>         u16 usage;
> >> >>         u8  state;
> >> >
> >> > I don't mind the "count" rename to "usage". That is probably a better
> >> > name. However I have another patch intended to add more bit fields in
> >> > the cluster info struct. The second patch adds "order" and the later
> >> > patch will add more. That is why I choose bitfield to be more condense
> >> > with bits.
> >>
> >> We still have space for another "u8" for "order".  It appears trivial to
> >> change it to bit fields when necessary in the future.
> >
> > We can, I don't see it necessary to change from bit field to u8 and
> > back to bit field in the future. It is more of a personal preference
> > issue.
>
> I have to say that I don't think that it's just a personal preference.
> IMO, if it's unnecessary, we shouldn't use bit fields.  You cannot
> guarantee that your future changes will be merged in its current state.
> So, I still think that it's better to avoid bit fields for now.

That is surprising to hear; I am not depending on any hardware
physical bit location.
Anyway, it's not too big a deal for me. I changed it to u16/u8.

> >> > When you get the cluster from the list, you have a cluster pointer. I
> >> > feel it is unnecessary to convert to index then back convert to
> >> > cluster pointer inside lock_cluster(). I actually feel using indexes
> >> > to refer to the cluster is error prone because we also have offset.
> >>
> >> I don't think so, it's common to use swap offset.
> >
> > Swap offset is not an issue, it is all over the place. The cluster
> > index(offset/512) is the one I try to avoid.
> > I have made some mistakes myself on offset vs index.
>
> Yes.  That's not good, but it's hard to be avoided too.  Can we make the
> variable name more consistent?  index: cluster index, offset: swap
> offset.
>
> And, in fact, swap offset is the parameter of lock_cluster() instead of
> cluster index.

Right, when you get the cluster pointer from the list, you can't
use lock_cluster() on it directly.

>
> >> >
> >> >>
> >> >> > +             __free_cluster(si, ci);
> >> >> >               memset(si->swap_map + idx * SWAPFILE_CLUSTER,
> >> >> >                               0, SWAPFILE_CLUSTER);
> >> >> >               unlock_cluster(ci);
> >> >> > @@ -521,20 +412,19 @@ static void swap_users_ref_free(struct percpu_ref *ref)
> >> >> >       complete(&si->comp);
> >> >> >  }
> >> >> >
>
> [snip]
>
> >> >> > @@ -611,10 +497,10 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
> >> >> >  {
> >> >> >       struct percpu_cluster *percpu_cluster;
> >> >> >       bool conflict;
> >> >> > -
> >> >>
> >> >> Usually we use one blank line after local variable declaration.
> >> > Ack.
> >> >
> >> >>
> >> >> > +     struct swap_cluster_info *first = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
> >> >> >       offset /= SWAPFILE_CLUSTER;
> >> >> > -     conflict = !cluster_list_empty(&si->free_clusters) &&
> >> >> > -             offset != cluster_list_first(&si->free_clusters) &&
> >> >> > +     conflict = !list_empty(&si->free_clusters) &&
> >> >> > +             offset !=  first - si->cluster_info &&
> >> >> >               cluster_is_free(&si->cluster_info[offset]);
> >> >> >
> >> >> >       if (!conflict)
> >> >> > @@ -655,10 +541,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
> >> >> >       cluster = this_cpu_ptr(si->percpu_cluster);
> >> >> >       tmp = cluster->next[order];
> >> >> >       if (tmp == SWAP_NEXT_INVALID) {
> >> >> > -             if (!cluster_list_empty(&si->free_clusters)) {
> >> >> > -                     tmp = cluster_next(&si->free_clusters.head) *
> >> >> > -                                     SWAPFILE_CLUSTER;
> >> >> > -             } else if (!cluster_list_empty(&si->discard_clusters)) {
> >> >> > +             if (!list_empty(&si->free_clusters)) {
> >> >> > +                     ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
> >> >> > +                     list_del(&ci->list);
> >> >>
> >> >> The free cluster is deleted from si->free_clusters now.  But later you
> >> >> will call scan_swap_map_ssd_cluster_conflict() and may abandon the
> >> >> cluster.  And in alloc_cluster() later, it may be deleted again.
> >> >
> >> > Yes, that is a bug. Thanks for catching that.
> >> >
> >> >>
> >> >> > +                     spin_lock(&ci->lock);
> >> >> > +                     ci->state = CLUSTER_STATE_PER_CPU;
> >> >>
> >> >> Need to change ci->state when move a cluster off the percpu_cluster.
> >> >
> >> > In the next patch. This patch does not use the off state yet.
> >>
> >> But that is confusing to use wrong state name, the really meaning is
> >> something like CLUSTER_STATE_NON_FREE.  But as I suggested above, we can
> >
> > It can be FREE and on the per cpu pointer as well. That is the tricky part.
> > It can happen on the current code as well.
>
> cluster_set_count_flag(0, 0) is called in alloc_cluster().  So, it's not
> an issue in current code.  If you need more, that shouldn't be done in
> this patch.

Revert to using the flags, as in V1.

Chris
diff mbox series

Patch

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3df75d62a835..cd9154a3e934 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -242,23 +242,22 @@  enum {
  * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All
  * free clusters are organized into a list. We fetch an entry from the list to
  * get a free cluster.
- *
- * The data field stores next cluster if the cluster is free or cluster usage
- * counter otherwise. The flags field determines if a cluster is free. This is
- * protected by swap_info_struct.lock.
  */
 struct swap_cluster_info {
 	spinlock_t lock;	/*
-				 * Protect swap_cluster_info fields
-				 * and swap_info_struct->swap_map
+				 * Protect swap_cluster_info count and state
+				 * field and swap_info_struct->swap_map
 				 * elements correspond to the swap
 				 * cluster
 				 */
-	unsigned int data:24;
-	unsigned int flags:8;
+	unsigned int count:12;
+	unsigned int state:3;
+	struct list_head list;	/* Protected by swap_info_struct->lock */
 };
-#define CLUSTER_FLAG_FREE 1 /* This cluster is free */
-#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
+
+#define CLUSTER_STATE_FREE	1 /* This cluster is free */
+#define CLUSTER_STATE_PER_CPU	2 /* This cluster on per_cpu_cluster  */
+
 
 /*
  * The first page in the swap file is the swap header, which is always marked
@@ -283,11 +282,6 @@  struct percpu_cluster {
 	unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
 };
 
-struct swap_cluster_list {
-	struct swap_cluster_info head;
-	struct swap_cluster_info tail;
-};
-
 /*
  * The in-memory structure used to track swap areas.
  */
@@ -300,7 +294,7 @@  struct swap_info_struct {
 	unsigned int	max;		/* extent of the swap_map */
 	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
 	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
-	struct swap_cluster_list free_clusters; /* free clusters list */
+	struct list_head free_clusters; /* free clusters list */
 	unsigned int lowest_bit;	/* index of first free in swap_map */
 	unsigned int highest_bit;	/* index of last free in swap_map */
 	unsigned int pages;		/* total of usable pages of swap */
@@ -331,7 +325,7 @@  struct swap_info_struct {
 					 * list.
 					 */
 	struct work_struct discard_work; /* discard worker */
-	struct swap_cluster_list discard_clusters; /* discard clusters list */
+	struct list_head discard_clusters; /* discard clusters list */
 	struct plist_node avail_lists[]; /*
 					   * entries in swap_avail_heads, one
 					   * entry per node.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9c6d8e557c0f..2f878b374349 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -290,62 +290,9 @@  static void discard_swap_cluster(struct swap_info_struct *si,
 #endif
 #define LATENCY_LIMIT		256
 
-static inline void cluster_set_flag(struct swap_cluster_info *info,
-	unsigned int flag)
-{
-	info->flags = flag;
-}
-
-static inline unsigned int cluster_count(struct swap_cluster_info *info)
-{
-	return info->data;
-}
-
-static inline void cluster_set_count(struct swap_cluster_info *info,
-				     unsigned int c)
-{
-	info->data = c;
-}
-
-static inline void cluster_set_count_flag(struct swap_cluster_info *info,
-					 unsigned int c, unsigned int f)
-{
-	info->flags = f;
-	info->data = c;
-}
-
-static inline unsigned int cluster_next(struct swap_cluster_info *info)
-{
-	return info->data;
-}
-
-static inline void cluster_set_next(struct swap_cluster_info *info,
-				    unsigned int n)
-{
-	info->data = n;
-}
-
-static inline void cluster_set_next_flag(struct swap_cluster_info *info,
-					 unsigned int n, unsigned int f)
-{
-	info->flags = f;
-	info->data = n;
-}
-
 static inline bool cluster_is_free(struct swap_cluster_info *info)
 {
-	return info->flags & CLUSTER_FLAG_FREE;
-}
-
-static inline bool cluster_is_null(struct swap_cluster_info *info)
-{
-	return info->flags & CLUSTER_FLAG_NEXT_NULL;
-}
-
-static inline void cluster_set_null(struct swap_cluster_info *info)
-{
-	info->flags = CLUSTER_FLAG_NEXT_NULL;
-	info->data = 0;
+	return info->state == CLUSTER_STATE_FREE;
 }
 
 static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
@@ -394,65 +341,11 @@  static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
 		spin_unlock(&si->lock);
 }
 
-static inline bool cluster_list_empty(struct swap_cluster_list *list)
-{
-	return cluster_is_null(&list->head);
-}
-
-static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
-{
-	return cluster_next(&list->head);
-}
-
-static void cluster_list_init(struct swap_cluster_list *list)
-{
-	cluster_set_null(&list->head);
-	cluster_set_null(&list->tail);
-}
-
-static void cluster_list_add_tail(struct swap_cluster_list *list,
-				  struct swap_cluster_info *ci,
-				  unsigned int idx)
-{
-	if (cluster_list_empty(list)) {
-		cluster_set_next_flag(&list->head, idx, 0);
-		cluster_set_next_flag(&list->tail, idx, 0);
-	} else {
-		struct swap_cluster_info *ci_tail;
-		unsigned int tail = cluster_next(&list->tail);
-
-		/*
-		 * Nested cluster lock, but both cluster locks are
-		 * only acquired when we held swap_info_struct->lock
-		 */
-		ci_tail = ci + tail;
-		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
-		cluster_set_next(ci_tail, idx);
-		spin_unlock(&ci_tail->lock);
-		cluster_set_next_flag(&list->tail, idx, 0);
-	}
-}
-
-static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
-					   struct swap_cluster_info *ci)
-{
-	unsigned int idx;
-
-	idx = cluster_next(&list->head);
-	if (cluster_next(&list->tail) == idx) {
-		cluster_set_null(&list->head);
-		cluster_set_null(&list->tail);
-	} else
-		cluster_set_next_flag(&list->head,
-				      cluster_next(&ci[idx]), 0);
-
-	return idx;
-}
-
 /* Add a cluster to discard list and schedule it to do discard */
 static void swap_cluster_schedule_discard(struct swap_info_struct *si,
-		unsigned int idx)
+		struct swap_cluster_info *ci)
 {
+	unsigned int idx = ci - si->cluster_info;
 	/*
 	 * If scan_swap_map_slots() can't find a free cluster, it will check
 	 * si->swap_map directly. To make sure the discarding cluster isn't
@@ -462,17 +355,14 @@  static void swap_cluster_schedule_discard(struct swap_info_struct *si,
 	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 			SWAP_MAP_BAD, SWAPFILE_CLUSTER);
 
-	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
-
+	list_add_tail(&ci->list, &si->discard_clusters);
 	schedule_work(&si->discard_work);
 }
 
-static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
+static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
 {
-	struct swap_cluster_info *ci = si->cluster_info;
-
-	cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
-	cluster_list_add_tail(&si->free_clusters, ci, idx);
+	ci->state = CLUSTER_STATE_FREE;
+	list_add_tail(&ci->list, &si->free_clusters);
 }
 
 /*
@@ -481,21 +371,22 @@  static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
 */
 static void swap_do_scheduled_discard(struct swap_info_struct *si)
 {
-	struct swap_cluster_info *info, *ci;
+	struct swap_cluster_info *ci;
 	unsigned int idx;
 
-	info = si->cluster_info;
-
-	while (!cluster_list_empty(&si->discard_clusters)) {
-		idx = cluster_list_del_first(&si->discard_clusters, info);
+	while (!list_empty(&si->discard_clusters)) {
+		ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
+		list_del(&ci->list);
+		idx = ci - si->cluster_info;
 		spin_unlock(&si->lock);
 
 		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
 				SWAPFILE_CLUSTER);
 
 		spin_lock(&si->lock);
-		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
-		__free_cluster(si, idx);
+
+		spin_lock(&ci->lock);
+		__free_cluster(si, ci);
 		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 				0, SWAPFILE_CLUSTER);
 		unlock_cluster(ci);
@@ -521,20 +412,19 @@  static void swap_users_ref_free(struct percpu_ref *ref)
 	complete(&si->comp);
 }
 
-static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
+static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx)
 {
-	struct swap_cluster_info *ci = si->cluster_info;
+	struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
 
-	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
-	cluster_list_del_first(&si->free_clusters, ci);
-	cluster_set_count_flag(ci + idx, 0, 0);
+	VM_BUG_ON(ci - si->cluster_info != idx);
+	list_del(&ci->list);
+	ci->count = 0;
+	return ci;
 }
 
-static void free_cluster(struct swap_info_struct *si, unsigned long idx)
+static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
 {
-	struct swap_cluster_info *ci = si->cluster_info + idx;
-
-	VM_BUG_ON(cluster_count(ci) != 0);
+	VM_BUG_ON(ci->count != 0);
 	/*
 	 * If the swap is discardable, prepare discard the cluster
 	 * instead of free it immediately. The cluster will be freed
@@ -542,11 +432,11 @@  static void free_cluster(struct swap_info_struct *si, unsigned long idx)
 	 */
 	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
 	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
-		swap_cluster_schedule_discard(si, idx);
+		swap_cluster_schedule_discard(si, ci);
 		return;
 	}
 
-	__free_cluster(si, idx);
+	__free_cluster(si, ci);
 }
 
 /*
@@ -559,15 +449,15 @@  static void add_cluster_info_page(struct swap_info_struct *p,
 	unsigned long count)
 {
 	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+	struct swap_cluster_info *ci = cluster_info + idx;
 
 	if (!cluster_info)
 		return;
-	if (cluster_is_free(&cluster_info[idx]))
+	if (cluster_is_free(ci))
 		alloc_cluster(p, idx);
 
-	VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER);
-	cluster_set_count(&cluster_info[idx],
-		cluster_count(&cluster_info[idx]) + count);
+	VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER);
+	ci->count += count;
 }
 
 /*
@@ -581,24 +471,20 @@  static void inc_cluster_info_page(struct swap_info_struct *p,
 }
 
 /*
- * The cluster corresponding to page_nr decreases one usage. If the usage
- * counter becomes 0, which means no page in the cluster is in using, we can
- * optionally discard the cluster and add it to free cluster list.
+ * The cluster ci decreases one usage. If the usage counter becomes 0,
+ * which means no page in the cluster is in using, we can optionally discard
+ * the cluster and add it to free cluster list.
  */
-static void dec_cluster_info_page(struct swap_info_struct *p,
-	struct swap_cluster_info *cluster_info, unsigned long page_nr)
+static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci)
 {
-	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
-
-	if (!cluster_info)
+	if (!p->cluster_info)
 		return;
 
-	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
-	cluster_set_count(&cluster_info[idx],
-		cluster_count(&cluster_info[idx]) - 1);
+	VM_BUG_ON(ci->count == 0);
+	ci->count--;
 
-	if (cluster_count(&cluster_info[idx]) == 0)
-		free_cluster(p, idx);
+	if (!ci->count)
+		free_cluster(p, ci);
 }
 
 /*
@@ -611,10 +497,10 @@  scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
 {
 	struct percpu_cluster *percpu_cluster;
 	bool conflict;
-
+	struct swap_cluster_info *first = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
 	offset /= SWAPFILE_CLUSTER;
-	conflict = !cluster_list_empty(&si->free_clusters) &&
-		offset != cluster_list_first(&si->free_clusters) &&
+	conflict = !list_empty(&si->free_clusters) &&
+		offset !=  first - si->cluster_info &&
 		cluster_is_free(&si->cluster_info[offset]);
 
 	if (!conflict)
@@ -655,10 +541,14 @@  static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 	cluster = this_cpu_ptr(si->percpu_cluster);
 	tmp = cluster->next[order];
 	if (tmp == SWAP_NEXT_INVALID) {
-		if (!cluster_list_empty(&si->free_clusters)) {
-			tmp = cluster_next(&si->free_clusters.head) *
-					SWAPFILE_CLUSTER;
-		} else if (!cluster_list_empty(&si->discard_clusters)) {
+		if (!list_empty(&si->free_clusters)) {
+			ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
+			list_del(&ci->list);
+			spin_lock(&ci->lock);
+			ci->state = CLUSTER_STATE_PER_CPU;
+			spin_unlock(&ci->lock);
+			tmp = (ci - si->cluster_info) * SWAPFILE_CLUSTER;
+		} else if (!list_empty(&si->discard_clusters)) {
 			/*
 			 * we don't have free cluster but have some clusters in
 			 * discarding, do discard now and reclaim them, then
@@ -1062,8 +952,8 @@  static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
 
 	ci = lock_cluster(si, offset);
 	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
-	cluster_set_count_flag(ci, 0, 0);
-	free_cluster(si, idx);
+	ci->count = 0;
+	free_cluster(si, ci);
 	unlock_cluster(ci);
 	swap_range_free(si, offset, SWAPFILE_CLUSTER);
 }
@@ -1336,7 +1226,7 @@  static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
 	count = p->swap_map[offset];
 	VM_BUG_ON(count != SWAP_HAS_CACHE);
 	p->swap_map[offset] = 0;
-	dec_cluster_info_page(p, p->cluster_info, offset);
+	dec_cluster_info_page(p, ci);
 	unlock_cluster(ci);
 
 	mem_cgroup_uncharge_swap(entry, 1);
@@ -3003,8 +2893,8 @@  static int setup_swap_map_and_extents(struct swap_info_struct *p,
 
 	nr_good_pages = maxpages - 1;	/* omit header page */
 
-	cluster_list_init(&p->free_clusters);
-	cluster_list_init(&p->discard_clusters);
+	INIT_LIST_HEAD(&p->free_clusters);
+	INIT_LIST_HEAD(&p->discard_clusters);
 
 	for (i = 0; i < swap_header->info.nr_badpages; i++) {
 		unsigned int page_nr = swap_header->info.badpages[i];
@@ -3055,14 +2945,15 @@  static int setup_swap_map_and_extents(struct swap_info_struct *p,
 	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
 		j = (k + col) % SWAP_CLUSTER_COLS;
 		for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
+			struct swap_cluster_info *ci;
 			idx = i * SWAP_CLUSTER_COLS + j;
+			ci = cluster_info + idx;
 			if (idx >= nr_clusters)
 				continue;
-			if (cluster_count(&cluster_info[idx]))
+			if (ci->count)
 				continue;
-			cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
-			cluster_list_add_tail(&p->free_clusters, cluster_info,
-					      idx);
+			ci->state = CLUSTER_STATE_FREE;
+			list_add_tail(&ci->list, &p->free_clusters);
 		}
 	}
 	return nr_extents;