diff mbox

[-mm,-v4,04/21] mm, THP, swap: Support PMD swap mapping in swapcache_free_cluster()

Message ID 20180622035151.6676-5-ying.huang@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Huang, Ying June 22, 2018, 3:51 a.m. UTC
From: Huang Ying <ying.huang@intel.com>

Previously, during swapout, all PMD page mapping will be split and
replaced with PTE swap mapping.  And when clearing the SWAP_HAS_CACHE
flag for the huge swap cluster in swapcache_free_cluster(), the huge
swap cluster will be split.  Now, during swapout, the PMD page mapping
will be changed to PMD swap mapping.  So when clearing the
SWAP_HAS_CACHE flag, the huge swap cluster will only be split if the
PMD swap mapping count is 0.  Otherwise, we will keep it as the huge
swap cluster.  So that we can swapin a THP as a whole later.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Zi Yan <zi.yan@cs.rutgers.edu>
Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
---
 mm/swapfile.c | 41 ++++++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 11 deletions(-)

Comments

Dave Hansen July 9, 2018, 5:11 p.m. UTC | #1
> +#ifdef CONFIG_THP_SWAP
> +static inline int cluster_swapcount(struct swap_cluster_info *ci)
> +{
> +	if (!ci || !cluster_is_huge(ci))
> +		return 0;
> +
> +	return cluster_count(ci) - SWAPFILE_CLUSTER;
> +}
> +#else
> +#define cluster_swapcount(ci)			0
> +#endif

Dumb questions, round 2:  On a CONFIG_THP_SWAP=n build, presumably,
cluster_is_huge()=0 always, so cluster_swapout() always returns 0.  Right?

So, why the #ifdef?

>  /*
>   * It's possible scan_swap_map() uses a free cluster in the middle of free
>   * cluster list. Avoiding such abuse to avoid list corruption.
> @@ -905,6 +917,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
>  	struct swap_cluster_info *ci;
>  
>  	ci = lock_cluster(si, offset);
> +	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
>  	cluster_set_count_flag(ci, 0, 0);
>  	free_cluster(si, idx);
>  	unlock_cluster(ci);

This is another case of gloriously comment-free code, but stuff that
_was_ covered in the changelog.  I'd much rather have code comments than
changelog comments.  Could we fix that?

I'm generally finding it quite hard to review this because I keep having
to refer back to the changelog to see if what you are doing matches what
you said you were doing.

> @@ -1288,24 +1301,30 @@ static void swapcache_free_cluster(swp_entry_t entry)
>  
>  	ci = lock_cluster(si, offset);
>  	VM_BUG_ON(!cluster_is_huge(ci));
> +	VM_BUG_ON(!is_cluster_offset(offset));
> +	VM_BUG_ON(cluster_count(ci) < SWAPFILE_CLUSTER);
>  	map = si->swap_map + offset;
> -	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
> -		val = map[i];
> -		VM_BUG_ON(!(val & SWAP_HAS_CACHE));
> -		if (val == SWAP_HAS_CACHE)
> -			free_entries++;
> +	if (!cluster_swapcount(ci)) {
> +		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
> +			val = map[i];
> +			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
> +			if (val == SWAP_HAS_CACHE)
> +				free_entries++;
> +		}
> +		if (free_entries != SWAPFILE_CLUSTER)
> +			cluster_clear_huge(ci);
>  	}

Also, I'll point out that cluster_swapcount() continues the horrific
naming of cluster_couunt(), not saying what the count is *of*.  The
return value doesn't help much:

	return cluster_count(ci) - SWAPFILE_CLUSTER;
Huang, Ying July 10, 2018, 6:53 a.m. UTC | #2
Dave Hansen <dave.hansen@linux.intel.com> writes:

>> +#ifdef CONFIG_THP_SWAP
>> +static inline int cluster_swapcount(struct swap_cluster_info *ci)
>> +{
>> +	if (!ci || !cluster_is_huge(ci))
>> +		return 0;
>> +
>> +	return cluster_count(ci) - SWAPFILE_CLUSTER;
>> +}
>> +#else
>> +#define cluster_swapcount(ci)			0
>> +#endif
>
> Dumb questions, round 2:  On a CONFIG_THP_SWAP=n build, presumably,
> cluster_is_huge()=0 always, so cluster_swapout() always returns 0.  Right?
>
> So, why the #ifdef?

#ifdef here is to reduce the code size for !CONFIG_THP_SWAP.

>>  /*
>>   * It's possible scan_swap_map() uses a free cluster in the middle of free
>>   * cluster list. Avoiding such abuse to avoid list corruption.
>> @@ -905,6 +917,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
>>  	struct swap_cluster_info *ci;
>>  
>>  	ci = lock_cluster(si, offset);
>> +	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
>>  	cluster_set_count_flag(ci, 0, 0);
>>  	free_cluster(si, idx);
>>  	unlock_cluster(ci);
>
> This is another case of gloriously comment-free code, but stuff that
> _was_ covered in the changelog.  I'd much rather have code comments than
> changelog comments.  Could we fix that?
>
> I'm generally finding it quite hard to review this because I keep having
> to refer back to the changelog to see if what you are doing matches what
> you said you were doing.

Sure.  Will fix this.

>> @@ -1288,24 +1301,30 @@ static void swapcache_free_cluster(swp_entry_t entry)
>>  
>>  	ci = lock_cluster(si, offset);
>>  	VM_BUG_ON(!cluster_is_huge(ci));
>> +	VM_BUG_ON(!is_cluster_offset(offset));
>> +	VM_BUG_ON(cluster_count(ci) < SWAPFILE_CLUSTER);
>>  	map = si->swap_map + offset;
>> -	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
>> -		val = map[i];
>> -		VM_BUG_ON(!(val & SWAP_HAS_CACHE));
>> -		if (val == SWAP_HAS_CACHE)
>> -			free_entries++;
>> +	if (!cluster_swapcount(ci)) {
>> +		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
>> +			val = map[i];
>> +			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
>> +			if (val == SWAP_HAS_CACHE)
>> +				free_entries++;
>> +		}
>> +		if (free_entries != SWAPFILE_CLUSTER)
>> +			cluster_clear_huge(ci);
>>  	}
>
> Also, I'll point out that cluster_swapcount() continues the horrific
> naming of cluster_couunt(), not saying what the count is *of*.  The
> return value doesn't help much:
>
> 	return cluster_count(ci) - SWAPFILE_CLUSTER;

We have page_swapcount() for page, swp_swapcount() for swap entry.
cluster_swapcount() tries to mimic them for swap cluster.  But I am not
good at naming in general.  What's your suggestion?

Best Regards,
Huang, Ying
Dave Hansen July 10, 2018, 1:54 p.m. UTC | #3
On 07/09/2018 11:53 PM, Huang, Ying wrote:
> Dave Hansen <dave.hansen@linux.intel.com> writes:
>>> +#ifdef CONFIG_THP_SWAP
>>> +static inline int cluster_swapcount(struct swap_cluster_info *ci)
>>> +{
>>> +	if (!ci || !cluster_is_huge(ci))
>>> +		return 0;
>>> +
>>> +	return cluster_count(ci) - SWAPFILE_CLUSTER;
>>> +}
>>> +#else
>>> +#define cluster_swapcount(ci)			0
>>> +#endif
>>
>> Dumb questions, round 2:  On a CONFIG_THP_SWAP=n build, presumably,
>> cluster_is_huge()=0 always, so cluster_swapout() always returns 0.  Right?
>>
>> So, why the #ifdef?
> 
> #ifdef here is to reduce the code size for !CONFIG_THP_SWAP.

I'd just remove the !CONFIG_THP_SWAP version entirely.

>>> @@ -1288,24 +1301,30 @@ static void swapcache_free_cluster(swp_entry_t entry)
>>>  
>>>  	ci = lock_cluster(si, offset);
>>>  	VM_BUG_ON(!cluster_is_huge(ci));
>>> +	VM_BUG_ON(!is_cluster_offset(offset));
>>> +	VM_BUG_ON(cluster_count(ci) < SWAPFILE_CLUSTER);
>>>  	map = si->swap_map + offset;
>>> -	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
>>> -		val = map[i];
>>> -		VM_BUG_ON(!(val & SWAP_HAS_CACHE));
>>> -		if (val == SWAP_HAS_CACHE)
>>> -			free_entries++;
>>> +	if (!cluster_swapcount(ci)) {
>>> +		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
>>> +			val = map[i];
>>> +			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
>>> +			if (val == SWAP_HAS_CACHE)
>>> +				free_entries++;
>>> +		}
>>> +		if (free_entries != SWAPFILE_CLUSTER)
>>> +			cluster_clear_huge(ci);
>>>  	}
>>
>> Also, I'll point out that cluster_swapcount() continues the horrific
>> naming of cluster_couunt(), not saying what the count is *of*.  The
>> return value doesn't help much:
>>
>> 	return cluster_count(ci) - SWAPFILE_CLUSTER;
> 
> We have page_swapcount() for page, swp_swapcount() for swap entry.
> cluster_swapcount() tries to mimic them for swap cluster.  But I am not
> good at naming in general.  What's your suggestion?

I don't have a suggestion because I haven't the foggiest idea what it is
doing. :)

Is it the number of instantiated swap cache pages that are referring to
this cluster?  Is it just huge pages?  Huge and small?  One refcount per
huge page, or 512?
Huang, Ying July 11, 2018, 1:08 a.m. UTC | #4
Dave Hansen <dave.hansen@linux.intel.com> writes:

> On 07/09/2018 11:53 PM, Huang, Ying wrote:
>> Dave Hansen <dave.hansen@linux.intel.com> writes:
>>>> +#ifdef CONFIG_THP_SWAP
>>>> +static inline int cluster_swapcount(struct swap_cluster_info *ci)
>>>> +{
>>>> +	if (!ci || !cluster_is_huge(ci))
>>>> +		return 0;
>>>> +
>>>> +	return cluster_count(ci) - SWAPFILE_CLUSTER;
>>>> +}
>>>> +#else
>>>> +#define cluster_swapcount(ci)			0
>>>> +#endif
>>>
>>> Dumb questions, round 2:  On a CONFIG_THP_SWAP=n build, presumably,
>>> cluster_is_huge()=0 always, so cluster_swapout() always returns 0.  Right?
>>>
>>> So, why the #ifdef?
>> 
>> #ifdef here is to reduce the code size for !CONFIG_THP_SWAP.
>
> I'd just remove the !CONFIG_THP_SWAP version entirely.

Sure.  Unless there are some build errors after some other refactoring.

>>>> @@ -1288,24 +1301,30 @@ static void swapcache_free_cluster(swp_entry_t entry)
>>>>  
>>>>  	ci = lock_cluster(si, offset);
>>>>  	VM_BUG_ON(!cluster_is_huge(ci));
>>>> +	VM_BUG_ON(!is_cluster_offset(offset));
>>>> +	VM_BUG_ON(cluster_count(ci) < SWAPFILE_CLUSTER);
>>>>  	map = si->swap_map + offset;
>>>> -	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
>>>> -		val = map[i];
>>>> -		VM_BUG_ON(!(val & SWAP_HAS_CACHE));
>>>> -		if (val == SWAP_HAS_CACHE)
>>>> -			free_entries++;
>>>> +	if (!cluster_swapcount(ci)) {
>>>> +		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
>>>> +			val = map[i];
>>>> +			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
>>>> +			if (val == SWAP_HAS_CACHE)
>>>> +				free_entries++;
>>>> +		}
>>>> +		if (free_entries != SWAPFILE_CLUSTER)
>>>> +			cluster_clear_huge(ci);
>>>>  	}
>>>
>>> Also, I'll point out that cluster_swapcount() continues the horrific
>>> naming of cluster_couunt(), not saying what the count is *of*.  The
>>> return value doesn't help much:
>>>
>>> 	return cluster_count(ci) - SWAPFILE_CLUSTER;
>> 
>> We have page_swapcount() for page, swp_swapcount() for swap entry.
>> cluster_swapcount() tries to mimic them for swap cluster.  But I am not
>> good at naming in general.  What's your suggestion?
>
> I don't have a suggestion because I haven't the foggiest idea what it is
> doing. :)
>
> Is it the number of instantiated swap cache pages that are referring to
> this cluster?  Is it just huge pages?  Huge and small?  One refcount per
> huge page, or 512?

page_swapcount() and swp_swapcount() for a normal swap entry is the
number of PTE swap mapping to the normal swap entry.

cluster_swapcount() for a huge swap entry (or huge swap cluster) is the
number of PMD swap mapping to the huge swap entry.

Originally, cluster_count is the reference count of the swap entries in
the swap cluster (that is, how many entries are in use).  Now, it is the
sum of the reference count of the swap entries in the swap cluster and
the number of PMD swap mapping to the huge swap entry.

I need to add comments for this at least.

Best Regards,
Huang, Ying
diff mbox

Patch

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 48e2c54385ee..36f4b6451360 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -514,6 +514,18 @@  static void dec_cluster_info_page(struct swap_info_struct *p,
 		free_cluster(p, idx);
 }
 
+#ifdef CONFIG_THP_SWAP
+static inline int cluster_swapcount(struct swap_cluster_info *ci)
+{
+	if (!ci || !cluster_is_huge(ci))
+		return 0;
+
+	return cluster_count(ci) - SWAPFILE_CLUSTER;
+}
+#else
+#define cluster_swapcount(ci)			0
+#endif
+
 /*
  * It's possible scan_swap_map() uses a free cluster in the middle of free
  * cluster list. Avoiding such abuse to avoid list corruption.
@@ -905,6 +917,7 @@  static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
 	struct swap_cluster_info *ci;
 
 	ci = lock_cluster(si, offset);
+	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
 	cluster_set_count_flag(ci, 0, 0);
 	free_cluster(si, idx);
 	unlock_cluster(ci);
@@ -1288,24 +1301,30 @@  static void swapcache_free_cluster(swp_entry_t entry)
 
 	ci = lock_cluster(si, offset);
 	VM_BUG_ON(!cluster_is_huge(ci));
+	VM_BUG_ON(!is_cluster_offset(offset));
+	VM_BUG_ON(cluster_count(ci) < SWAPFILE_CLUSTER);
 	map = si->swap_map + offset;
-	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-		val = map[i];
-		VM_BUG_ON(!(val & SWAP_HAS_CACHE));
-		if (val == SWAP_HAS_CACHE)
-			free_entries++;
+	if (!cluster_swapcount(ci)) {
+		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+			val = map[i];
+			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+			if (val == SWAP_HAS_CACHE)
+				free_entries++;
+		}
+		if (free_entries != SWAPFILE_CLUSTER)
+			cluster_clear_huge(ci);
 	}
 	if (!free_entries) {
-		for (i = 0; i < SWAPFILE_CLUSTER; i++)
-			map[i] &= ~SWAP_HAS_CACHE;
+		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+			val = map[i];
+			VM_BUG_ON(!(val & SWAP_HAS_CACHE) ||
+				  val == SWAP_HAS_CACHE);
+			map[i] = val & ~SWAP_HAS_CACHE;
+		}
 	}
-	cluster_clear_huge(ci);
 	unlock_cluster(ci);
 	if (free_entries == SWAPFILE_CLUSTER) {
 		spin_lock(&si->lock);
-		ci = lock_cluster(si, offset);
-		memset(map, 0, SWAPFILE_CLUSTER);
-		unlock_cluster(ci);
 		mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
 		swap_free_cluster(si, idx);
 		spin_unlock(&si->lock);