diff mbox series

[v4,4/5] mm/shmem: fix infinite loop when swap in shmem error at swapoff time

Message ID 20220519125030.21486-5-linmiaohe@huawei.com (mailing list archive)
State New
Headers show
Series A few fixup patches for mm | expand

Commit Message

Miaohe Lin May 19, 2022, 12:50 p.m. UTC
When swap in shmem error at swapoff time, there would be a infinite loop
in the while loop in shmem_unuse_inode(). It's because swapin error is
deliberately ignored now and thus info->swapped will never reach 0. So
we can't escape the loop in shmem_unuse().

In order to fix the issue, swapin_error entry is stored in the mapping
when swapin error occurs. So the swapcache page can be freed and the
user won't end up with a permanently mounted swap because a sector is
bad. If the page is accessed later, the user process will be killed
so that corrupted data is never consumed. On the other hand, if the
page is never accessed, the user won't even notice it.

Reported-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
 mm/shmem.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

Comments

HORIGUCHI NAOYA(堀口 直也) May 20, 2022, 6:34 a.m. UTC | #1
On Thu, May 19, 2022 at 08:50:29PM +0800, Miaohe Lin wrote:
> When swap in shmem error at swapoff time, there would be a infinite loop
> in the while loop in shmem_unuse_inode(). It's because swapin error is
> deliberately ignored now and thus info->swapped will never reach 0. So
> we can't escape the loop in shmem_unuse().
> 
> In order to fix the issue, swapin_error entry is stored in the mapping
> when swapin error occurs. So the swapcache page can be freed and the
> user won't end up with a permanently mounted swap because a sector is
> bad. If the page is accessed later, the user process will be killed
> so that corrupted data is never consumed. On the other hand, if the
> page is never accessed, the user won't even notice it.
> 
> Reported-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>

Hi Miaohe,

Thank you for the update.  I might miss something, but I still see the same
problem (I checked it on mm-everything-2022-05-19-00-03 + this patchset).

This patch has the effect to change the return value of shmem_swapin_folio(),
-EIO (without this patch) to -EEXIST (with this patch).
But shmem_unuse_swap_entries() checks neither, so no change from caller's view point.
Maybe breaking in errors (rather than ENOMEM) in for loop in shmem_unuse_swap_entries()
solves the issue?  I briefly checked with the below change, then swapoff can return
with failure.

@@ -1222,7 +1222,7 @@ static int shmem_unuse_swap_entries(struct inode *inode,
                        folio_put(folio);
                        ret++;
                }
-               if (error == -ENOMEM)
+               if (error < 0)
                        break;
                error = 0;
        }

> ---
>  mm/shmem.c | 39 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 39 insertions(+)
> 
> diff --git a/mm/shmem.c b/mm/shmem.c
> index d3c7970e0179..d55dd972023a 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1175,6 +1175,10 @@ static int shmem_find_swap_entries(struct address_space *mapping,
>  			continue;
>  
>  		entry = radix_to_swp_entry(folio);
> +		/*
> +		 * swapin error entries can be found in the mapping. But they're
> +		 * deliberately ignored here as we've done everything we can do.
> +		 */
>  		if (swp_type(entry) != type)
>  			continue;
>  
> @@ -1672,6 +1676,36 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
>  	return error;
>  }
>  
> +static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
> +					 struct folio *folio, swp_entry_t swap)
> +{
> +	struct address_space *mapping = inode->i_mapping;
> +	struct shmem_inode_info *info = SHMEM_I(inode);
> +	swp_entry_t swapin_error;
> +	void *old;
> +
> +	swapin_error = make_swapin_error_entry(&folio->page);
> +	old = xa_cmpxchg_irq(&mapping->i_pages, index,
> +			     swp_to_radix_entry(swap),
> +			     swp_to_radix_entry(swapin_error), 0);
> +	if (old != swp_to_radix_entry(swap))
> +		return;
> +
> +	folio_wait_writeback(folio);
> +	delete_from_swap_cache(&folio->page);
> +	spin_lock_irq(&info->lock);
> +	/*
> +	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
> +	 * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
> +	 * shmem_evict_inode.
> +	 */
> +	info->alloced--;
> +	info->swapped--;
> +	shmem_recalc_inode(inode);
> +	spin_unlock_irq(&info->lock);
> +	swap_free(swap);
> +}
> +
>  /*
>   * Swap in the page pointed to by *pagep.
>   * Caller has to make sure that *pagep contains a valid swapped page.

(off-topic a little) BTW, the comment on shmem_swapin_folio() still mentions
*pagep, but maybe it can be updated to *foliop.

Thanks,
Naoya Horiguchi

> @@ -1695,6 +1729,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  	swap = radix_to_swp_entry(*foliop);
>  	*foliop = NULL;
>  
> +	if (is_swapin_error_entry(swap))
> +		return -EIO;
> +
>  	/* Look it up and read it in.. */
>  	page = lookup_swap_cache(swap, NULL, 0);
>  	if (!page) {
> @@ -1762,6 +1799,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  failed:
>  	if (!shmem_confirm_swap(mapping, index, swap))
>  		error = -EEXIST;
> +	if (error == -EIO)
> +		shmem_set_folio_swapin_error(inode, index, folio, swap);
Miaohe Lin May 20, 2022, 8:17 a.m. UTC | #2
On 2022/5/20 14:34, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Thu, May 19, 2022 at 08:50:29PM +0800, Miaohe Lin wrote:
>> When swap in shmem error at swapoff time, there would be a infinite loop
>> in the while loop in shmem_unuse_inode(). It's because swapin error is
>> deliberately ignored now and thus info->swapped will never reach 0. So
>> we can't escape the loop in shmem_unuse().
>>
>> In order to fix the issue, swapin_error entry is stored in the mapping
>> when swapin error occurs. So the swapcache page can be freed and the
>> user won't end up with a permanently mounted swap because a sector is
>> bad. If the page is accessed later, the user process will be killed
>> so that corrupted data is never consumed. On the other hand, if the
>> page is never accessed, the user won't even notice it.
>>
>> Reported-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> 
> Hi Miaohe,
> 
> Thank you for the update.  I might miss something, but I still see the same
> problem (I checked it on mm-everything-2022-05-19-00-03 + this patchset).

I was testing this patch on my 5.10 kernel. I reproduced the problem in my env and
fixed it. It seems there might be some critical difference though I checked that by
reviewing the code... Sorry. :(

> 
> This patch has the effect to change the return value of shmem_swapin_folio(),
> -EIO (without this patch) to -EEXIST (with this patch).

In fact, I didn't change the return value from -EIO to -EEXIST:

@@ -1762,6 +1799,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 failed:
 	if (!shmem_confirm_swap(mapping, index, swap))
 		error = -EEXIST;
+	if (error == -EIO)
+		shmem_set_folio_swapin_error(inode, index, folio, swap)

> But shmem_unuse_swap_entries() checks neither, so no change from caller's view point.
> Maybe breaking in errors (rather than ENOMEM) in for loop in shmem_unuse_swap_entries()
> solves the issue?  I briefly checked with the below change, then swapoff can return
> with failure.
> 
> @@ -1222,7 +1222,7 @@ static int shmem_unuse_swap_entries(struct inode *inode,
>                         folio_put(folio);
>                         ret++;
>                 }
> -               if (error == -ENOMEM)
> +               if (error < 0)
>                         break;
>                 error = 0;
>         }

Yes, this is the simplest and straightforward way to fix the issue. But it has the side effect
that user will end up with a permanently mounted swap just because a sector is bad. That might
be somewhat unacceptable?

> 
>> ---
>>  mm/shmem.c | 39 +++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 39 insertions(+)
>>
>> diff --git a/mm/shmem.c b/mm/shmem.c
>> index d3c7970e0179..d55dd972023a 100644
>> --- a/mm/shmem.c
>> +++ b/mm/shmem.c
>> @@ -1175,6 +1175,10 @@ static int shmem_find_swap_entries(struct address_space *mapping,
>>  			continue;
>>  
>>  		entry = radix_to_swp_entry(folio);
>> +		/*
>> +		 * swapin error entries can be found in the mapping. But they're
>> +		 * deliberately ignored here as we've done everything we can do.
>> +		 */
>>  		if (swp_type(entry) != type)
>>  			continue;
>>  
>> @@ -1672,6 +1676,36 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
>>  	return error;
>>  }
>>  
>> +static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
>> +					 struct folio *folio, swp_entry_t swap)
>> +{
>> +	struct address_space *mapping = inode->i_mapping;
>> +	struct shmem_inode_info *info = SHMEM_I(inode);
>> +	swp_entry_t swapin_error;
>> +	void *old;
>> +
>> +	swapin_error = make_swapin_error_entry(&folio->page);
>> +	old = xa_cmpxchg_irq(&mapping->i_pages, index,
>> +			     swp_to_radix_entry(swap),
>> +			     swp_to_radix_entry(swapin_error), 0);
>> +	if (old != swp_to_radix_entry(swap))
>> +		return;
>> +
>> +	folio_wait_writeback(folio);
>> +	delete_from_swap_cache(&folio->page);
>> +	spin_lock_irq(&info->lock);
>> +	/*
>> +	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
>> +	 * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
>> +	 * shmem_evict_inode.
>> +	 */
>> +	info->alloced--;
>> +	info->swapped--;
>> +	shmem_recalc_inode(inode);
>> +	spin_unlock_irq(&info->lock);
>> +	swap_free(swap);
>> +}
>> +
>>  /*
>>   * Swap in the page pointed to by *pagep.
>>   * Caller has to make sure that *pagep contains a valid swapped page.
> 
> (off-topic a little) BTW, the comment on shmem_swapin_folio() still mentions
> *pagep, but maybe it can be updated to *foliop.

Will do it.

> 
> Thanks,
> Naoya Horiguchi

Many thanks for comment and test ! :)

> 
>> @@ -1695,6 +1729,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>>  	swap = radix_to_swp_entry(*foliop);
>>  	*foliop = NULL;
>>  
>> +	if (is_swapin_error_entry(swap))
>> +		return -EIO;
>> +
>>  	/* Look it up and read it in.. */
>>  	page = lookup_swap_cache(swap, NULL, 0);
>>  	if (!page) {
>> @@ -1762,6 +1799,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>>  failed:
>>  	if (!shmem_confirm_swap(mapping, index, swap))
>>  		error = -EEXIST;
>> +	if (error == -EIO)
>> +		shmem_set_folio_swapin_error(inode, index, folio, swap);
Miaohe Lin May 21, 2022, 9:34 a.m. UTC | #3
On 2022/5/20 14:34, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Thu, May 19, 2022 at 08:50:29PM +0800, Miaohe Lin wrote:
>> When swap in shmem error at swapoff time, there would be a infinite loop
>> in the while loop in shmem_unuse_inode(). It's because swapin error is
>> deliberately ignored now and thus info->swapped will never reach 0. So
>> we can't escape the loop in shmem_unuse().
>>
>> In order to fix the issue, swapin_error entry is stored in the mapping
>> when swapin error occurs. So the swapcache page can be freed and the
>> user won't end up with a permanently mounted swap because a sector is
>> bad. If the page is accessed later, the user process will be killed
>> so that corrupted data is never consumed. On the other hand, if the
>> page is never accessed, the user won't even notice it.
>>
>> Reported-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> 
> Hi Miaohe,
> 
> Thank you for the update.  I might miss something, but I still see the same
> problem (I checked it on mm-everything-2022-05-19-00-03 + this patchset).

Hi Naoya,
I reproduce the issue in the linux-next-next-20220520 version. And I found even if
I *do not inject the swapin error*, the deadloop still occurs. After investigating the
code for a long while, I found the root cause:

diff --git a/mm/shmem.c b/mm/shmem.c
index d55dd972023a..6d23ed4d23cb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1182,7 +1182,7 @@ static int shmem_find_swap_entries(struct address_space *mapping,
                if (swp_type(entry) != type)
                        continue;

-               indices[ret] = xas.xa_index;
+               indices[ret++] = xas.xa_index;
                if (!folio_batch_add(fbatch, folio))
                        break;

The origin code does not increment the ret value when a folio is found. I will send a patch
to fix this next week. Thanks! :)

BTW: With the above change, deadloop doesn't occur when swapin error is injected. I will take
a more close look at next week.

Thanks!

> 
> This patch has the effect to change the return value of shmem_swapin_folio(),
> -EIO (without this patch) to -EEXIST (with this patch).
> But shmem_unuse_swap_entries() checks neither, so no change from caller's view point.
> Maybe breaking in errors (rather than ENOMEM) in for loop in shmem_unuse_swap_entries()
> solves the issue?  I briefly checked with the below change, then swapoff can return
> with failure.
> 
> @@ -1222,7 +1222,7 @@ static int shmem_unuse_swap_entries(struct inode *inode,
>                         folio_put(folio);
>                         ret++;
>                 }
> -               if (error == -ENOMEM)
> +               if (error < 0)
>                         break;
>                 error = 0;
>         }
> 
>> ---
>>  mm/shmem.c | 39 +++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 39 insertions(+)
>>
>> diff --git a/mm/shmem.c b/mm/shmem.c
>> index d3c7970e0179..d55dd972023a 100644
>> --- a/mm/shmem.c
>> +++ b/mm/shmem.c
>> @@ -1175,6 +1175,10 @@ static int shmem_find_swap_entries(struct address_space *mapping,
>>  			continue;
>>  
>>  		entry = radix_to_swp_entry(folio);
>> +		/*
>> +		 * swapin error entries can be found in the mapping. But they're
>> +		 * deliberately ignored here as we've done everything we can do.
>> +		 */
>>  		if (swp_type(entry) != type)
>>  			continue;
>>  
>> @@ -1672,6 +1676,36 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
>>  	return error;
>>  }
>>  
>> +static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
>> +					 struct folio *folio, swp_entry_t swap)
>> +{
>> +	struct address_space *mapping = inode->i_mapping;
>> +	struct shmem_inode_info *info = SHMEM_I(inode);
>> +	swp_entry_t swapin_error;
>> +	void *old;
>> +
>> +	swapin_error = make_swapin_error_entry(&folio->page);
>> +	old = xa_cmpxchg_irq(&mapping->i_pages, index,
>> +			     swp_to_radix_entry(swap),
>> +			     swp_to_radix_entry(swapin_error), 0);
>> +	if (old != swp_to_radix_entry(swap))
>> +		return;
>> +
>> +	folio_wait_writeback(folio);
>> +	delete_from_swap_cache(&folio->page);
>> +	spin_lock_irq(&info->lock);
>> +	/*
>> +	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
>> +	 * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
>> +	 * shmem_evict_inode.
>> +	 */
>> +	info->alloced--;
>> +	info->swapped--;
>> +	shmem_recalc_inode(inode);
>> +	spin_unlock_irq(&info->lock);
>> +	swap_free(swap);
>> +}
>> +
>>  /*
>>   * Swap in the page pointed to by *pagep.
>>   * Caller has to make sure that *pagep contains a valid swapped page.
> 
> (off-topic a little) BTW, the comment on shmem_swapin_folio() still mentions
> *pagep, but maybe it can be updated to *foliop.
> 
> Thanks,
> Naoya Horiguchi
> 
>> @@ -1695,6 +1729,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>>  	swap = radix_to_swp_entry(*foliop);
>>  	*foliop = NULL;
>>  
>> +	if (is_swapin_error_entry(swap))
>> +		return -EIO;
>> +
>>  	/* Look it up and read it in.. */
>>  	page = lookup_swap_cache(swap, NULL, 0);
>>  	if (!page) {
>> @@ -1762,6 +1799,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>>  failed:
>>  	if (!shmem_confirm_swap(mapping, index, swap))
>>  		error = -EEXIST;
>> +	if (error == -EIO)
>> +		shmem_set_folio_swapin_error(inode, index, folio, swap);
HORIGUCHI NAOYA(堀口 直也) May 22, 2022, 11:53 p.m. UTC | #4
On Fri, May 20, 2022 at 04:17:45PM +0800, Miaohe Lin wrote:
> On 2022/5/20 14:34, HORIGUCHI NAOYA(堀口 直也) wrote:
> > On Thu, May 19, 2022 at 08:50:29PM +0800, Miaohe Lin wrote:
> >> When swap in shmem error at swapoff time, there would be a infinite loop
> >> in the while loop in shmem_unuse_inode(). It's because swapin error is
> >> deliberately ignored now and thus info->swapped will never reach 0. So
> >> we can't escape the loop in shmem_unuse().
> >>
> >> In order to fix the issue, swapin_error entry is stored in the mapping
> >> when swapin error occurs. So the swapcache page can be freed and the
> >> user won't end up with a permanently mounted swap because a sector is
> >> bad. If the page is accessed later, the user process will be killed
> >> so that corrupted data is never consumed. On the other hand, if the
> >> page is never accessed, the user won't even notice it.
> >>
> >> Reported-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
> >> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> > 
> > Hi Miaohe,
> > 
> > Thank you for the update.  I might miss something, but I still see the same
> > problem (I checked it on mm-everything-2022-05-19-00-03 + this patchset).
> 
> I was testing this patch on my 5.10 kernel. I reproduced the problem in my env and
> fixed it. It seems there might be some critical difference though I checked that by
> reviewing the code... Sorry. :(
> 
> > 
> > This patch has the effect to change the return value of shmem_swapin_folio(),
> > -EIO (without this patch) to -EEXIST (with this patch).
> 
> In fact, I didn't change the return value from -EIO to -EEXIST:
> 
> @@ -1762,6 +1799,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  failed:
>  	if (!shmem_confirm_swap(mapping, index, swap))
>  		error = -EEXIST;
> +	if (error == -EIO)
> +		shmem_set_folio_swapin_error(inode, index, folio, swap)
> 
> > But shmem_unuse_swap_entries() checks neither, so no change from caller's view point.
> > Maybe breaking in errors (rather than ENOMEM) in for loop in shmem_unuse_swap_entries()
> > solves the issue?  I briefly checked with the below change, then swapoff can return
> > with failure.
> > 
> > @@ -1222,7 +1222,7 @@ static int shmem_unuse_swap_entries(struct inode *inode,
> >                         folio_put(folio);
> >                         ret++;
> >                 }
> > -               if (error == -ENOMEM)
> > +               if (error < 0)
> >                         break;
> >                 error = 0;
> >         }
> 
> Yes, this is the simplest and straightforward way to fix the issue. But it has the side effect
> that user will end up with a permanently mounted swap just because a sector is bad. That might
> be somewhat unacceptable?

Ah, you're right, swapoff should return with success instead of with
failure.  I tried the fix in your another email, and that makes swapoff
return with success, so your fix looks better than mine.

Thanks,
Naoya Horiguchi
Miaohe Lin May 23, 2022, 3:01 a.m. UTC | #5
On 2022/5/23 7:53, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Fri, May 20, 2022 at 04:17:45PM +0800, Miaohe Lin wrote:
>> On 2022/5/20 14:34, HORIGUCHI NAOYA(堀口 直也) wrote:
>>> On Thu, May 19, 2022 at 08:50:29PM +0800, Miaohe Lin wrote:
>>>> When swap in shmem error at swapoff time, there would be a infinite loop
>>>> in the while loop in shmem_unuse_inode(). It's because swapin error is
>>>> deliberately ignored now and thus info->swapped will never reach 0. So
>>>> we can't escape the loop in shmem_unuse().
>>>>
>>>> In order to fix the issue, swapin_error entry is stored in the mapping
>>>> when swapin error occurs. So the swapcache page can be freed and the
>>>> user won't end up with a permanently mounted swap because a sector is
>>>> bad. If the page is accessed later, the user process will be killed
>>>> so that corrupted data is never consumed. On the other hand, if the
>>>> page is never accessed, the user won't even notice it.
>>>>
>>>> Reported-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>
>>> Hi Miaohe,
>>>
>>> Thank you for the update.  I might miss something, but I still see the same
>>> problem (I checked it on mm-everything-2022-05-19-00-03 + this patchset).
>>
>> I was testing this patch on my 5.10 kernel. I reproduced the problem in my env and
>> fixed it. It seems there might be some critical difference though I checked that by
>> reviewing the code... Sorry. :(
>>
>>>
>>> This patch has the effect to change the return value of shmem_swapin_folio(),
>>> -EIO (without this patch) to -EEXIST (with this patch).
>>
>> In fact, I didn't change the return value from -EIO to -EEXIST:
>>
>> @@ -1762,6 +1799,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>>  failed:
>>  	if (!shmem_confirm_swap(mapping, index, swap))
>>  		error = -EEXIST;
>> +	if (error == -EIO)
>> +		shmem_set_folio_swapin_error(inode, index, folio, swap)
>>
>>> But shmem_unuse_swap_entries() checks neither, so no change from caller's view point.
>>> Maybe breaking in errors (rather than ENOMEM) in for loop in shmem_unuse_swap_entries()
>>> solves the issue?  I briefly checked with the below change, then swapoff can return
>>> with failure.
>>>
>>> @@ -1222,7 +1222,7 @@ static int shmem_unuse_swap_entries(struct inode *inode,
>>>                         folio_put(folio);
>>>                         ret++;
>>>                 }
>>> -               if (error == -ENOMEM)
>>> +               if (error < 0)
>>>                         break;
>>>                 error = 0;
>>>         }
>>
>> Yes, this is the simplest and straightforward way to fix the issue. But it has the side effect
>> that user will end up with a permanently mounted swap just because a sector is bad. That might
>> be somewhat unacceptable?
> 
> Ah, you're right, swapoff should return with success instead of with
> failure.  I tried the fix in your another email, and that makes swapoff
> return with success, so your fix looks better than mine.

I reproduced the deadloop issues when swapin error occurs at swapoff time in my linux-next-next-20220520 env,
and I found this patch could solve the issue now with the fix in my another email.

BTW: When I use dm-dust to inject the swapin IO error, I don't see non-uptodate folio when shmem_swapin_folio
and swapoff succeeds. There might be some issues around that module (so I resort to the another way to inject
the swapin error), but the patch itself works anyway. ;)

> 
> Thanks,

Thanks a lot!

> Naoya Horiguchi
>
Miaohe Lin May 23, 2022, 11:23 a.m. UTC | #6
On 2022/5/23 11:01, Miaohe Lin wrote:
> On 2022/5/23 7:53, HORIGUCHI NAOYA(堀口 直也) wrote:
>> On Fri, May 20, 2022 at 04:17:45PM +0800, Miaohe Lin wrote:
>>> On 2022/5/20 14:34, HORIGUCHI NAOYA(堀口 直也) wrote:
>>>> On Thu, May 19, 2022 at 08:50:29PM +0800, Miaohe Lin wrote:
>>>>> When swap in shmem error at swapoff time, there would be a infinite loop
>>>>> in the while loop in shmem_unuse_inode(). It's because swapin error is
>>>>> deliberately ignored now and thus info->swapped will never reach 0. So
>>>>> we can't escape the loop in shmem_unuse().
>>>>>
>>>>> In order to fix the issue, swapin_error entry is stored in the mapping
>>>>> when swapin error occurs. So the swapcache page can be freed and the
>>>>> user won't end up with a permanently mounted swap because a sector is
>>>>> bad. If the page is accessed later, the user process will be killed
>>>>> so that corrupted data is never consumed. On the other hand, if the
>>>>> page is never accessed, the user won't even notice it.
>>>>>
>>>>> Reported-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>>
>>>> Hi Miaohe,
>>>>
>>>> Thank you for the update.  I might miss something, but I still see the same
>>>> problem (I checked it on mm-everything-2022-05-19-00-03 + this patchset).
>>>
>>> I was testing this patch on my 5.10 kernel. I reproduced the problem in my env and
>>> fixed it. It seems there might be some critical difference though I checked that by
>>> reviewing the code... Sorry. :(
>>>
>>>>
>>>> This patch has the effect to change the return value of shmem_swapin_folio(),
>>>> -EIO (without this patch) to -EEXIST (with this patch).
>>>
>>> In fact, I didn't change the return value from -EIO to -EEXIST:
>>>
>>> @@ -1762,6 +1799,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>>>  failed:
>>>  	if (!shmem_confirm_swap(mapping, index, swap))
>>>  		error = -EEXIST;
>>> +	if (error == -EIO)
>>> +		shmem_set_folio_swapin_error(inode, index, folio, swap)
>>>
>>>> But shmem_unuse_swap_entries() checks neither, so no change from caller's view point.
>>>> Maybe breaking in errors (rather than ENOMEM) in for loop in shmem_unuse_swap_entries()
>>>> solves the issue?  I briefly checked with the below change, then swapoff can return
>>>> with failure.
>>>>
>>>> @@ -1222,7 +1222,7 @@ static int shmem_unuse_swap_entries(struct inode *inode,
>>>>                         folio_put(folio);
>>>>                         ret++;
>>>>                 }
>>>> -               if (error == -ENOMEM)
>>>> +               if (error < 0)
>>>>                         break;
>>>>                 error = 0;
>>>>         }
>>>
>>> Yes, this is the simplest and straightforward way to fix the issue. But it has the side effect
>>> that user will end up with a permanently mounted swap just because a sector is bad. That might
>>> be somewhat unacceptable?
>>
>> Ah, you're right, swapoff should return with success instead of with
>> failure.  I tried the fix in your another email, and that makes swapoff
>> return with success, so your fix looks better than mine.
> 
> I reproduced the deadloop issues when swapin error occurs at swapoff time in my linux-next-next-20220520 env,
> and I found this patch could solve the issue now with the fix in my another email.
> 
> BTW: When I use dm-dust to inject the swapin IO error, I don't see non-uptodate folio when shmem_swapin_folio
> and swapoff succeeds. There might be some issues around that module (so I resort to the another way to inject
> the swapin error), but the patch itself works anyway. ;)

Sorry, the reason I don't see non-uptodate folio when shmem_swapin_folio is that all the shmem pages are still
in the swapcache. They're not read from disk so there is no really IO error. :) When they're indeed freed, the
deadloop issue occurs.

I am thinking about extending the function of MADV_PAGEOUT to free the swapcache page. The page resides in the
swapcache does not save the system memory anyway. And this could help test the swapin behavior. But I'm not
sure whether it's needed.

Thanks! ;)

> 
>>
>> Thanks,
> 
> Thanks a lot!
> 
>> Naoya Horiguchi
>>
>
HORIGUCHI NAOYA(堀口 直也) May 24, 2022, 6:44 a.m. UTC | #7
On Mon, May 23, 2022 at 07:23:53PM +0800, Miaohe Lin wrote:
...
> > 
> > I reproduced the deadloop issues when swapin error occurs at swapoff time in my linux-next-next-20220520 env,
> > and I found this patch could solve the issue now with the fix in my another email.
> > 
> > BTW: When I use dm-dust to inject the swapin IO error, I don't see non-uptodate folio when shmem_swapin_folio
> > and swapoff succeeds. There might be some issues around that module (so I resort to the another way to inject
> > the swapin error), but the patch itself works anyway. ;)
> 
> Sorry, the reason I don't see non-uptodate folio when shmem_swapin_folio is that all the shmem pages are still
> in the swapcache. They're not read from disk so there is no really IO error. :) When they're indeed freed, the
> deadloop issue occurs.
> 
> I am thinking about extending the function of MADV_PAGEOUT to free the swapcache page. The page resides in the
> swapcache does not save the system memory anyway. And this could help test the swapin behavior. But I'm not
> sure whether it's needed.

The extension make MADV_PAGEOUT free swapcaches makes sense to me,
so I'll support it if the original implementer agrees the change.

Thanks,
Naoya Horiguchi
Miaohe Lin May 24, 2022, 10:56 a.m. UTC | #8
On 2022/5/24 14:44, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Mon, May 23, 2022 at 07:23:53PM +0800, Miaohe Lin wrote:
> ...
>>>
>>> I reproduced the deadloop issues when swapin error occurs at swapoff time in my linux-next-next-20220520 env,
>>> and I found this patch could solve the issue now with the fix in my another email.
>>>
>>> BTW: When I use dm-dust to inject the swapin IO error, I don't see non-uptodate folio when shmem_swapin_folio
>>> and swapoff succeeds. There might be some issues around that module (so I resort to the another way to inject
>>> the swapin error), but the patch itself works anyway. ;)
>>
>> Sorry, the reason I don't see non-uptodate folio when shmem_swapin_folio is that all the shmem pages are still
>> in the swapcache. They're not read from disk so there is no really IO error. :) When they're indeed freed, the
>> deadloop issue occurs.
>>
>> I am thinking about extending the function of MADV_PAGEOUT to free the swapcache page. The page resides in the
>> swapcache does not save the system memory anyway. And this could help test the swapin behavior. But I'm not
>> sure whether it's needed.
> 
> The extension make MADV_PAGEOUT free swapcaches makes sense to me,
> so I'll support it if the original implementer agrees the change.

I'd like trying to do it when I have time. :) Thanks!

> 
> Thanks,
> Naoya Horiguchi
>
Andrew Morton May 24, 2022, 10:10 p.m. UTC | #9
On Sat, 21 May 2022 17:34:28 +0800 Miaohe Lin <linmiaohe@huawei.com> wrote:

> On 2022/5/20 14:34, HORIGUCHI NAOYA(堀口 直也) wrote:
> > On Thu, May 19, 2022 at 08:50:29PM +0800, Miaohe Lin wrote:
> >> When swap in shmem error at swapoff time, there would be a infinite loop
> >> in the while loop in shmem_unuse_inode(). It's because swapin error is
> >> deliberately ignored now and thus info->swapped will never reach 0. So
> >> we can't escape the loop in shmem_unuse().
> >>
> >> In order to fix the issue, swapin_error entry is stored in the mapping
> >> when swapin error occurs. So the swapcache page can be freed and the
> >> user won't end up with a permanently mounted swap because a sector is
> >> bad. If the page is accessed later, the user process will be killed
> >> so that corrupted data is never consumed. On the other hand, if the
> >> page is never accessed, the user won't even notice it.
> >>
> >> Reported-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
> >> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> > 
> > Hi Miaohe,
> > 
> > Thank you for the update.  I might miss something, but I still see the same
> > problem (I checked it on mm-everything-2022-05-19-00-03 + this patchset).
> 
> Hi Naoya,
> I reproduce the issue in the linux-next-next-20220520 version. And I found even if
> I *do not inject the swapin error*, the deadloop still occurs. After investigating the
> code for a long while, I found the root cause:
> 
> diff --git a/mm/shmem.c b/mm/shmem.c
> index d55dd972023a..6d23ed4d23cb 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1182,7 +1182,7 @@ static int shmem_find_swap_entries(struct address_space *mapping,
>                 if (swp_type(entry) != type)
>                         continue;
> 
> -               indices[ret] = xas.xa_index;
> +               indices[ret++] = xas.xa_index;
>                 if (!folio_batch_add(fbatch, folio))
>                         break;
> 
> The origin code does not increment the ret value when a folio is found. I will send a patch
> to fix this next week. Thanks! :)

So I'm thinking that with Hugh's "mm/shmem: fix shmem folio swapoff
hang", this patchset is now looking OK, so

mm-swapfile-unuse_pte-can-map-random-data-if-swap-read-fails.patch
mm-swapfile-fix-lost-swap-bits-in-unuse_pte.patch
mm-madvise-free-hwpoison-and-swapin-error-entry-in-madvise_free_pte_range.patch
mm-shmem-fix-infinite-loop-when-swap-in-shmem-error-at-swapoff-time.patch
mm-filter-out-swapin-error-entry-in-shmem-mapping.patch
#

are OK for merging into mainline?
Miaohe Lin May 25, 2022, 1:42 a.m. UTC | #10
On 2022/5/25 6:10, Andrew Morton wrote:
> On Sat, 21 May 2022 17:34:28 +0800 Miaohe Lin <linmiaohe@huawei.com> wrote:
> 
>> On 2022/5/20 14:34, HORIGUCHI NAOYA(堀口 直也) wrote:
>>> On Thu, May 19, 2022 at 08:50:29PM +0800, Miaohe Lin wrote:
>>>> When swap in shmem error at swapoff time, there would be a infinite loop
>>>> in the while loop in shmem_unuse_inode(). It's because swapin error is
>>>> deliberately ignored now and thus info->swapped will never reach 0. So
>>>> we can't escape the loop in shmem_unuse().
>>>>
>>>> In order to fix the issue, swapin_error entry is stored in the mapping
>>>> when swapin error occurs. So the swapcache page can be freed and the
>>>> user won't end up with a permanently mounted swap because a sector is
>>>> bad. If the page is accessed later, the user process will be killed
>>>> so that corrupted data is never consumed. On the other hand, if the
>>>> page is never accessed, the user won't even notice it.
>>>>
>>>> Reported-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>
>>> Hi Miaohe,
>>>
>>> Thank you for the update.  I might miss something, but I still see the same
>>> problem (I checked it on mm-everything-2022-05-19-00-03 + this patchset).
>>
>> Hi Naoya,
>> I reproduce the issue in the linux-next-next-20220520 version. And I found even if
>> I *do not inject the swapin error*, the deadloop still occurs. After investigating the
>> code for a long while, I found the root cause:
>>
>> diff --git a/mm/shmem.c b/mm/shmem.c
>> index d55dd972023a..6d23ed4d23cb 100644
>> --- a/mm/shmem.c
>> +++ b/mm/shmem.c
>> @@ -1182,7 +1182,7 @@ static int shmem_find_swap_entries(struct address_space *mapping,
>>                 if (swp_type(entry) != type)
>>                         continue;
>>
>> -               indices[ret] = xas.xa_index;
>> +               indices[ret++] = xas.xa_index;
>>                 if (!folio_batch_add(fbatch, folio))
>>                         break;
>>
>> The origin code does not increment the ret value when a folio is found. I will send a patch
>> to fix this next week. Thanks! :)
> 
> So I'm thinking that with Hugh's "mm/shmem: fix shmem folio swapoff
> hang", this patchset is now looking OK, so
> 
> mm-swapfile-unuse_pte-can-map-random-data-if-swap-read-fails.patch
> mm-swapfile-fix-lost-swap-bits-in-unuse_pte.patch
> mm-madvise-free-hwpoison-and-swapin-error-entry-in-madvise_free_pte_range.patch
> mm-shmem-fix-infinite-loop-when-swap-in-shmem-error-at-swapoff-time.patch
> mm-filter-out-swapin-error-entry-in-shmem-mapping.patch
> #
> 
> are OK for merging into mainline?

I think so. But it might be better to have Acked-by or Reviewed-by tag for [PATCH v4 4/5] first. :)

Thanks!

> .
>
HORIGUCHI NAOYA(堀口 直也) May 25, 2022, 4:32 a.m. UTC | #11
On Thu, May 19, 2022 at 08:50:29PM +0800, Miaohe Lin wrote:
> When swap in shmem error at swapoff time, there would be a infinite loop
> in the while loop in shmem_unuse_inode(). It's because swapin error is
> deliberately ignored now and thus info->swapped will never reach 0. So
> we can't escape the loop in shmem_unuse().
> 
> In order to fix the issue, swapin_error entry is stored in the mapping
> when swapin error occurs. So the swapcache page can be freed and the
> user won't end up with a permanently mounted swap because a sector is
> bad. If the page is accessed later, the user process will be killed
> so that corrupted data is never consumed. On the other hand, if the
> page is never accessed, the user won't even notice it.
> 
> Reported-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---

...
> @@ -1672,6 +1676,36 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
>  	return error;
>  }
>  
> +static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
> +					 struct folio *folio, swp_entry_t swap)
> +{
> +	struct address_space *mapping = inode->i_mapping;
> +	struct shmem_inode_info *info = SHMEM_I(inode);
> +	swp_entry_t swapin_error;
> +	void *old;
> +
> +	swapin_error = make_swapin_error_entry(&folio->page);
> +	old = xa_cmpxchg_irq(&mapping->i_pages, index,
> +			     swp_to_radix_entry(swap),
> +			     swp_to_radix_entry(swapin_error), 0);
> +	if (old != swp_to_radix_entry(swap))
> +		return;
> +
> +	folio_wait_writeback(folio);
> +	delete_from_swap_cache(&folio->page);
> +	spin_lock_irq(&info->lock);
> +	/*
> +	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
> +	 * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
> +	 * shmem_evict_inode.
> +	 */
> +	info->alloced--;
> +	info->swapped--;

I'm not familiar with folio yet and might miss some basic thing,
but is it OK to decrement by one instead of folio_nr_pages()?

Thanks,
Naoya Horiguchi
Miaohe Lin May 25, 2022, 6:40 a.m. UTC | #12
On 2022/5/25 12:32, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Thu, May 19, 2022 at 08:50:29PM +0800, Miaohe Lin wrote:
>> When swap in shmem error at swapoff time, there would be a infinite loop
>> in the while loop in shmem_unuse_inode(). It's because swapin error is
>> deliberately ignored now and thus info->swapped will never reach 0. So
>> we can't escape the loop in shmem_unuse().
>>
>> In order to fix the issue, swapin_error entry is stored in the mapping
>> when swapin error occurs. So the swapcache page can be freed and the
>> user won't end up with a permanently mounted swap because a sector is
>> bad. If the page is accessed later, the user process will be killed
>> so that corrupted data is never consumed. On the other hand, if the
>> page is never accessed, the user won't even notice it.
>>
>> Reported-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> ---
> 
> ...
>> @@ -1672,6 +1676,36 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
>>  	return error;
>>  }
>>  
>> +static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
>> +					 struct folio *folio, swp_entry_t swap)
>> +{
>> +	struct address_space *mapping = inode->i_mapping;
>> +	struct shmem_inode_info *info = SHMEM_I(inode);
>> +	swp_entry_t swapin_error;
>> +	void *old;
>> +
>> +	swapin_error = make_swapin_error_entry(&folio->page);
>> +	old = xa_cmpxchg_irq(&mapping->i_pages, index,
>> +			     swp_to_radix_entry(swap),
>> +			     swp_to_radix_entry(swapin_error), 0);
>> +	if (old != swp_to_radix_entry(swap))
>> +		return;
>> +
>> +	folio_wait_writeback(folio);
>> +	delete_from_swap_cache(&folio->page);
>> +	spin_lock_irq(&info->lock);
>> +	/*
>> +	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
>> +	 * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
>> +	 * shmem_evict_inode.
>> +	 */
>> +	info->alloced--;
>> +	info->swapped--;
> 
> I'm not familiar with folio yet and might miss some basic thing,
> but is it OK to decrement by one instead of folio_nr_pages()?

info->swapped is also decremented by one in shmem_swapin_folio(). In fact, no huge page
swapin is supported yet (this is also true for non-shmem case). So I think info->swapped--
should be OK. Or am I miss something?

> 
> Thanks,
> Naoya Horiguchi

Many thanks for review and comment! It's really helpful! :)

>
HORIGUCHI NAOYA(堀口 直也) May 26, 2022, 6:08 a.m. UTC | #13
On Wed, May 25, 2022 at 02:40:40PM +0800, Miaohe Lin wrote:
> On 2022/5/25 12:32, HORIGUCHI NAOYA(堀口 直也) wrote:
> > On Thu, May 19, 2022 at 08:50:29PM +0800, Miaohe Lin wrote:
> >> When swap in shmem error at swapoff time, there would be a infinite loop
> >> in the while loop in shmem_unuse_inode(). It's because swapin error is
> >> deliberately ignored now and thus info->swapped will never reach 0. So
> >> we can't escape the loop in shmem_unuse().
> >>
> >> In order to fix the issue, swapin_error entry is stored in the mapping
> >> when swapin error occurs. So the swapcache page can be freed and the
> >> user won't end up with a permanently mounted swap because a sector is
> >> bad. If the page is accessed later, the user process will be killed
> >> so that corrupted data is never consumed. On the other hand, if the
> >> page is never accessed, the user won't even notice it.
> >>
> >> Reported-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
> >> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> >> ---
> > 
> > ...
> >> @@ -1672,6 +1676,36 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
> >>  	return error;
> >>  }
> >>  
> >> +static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
> >> +					 struct folio *folio, swp_entry_t swap)
> >> +{
> >> +	struct address_space *mapping = inode->i_mapping;
> >> +	struct shmem_inode_info *info = SHMEM_I(inode);
> >> +	swp_entry_t swapin_error;
> >> +	void *old;
> >> +
> >> +	swapin_error = make_swapin_error_entry(&folio->page);
> >> +	old = xa_cmpxchg_irq(&mapping->i_pages, index,
> >> +			     swp_to_radix_entry(swap),
> >> +			     swp_to_radix_entry(swapin_error), 0);
> >> +	if (old != swp_to_radix_entry(swap))
> >> +		return;
> >> +
> >> +	folio_wait_writeback(folio);
> >> +	delete_from_swap_cache(&folio->page);
> >> +	spin_lock_irq(&info->lock);
> >> +	/*
> >> +	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
> >> +	 * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
> >> +	 * shmem_evict_inode.
> >> +	 */
> >> +	info->alloced--;
> >> +	info->swapped--;
> > 
> > I'm not familiar with folio yet and might miss some basic thing,
> > but is it OK to decrement by one instead of folio_nr_pages()?
> 
> info->swapped is also decremented by one in shmem_swapin_folio(). In fact, no huge page
> swapin is supported yet (this is also true for non-shmem case). So I think info->swapped--
> should be OK. Or am I miss something?

OK, thanks for clarification.

Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
diff mbox series

Patch

diff --git a/mm/shmem.c b/mm/shmem.c
index d3c7970e0179..d55dd972023a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1175,6 +1175,10 @@  static int shmem_find_swap_entries(struct address_space *mapping,
 			continue;
 
 		entry = radix_to_swp_entry(folio);
+		/*
+		 * swapin error entries can be found in the mapping. But they're
+		 * deliberately ignored here as we've done everything we can do.
+		 */
 		if (swp_type(entry) != type)
 			continue;
 
@@ -1672,6 +1676,36 @@  static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	return error;
 }
 
+static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
+					 struct folio *folio, swp_entry_t swap)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	swp_entry_t swapin_error;
+	void *old;
+
+	swapin_error = make_swapin_error_entry(&folio->page);
+	old = xa_cmpxchg_irq(&mapping->i_pages, index,
+			     swp_to_radix_entry(swap),
+			     swp_to_radix_entry(swapin_error), 0);
+	if (old != swp_to_radix_entry(swap))
+		return;
+
+	folio_wait_writeback(folio);
+	delete_from_swap_cache(&folio->page);
+	spin_lock_irq(&info->lock);
+	/*
+	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
+	 * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
+	 * shmem_evict_inode.
+	 */
+	info->alloced--;
+	info->swapped--;
+	shmem_recalc_inode(inode);
+	spin_unlock_irq(&info->lock);
+	swap_free(swap);
+}
+
 /*
  * Swap in the page pointed to by *pagep.
  * Caller has to make sure that *pagep contains a valid swapped page.
@@ -1695,6 +1729,9 @@  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	swap = radix_to_swp_entry(*foliop);
 	*foliop = NULL;
 
+	if (is_swapin_error_entry(swap))
+		return -EIO;
+
 	/* Look it up and read it in.. */
 	page = lookup_swap_cache(swap, NULL, 0);
 	if (!page) {
@@ -1762,6 +1799,8 @@  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 failed:
 	if (!shmem_confirm_swap(mapping, index, swap))
 		error = -EEXIST;
+	if (error == -EIO)
+		shmem_set_folio_swapin_error(inode, index, folio, swap);
 unlock:
 	if (folio) {
 		folio_unlock(folio);