diff mbox series

[v2] mm/swapfile: unuse_pte can map random data if swap read fails

Message ID 20220416030549.60559-1-linmiaohe@huawei.com (mailing list archive)
State New
Headers show
Series [v2] mm/swapfile: unuse_pte can map random data if swap read fails | expand

Commit Message

Miaohe Lin April 16, 2022, 3:05 a.m. UTC
There is a bug in unuse_pte(): when swap page happens to be unreadable,
page filled with random data is mapped into user address space. In case
of error, a special swap entry indicating swap read fails is set to the
page table. So the swapcache page can be freed and the user won't end up
with a permanently mounted swap because a sector is bad. And if the page
is accessed later, the user process will be killed so that corrupted data
is never consumed. On the other hand, if the page is never accessed, the
user won't even notice it.

Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
v2:
  use special swap entry to avoid permanently mounted swap
  free the bad page in swapcache
---
 include/linux/swap.h    |  7 ++++++-
 include/linux/swapops.h | 10 ++++++++++
 mm/memory.c             |  5 ++++-
 mm/swapfile.c           | 11 +++++++++++
 4 files changed, 31 insertions(+), 2 deletions(-)

Comments

Alistair Popple April 19, 2022, 3:51 a.m. UTC | #1
Miaohe Lin <linmiaohe@huawei.com> writes:

> There is a bug in unuse_pte(): when swap page happens to be unreadable,
> page filled with random data is mapped into user address space. In case
> of error, a special swap entry indicating swap read fails is set to the
> page table. So the swapcache page can be freed and the user won't end up
> with a permanently mounted swap because a sector is bad. And if the page
> is accessed later, the user process will be killed so that corrupted data
> is never consumed. On the other hand, if the page is never accessed, the
> user won't even notice it.

Hi Miaohe,

It seems we're not actually using the pfn that gets stored in the special swap
entry here. Is my understanding correct? If so I think it would be better to use
the new PTE markers Peter introduced[1] rather than adding another swap entry
type.

[1] - <https://lore.kernel.org/linux-mm/20220405014833.14015-1-peterx@redhat.com/>

> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
> v2:
>   use special swap entry to avoid permanently mounted swap
>   free the bad page in swapcache
> ---
>  include/linux/swap.h    |  7 ++++++-
>  include/linux/swapops.h | 10 ++++++++++
>  mm/memory.c             |  5 ++++-
>  mm/swapfile.c           | 11 +++++++++++
>  4 files changed, 31 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index d112434f85df..03c576111737 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -55,6 +55,10 @@ static inline int current_is_kswapd(void)
>   * actions on faults.
>   */
>
> +#define SWAP_READ_ERROR_NUM 1
> +#define SWAP_READ_ERROR     (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
> +			     SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \
> +			     SWP_PTE_MARKER_NUM)
>  /*
>   * PTE markers are used to persist information onto PTEs that are mapped with
>   * file-backed memories.  As its name "PTE" hints, it should only be applied to
> @@ -120,7 +124,8 @@ static inline int current_is_kswapd(void)
>
>  #define MAX_SWAPFILES \
>  	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
> -	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_PTE_MARKER_NUM)
> +	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \
> +	SWP_PTE_MARKER_NUM - SWAP_READ_ERROR_NUM)
>
>  /*
>   * Magic header for a swap area. The first part of the union is
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index fffbba0036f6..d1093384de9f 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -108,6 +108,16 @@ static inline void *swp_to_radix_entry(swp_entry_t entry)
>  	return xa_mk_value(entry.val);
>  }
>
> +static inline swp_entry_t make_swapin_error_entry(struct page *page)
> +{
> +	return swp_entry(SWAP_READ_ERROR, page_to_pfn(page));
> +}
> +
> +static inline int is_swapin_error_entry(swp_entry_t entry)
> +{
> +	return swp_type(entry) == SWAP_READ_ERROR;
> +}
> +
>  #if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
>  static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
>  {
> diff --git a/mm/memory.c b/mm/memory.c
> index e6434b824009..34d1d66a05bd 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1476,7 +1476,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>  			/* Only drop the uffd-wp marker if explicitly requested */
>  			if (!zap_drop_file_uffd_wp(details))
>  				continue;
> -		} else if (is_hwpoison_entry(entry)) {
> +		} else if (is_hwpoison_entry(entry) ||
> +			   is_swapin_error_entry(entry)) {
>  			if (!should_zap_cows(details))
>  				continue;
>  		} else {
> @@ -3724,6 +3725,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
>  		} else if (is_hwpoison_entry(entry)) {
>  			ret = VM_FAULT_HWPOISON;
> +		} else if (is_swapin_error_entry(entry)) {
> +			ret = VM_FAULT_SIGBUS;
>  		} else if (is_pte_marker_entry(entry)) {
>  			ret = handle_pte_marker(vmf);
>  		} else {
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 9398e915b36b..95b63f69f388 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1797,6 +1797,17 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
>  		goto out;
>  	}
>
> +	if (unlikely(!PageUptodate(page))) {
> +		pte_t pteval;
> +
> +		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
> +		pteval = swp_entry_to_pte(make_swapin_error_entry(page));
> +		set_pte_at(vma->vm_mm, addr, pte, pteval);
> +		swap_free(entry);
> +		ret = 0;
> +		goto out;
> +	}
> +
>  	/* See do_swap_page() */
>  	BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
>  	BUG_ON(PageAnon(page) && PageAnonExclusive(page));
Miaohe Lin April 19, 2022, 7:29 a.m. UTC | #2
On 2022/4/19 11:51, Alistair Popple wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> There is a bug in unuse_pte(): when swap page happens to be unreadable,
>> page filled with random data is mapped into user address space. In case
>> of error, a special swap entry indicating swap read fails is set to the
>> page table. So the swapcache page can be freed and the user won't end up
>> with a permanently mounted swap because a sector is bad. And if the page
>> is accessed later, the user process will be killed so that corrupted data
>> is never consumed. On the other hand, if the page is never accessed, the
>> user won't even notice it.
> 
> Hi Miaohe,
> > It seems we're not actually using the pfn that gets stored in the special swap
> entry here. Is my understanding correct? If so I think it would be better to use

Yes, you're right. The pfn is not used now. What we need here is a special swap entry
to do the right things. I think we can change to store some debugging information instead
of pfn if needed in the future.

> the new PTE markers Peter introduced[1] rather than adding another swap entry
> type.

IIUC, we should not reuse that swap entry here. From definition:

PTE markers
===========
...
PTE marker is a new type of swap entry that is ony applicable to file
backed memories like shmem and hugetlbfs.  It's used to persist some
pte-level information even if the original present ptes in pgtable are
zapped.

It's designed for file backed memories while swapin error entry is for anonymous
memories. And there has some differences in processing. So it's not a good idea
to reuse pte markers. Or am I miss something?

> 
> [1] - <https://lore.kernel.org/linux-mm/20220405014833.14015-1-peterx@redhat.com/>

Many thanks for your comment and suggestion! :)

> 
...
David Hildenbrand April 19, 2022, 7:37 a.m. UTC | #3
On 16.04.22 05:05, Miaohe Lin wrote:
> There is a bug in unuse_pte(): when swap page happens to be unreadable,
> page filled with random data is mapped into user address space. In case
> of error, a special swap entry indicating swap read fails is set to the
> page table. So the swapcache page can be freed and the user won't end up
> with a permanently mounted swap because a sector is bad. And if the page
> is accessed later, the user process will be killed so that corrupted data
> is never consumed. On the other hand, if the page is never accessed, the
> user won't even notice it.
> 
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
> v2:
>   use special swap entry to avoid permanently mounted swap
>   free the bad page in swapcache
> ---
>  include/linux/swap.h    |  7 ++++++-
>  include/linux/swapops.h | 10 ++++++++++
>  mm/memory.c             |  5 ++++-
>  mm/swapfile.c           | 11 +++++++++++
>  4 files changed, 31 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index d112434f85df..03c576111737 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -55,6 +55,10 @@ static inline int current_is_kswapd(void)
>   * actions on faults.
>   */
>  
> +#define SWAP_READ_ERROR_NUM 1
> +#define SWAP_READ_ERROR     (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
> +			     SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \
> +			     SWP_PTE_MARKER_NUM)

Does anything speak against reusing the hwpoison marker? At least from a
program POV it's similar "the previously well defined content at this
user space address is no longer readable/writable".

I recall that we can just set the pfn to 0 for the hwpoison marker.

There is e.g., check_hwpoisoned_entry() and it just stops if it finds
"pfn=0".
David Hildenbrand April 19, 2022, 7:39 a.m. UTC | #4
On 19.04.22 09:29, Miaohe Lin wrote:
> On 2022/4/19 11:51, Alistair Popple wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>
>>> There is a bug in unuse_pte(): when swap page happens to be unreadable,
>>> page filled with random data is mapped into user address space. In case
>>> of error, a special swap entry indicating swap read fails is set to the
>>> page table. So the swapcache page can be freed and the user won't end up
>>> with a permanently mounted swap because a sector is bad. And if the page
>>> is accessed later, the user process will be killed so that corrupted data
>>> is never consumed. On the other hand, if the page is never accessed, the
>>> user won't even notice it.
>>
>> Hi Miaohe,
>>> It seems we're not actually using the pfn that gets stored in the special swap
>> entry here. Is my understanding correct? If so I think it would be better to use
> 
> Yes, you're right. The pfn is not used now. What we need here is a special swap entry
> to do the right things. I think we can change to store some debugging information instead
> of pfn if needed in the future.
> 
>> the new PTE markers Peter introduced[1] rather than adding another swap entry
>> type.
> 
> IIUC, we should not reuse that swap entry here. From definition:
> 
> PTE markers
> ===========
> ...
> PTE marker is a new type of swap entry that is ony applicable to file
> backed memories like shmem and hugetlbfs.  It's used to persist some
> pte-level information even if the original present ptes in pgtable are
> zapped.
> 
> It's designed for file backed memories while swapin error entry is for anonymous
> memories. And there has some differences in processing. So it's not a good idea
> to reuse pte markers. Or am I miss something?

I tend to agree. As raised in my other reply, maybe we can simply reuse
hwpoison entries and update the documentation of them accordingly.
Alistair Popple April 19, 2022, 7:53 a.m. UTC | #5
Also in madvise_free_pte_range() you could just remove the swap entry as it's no
longer needed.

Alistair Popple <apopple@nvidia.com> writes:

> Miaohe Lin <linmiaohe@huawei.com> writes:
>
>> There is a bug in unuse_pte(): when swap page happens to be unreadable,
>> page filled with random data is mapped into user address space. In case
>> of error, a special swap entry indicating swap read fails is set to the
>> page table. So the swapcache page can be freed and the user won't end up
>> with a permanently mounted swap because a sector is bad. And if the page
>> is accessed later, the user process will be killed so that corrupted data
>> is never consumed. On the other hand, if the page is never accessed, the
>> user won't even notice it.
>
> Hi Miaohe,
>
> It seems we're not actually using the pfn that gets stored in the special swap
> entry here. Is my understanding correct? If so I think it would be better to use
> the new PTE markers Peter introduced[1] rather than adding another swap entry
> type.
>
> [1] - <https://lore.kernel.org/linux-mm/20220405014833.14015-1-peterx@redhat.com/>
>
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> ---
>> v2:
>>   use special swap entry to avoid permanently mounted swap
>>   free the bad page in swapcache
>> ---
>>  include/linux/swap.h    |  7 ++++++-
>>  include/linux/swapops.h | 10 ++++++++++
>>  mm/memory.c             |  5 ++++-
>>  mm/swapfile.c           | 11 +++++++++++
>>  4 files changed, 31 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index d112434f85df..03c576111737 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -55,6 +55,10 @@ static inline int current_is_kswapd(void)
>>   * actions on faults.
>>   */
>>
>> +#define SWAP_READ_ERROR_NUM 1
>> +#define SWAP_READ_ERROR     (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
>> +			     SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \
>> +			     SWP_PTE_MARKER_NUM)
>>  /*
>>   * PTE markers are used to persist information onto PTEs that are mapped with
>>   * file-backed memories.  As its name "PTE" hints, it should only be applied to
>> @@ -120,7 +124,8 @@ static inline int current_is_kswapd(void)
>>
>>  #define MAX_SWAPFILES \
>>  	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
>> -	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_PTE_MARKER_NUM)
>> +	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \
>> +	SWP_PTE_MARKER_NUM - SWAP_READ_ERROR_NUM)
>>
>>  /*
>>   * Magic header for a swap area. The first part of the union is
>> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
>> index fffbba0036f6..d1093384de9f 100644
>> --- a/include/linux/swapops.h
>> +++ b/include/linux/swapops.h
>> @@ -108,6 +108,16 @@ static inline void *swp_to_radix_entry(swp_entry_t entry)
>>  	return xa_mk_value(entry.val);
>>  }
>>
>> +static inline swp_entry_t make_swapin_error_entry(struct page *page)
>> +{
>> +	return swp_entry(SWAP_READ_ERROR, page_to_pfn(page));
>> +}
>> +
>> +static inline int is_swapin_error_entry(swp_entry_t entry)
>> +{
>> +	return swp_type(entry) == SWAP_READ_ERROR;
>> +}
>> +
>>  #if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
>>  static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
>>  {
>> diff --git a/mm/memory.c b/mm/memory.c
>> index e6434b824009..34d1d66a05bd 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -1476,7 +1476,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>>  			/* Only drop the uffd-wp marker if explicitly requested */
>>  			if (!zap_drop_file_uffd_wp(details))
>>  				continue;
>> -		} else if (is_hwpoison_entry(entry)) {
>> +		} else if (is_hwpoison_entry(entry) ||
>> +			   is_swapin_error_entry(entry)) {
>>  			if (!should_zap_cows(details))
>>  				continue;
>>  		} else {
>> @@ -3724,6 +3725,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
>>  		} else if (is_hwpoison_entry(entry)) {
>>  			ret = VM_FAULT_HWPOISON;
>> +		} else if (is_swapin_error_entry(entry)) {
>> +			ret = VM_FAULT_SIGBUS;
>>  		} else if (is_pte_marker_entry(entry)) {
>>  			ret = handle_pte_marker(vmf);
>>  		} else {
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 9398e915b36b..95b63f69f388 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -1797,6 +1797,17 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
>>  		goto out;
>>  	}
>>
>> +	if (unlikely(!PageUptodate(page))) {
>> +		pte_t pteval;
>> +
>> +		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
>> +		pteval = swp_entry_to_pte(make_swapin_error_entry(page));
>> +		set_pte_at(vma->vm_mm, addr, pte, pteval);
>> +		swap_free(entry);
>> +		ret = 0;
>> +		goto out;
>> +	}
>> +
>>  	/* See do_swap_page() */
>>  	BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
>>  	BUG_ON(PageAnon(page) && PageAnonExclusive(page));
Alistair Popple April 19, 2022, 8:08 a.m. UTC | #6
David Hildenbrand <david@redhat.com> writes:

> On 19.04.22 09:29, Miaohe Lin wrote:
>> On 2022/4/19 11:51, Alistair Popple wrote:
>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>
>>>> There is a bug in unuse_pte(): when swap page happens to be unreadable,
>>>> page filled with random data is mapped into user address space. In case
>>>> of error, a special swap entry indicating swap read fails is set to the
>>>> page table. So the swapcache page can be freed and the user won't end up
>>>> with a permanently mounted swap because a sector is bad. And if the page
>>>> is accessed later, the user process will be killed so that corrupted data
>>>> is never consumed. On the other hand, if the page is never accessed, the
>>>> user won't even notice it.
>>>
>>> Hi Miaohe,
>>>> It seems we're not actually using the pfn that gets stored in the special swap
>>> entry here. Is my understanding correct? If so I think it would be better to use
>>
>> Yes, you're right. The pfn is not used now. What we need here is a special swap entry
>> to do the right things. I think we can change to store some debugging information instead
>> of pfn if needed in the future.
>>
>>> the new PTE markers Peter introduced[1] rather than adding another swap entry
>>> type.
>>
>> IIUC, we should not reuse that swap entry here. From definition:
>>
>> PTE markers
>> `========='
>> ...
>> PTE marker is a new type of swap entry that is ony applicable to file
>> backed memories like shmem and hugetlbfs.  It's used to persist some
>> pte-level information even if the original present ptes in pgtable are
>> zapped.
>>
>> It's designed for file backed memories while swapin error entry is for anonymous
>> memories. And there has some differences in processing. So it's not a good idea
>> to reuse pte markers. Or am I miss something?
>
> I tend to agree. As raised in my other reply, maybe we can simply reuse
> hwpoison entries and update the documentation of them accordingly.

Unless I've missed something I don't think PTE markers should be restricted
solely to file backed memory. It's true that the only user of them at the moment
is UFFD-WP for file backed memory, but PTE markers are just a special swap entry
same as what is added here.

That said I don't think there has been any attempt to make PTE markers work for
anything other than UFFD-WP because it was unclear if there ever would be
another user.

But I agree re-using hwpoison entries is probably a better fit if possible.

- Alistair
David Hildenbrand April 19, 2022, 11:14 a.m. UTC | #7
On 19.04.22 10:08, Alistair Popple wrote:
> David Hildenbrand <david@redhat.com> writes:
> 
>> On 19.04.22 09:29, Miaohe Lin wrote:
>>> On 2022/4/19 11:51, Alistair Popple wrote:
>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>
>>>>> There is a bug in unuse_pte(): when swap page happens to be unreadable,
>>>>> page filled with random data is mapped into user address space. In case
>>>>> of error, a special swap entry indicating swap read fails is set to the
>>>>> page table. So the swapcache page can be freed and the user won't end up
>>>>> with a permanently mounted swap because a sector is bad. And if the page
>>>>> is accessed later, the user process will be killed so that corrupted data
>>>>> is never consumed. On the other hand, if the page is never accessed, the
>>>>> user won't even notice it.
>>>>
>>>> Hi Miaohe,
>>>>> It seems we're not actually using the pfn that gets stored in the special swap
>>>> entry here. Is my understanding correct? If so I think it would be better to use
>>>
>>> Yes, you're right. The pfn is not used now. What we need here is a special swap entry
>>> to do the right things. I think we can change to store some debugging information instead
>>> of pfn if needed in the future.
>>>
>>>> the new PTE markers Peter introduced[1] rather than adding another swap entry
>>>> type.
>>>
>>> IIUC, we should not reuse that swap entry here. From definition:
>>>
>>> PTE markers
>>> `========='
>>> ...
>>> PTE marker is a new type of swap entry that is ony applicable to file
>>> backed memories like shmem and hugetlbfs.  It's used to persist some
>>> pte-level information even if the original present ptes in pgtable are
>>> zapped.
>>>
>>> It's designed for file backed memories while swapin error entry is for anonymous
>>> memories. And there has some differences in processing. So it's not a good idea
>>> to reuse pte markers. Or am I miss something?
>>
>> I tend to agree. As raised in my other reply, maybe we can simply reuse
>> hwpoison entries and update the documentation of them accordingly.
> 
> Unless I've missed something I don't think PTE markers should be restricted
> solely to file backed memory. It's true that the only user of them at the moment
> is UFFD-WP for file backed memory, but PTE markers are just a special swap entry
> same as what is added here.

There is a difference.

What we want here is "there used to be something mapped but it's not
readable anymore. Please fail hard when userspace tries accessing
this.". Just like with hwpoison entries.

What a pte marker expresses is that "here is nothing mapped right now
but we have additional metadata available here. For file-backed memory,
it translates to: If we ever touch this page, lookup the pagecache what
to map here."

In the anonymous memory world, this would map to "populate the zeropage
or a fresh anonymous page on access." and keep the metadata around.

Yes, one might argue that for uffd-wp on anonymous memory it might make
sense to use pte marker as well, when no page has been populated yet.

IIRC, trying to arm uffd-wp when there is nothing populated yet will
simply get ignored.

> 
> That said I don't think there has been any attempt to make PTE markers work for
> anything other than UFFD-WP because it was unclear if there ever would be
> another user.

We discussed using it for softdirty tracking as well.
Miaohe Lin April 19, 2022, 11:14 a.m. UTC | #8
On 2022/4/19 16:08, Alistair Popple wrote:
> David Hildenbrand <david@redhat.com> writes:
> 
>> On 19.04.22 09:29, Miaohe Lin wrote:
>>> On 2022/4/19 11:51, Alistair Popple wrote:
>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>
>>>>> There is a bug in unuse_pte(): when swap page happens to be unreadable,
>>>>> page filled with random data is mapped into user address space. In case
>>>>> of error, a special swap entry indicating swap read fails is set to the
>>>>> page table. So the swapcache page can be freed and the user won't end up
>>>>> with a permanently mounted swap because a sector is bad. And if the page
>>>>> is accessed later, the user process will be killed so that corrupted data
>>>>> is never consumed. On the other hand, if the page is never accessed, the
>>>>> user won't even notice it.
>>>>
>>>> Hi Miaohe,
>>>>> It seems we're not actually using the pfn that gets stored in the special swap
>>>> entry here. Is my understanding correct? If so I think it would be better to use
>>>
>>> Yes, you're right. The pfn is not used now. What we need here is a special swap entry
>>> to do the right things. I think we can change to store some debugging information instead
>>> of pfn if needed in the future.
>>>
>>>> the new PTE markers Peter introduced[1] rather than adding another swap entry
>>>> type.
>>>
>>> IIUC, we should not reuse that swap entry here. From definition:
>>>
>>> PTE markers
>>> `========='
>>> ...
>>> PTE marker is a new type of swap entry that is ony applicable to file
>>> backed memories like shmem and hugetlbfs.  It's used to persist some
>>> pte-level information even if the original present ptes in pgtable are
>>> zapped.
>>>
>>> It's designed for file backed memories while swapin error entry is for anonymous
>>> memories. And there has some differences in processing. So it's not a good idea
>>> to reuse pte markers. Or am I miss something?
>>
>> I tend to agree. As raised in my other reply, maybe we can simply reuse
>> hwpoison entries and update the documentation of them accordingly.
> 
> Unless I've missed something I don't think PTE markers should be restricted
> solely to file backed memory. It's true that the only user of them at the moment
> is UFFD-WP for file backed memory, but PTE markers are just a special swap entry
> same as what is added here.
> 
> That said I don't think there has been any attempt to make PTE markers work for
> anything other than UFFD-WP because it was unclear if there ever would be
> another user.

If PTE markers can also handle the swapin error case, I will try to use it if
we can't reuse hwpoison entries.

> 
> But I agree re-using hwpoison entries is probably a better fit if possible.

Agree. As David said, "At least from a program POV it's similar "the previously well
defined content at this user space address is no longer readable/writable"."

> 
> - Alistair

Many thanks!

>
Miaohe Lin April 19, 2022, 11:21 a.m. UTC | #9
On 2022/4/19 15:37, David Hildenbrand wrote:
> On 16.04.22 05:05, Miaohe Lin wrote:
>> There is a bug in unuse_pte(): when swap page happens to be unreadable,
>> page filled with random data is mapped into user address space. In case
>> of error, a special swap entry indicating swap read fails is set to the
>> page table. So the swapcache page can be freed and the user won't end up
>> with a permanently mounted swap because a sector is bad. And if the page
>> is accessed later, the user process will be killed so that corrupted data
>> is never consumed. On the other hand, if the page is never accessed, the
>> user won't even notice it.
>>
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> ---
>> v2:
>>   use special swap entry to avoid permanently mounted swap
>>   free the bad page in swapcache
>> ---
>>  include/linux/swap.h    |  7 ++++++-
>>  include/linux/swapops.h | 10 ++++++++++
>>  mm/memory.c             |  5 ++++-
>>  mm/swapfile.c           | 11 +++++++++++
>>  4 files changed, 31 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index d112434f85df..03c576111737 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -55,6 +55,10 @@ static inline int current_is_kswapd(void)
>>   * actions on faults.
>>   */
>>  
>> +#define SWAP_READ_ERROR_NUM 1
>> +#define SWAP_READ_ERROR     (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
>> +			     SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \
>> +			     SWP_PTE_MARKER_NUM)
> 
> Does anything speak against reusing the hwpoison marker? At least from a
> program POV it's similar "the previously well defined content at this
> user space address is no longer readable/writable".

Looks like a good idea. :)

> 
> I recall that we can just set the pfn to 0 for the hwpoison marker.
> 
> There is e.g., check_hwpoisoned_entry() and it just stops if it finds
> "pfn=0".

Do you mean that we should set the pfn to 0 for the hwpoison marker so that we can
distinguish swapin error case from real hwpoison case?

> 
> 

Will try to do this in next version. Thanks a lot!
Miaohe Lin April 19, 2022, 11:26 a.m. UTC | #10
On 2022/4/19 15:53, Alistair Popple wrote:
> Also in madvise_free_pte_range() you could just remove the swap entry as it's no
> longer needed.
> 

This swap entry will be removed in madvise_dontneed_single_vma().
And in madvise_free_pte_range(), we may need to keep it as same as
hwpoison entry. Or am I supposed to remove it even if hwpoison entry
is reused later?

Thanks!

> Alistair Popple <apopple@nvidia.com> writes:
> 
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>
...
David Hildenbrand April 19, 2022, 11:46 a.m. UTC | #11
On 19.04.22 13:21, Miaohe Lin wrote:
> On 2022/4/19 15:37, David Hildenbrand wrote:
>> On 16.04.22 05:05, Miaohe Lin wrote:
>>> There is a bug in unuse_pte(): when swap page happens to be unreadable,
>>> page filled with random data is mapped into user address space. In case
>>> of error, a special swap entry indicating swap read fails is set to the
>>> page table. So the swapcache page can be freed and the user won't end up
>>> with a permanently mounted swap because a sector is bad. And if the page
>>> is accessed later, the user process will be killed so that corrupted data
>>> is never consumed. On the other hand, if the page is never accessed, the
>>> user won't even notice it.
>>>
>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>> ---
>>> v2:
>>>   use special swap entry to avoid permanently mounted swap
>>>   free the bad page in swapcache
>>> ---
>>>  include/linux/swap.h    |  7 ++++++-
>>>  include/linux/swapops.h | 10 ++++++++++
>>>  mm/memory.c             |  5 ++++-
>>>  mm/swapfile.c           | 11 +++++++++++
>>>  4 files changed, 31 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>> index d112434f85df..03c576111737 100644
>>> --- a/include/linux/swap.h
>>> +++ b/include/linux/swap.h
>>> @@ -55,6 +55,10 @@ static inline int current_is_kswapd(void)
>>>   * actions on faults.
>>>   */
>>>  
>>> +#define SWAP_READ_ERROR_NUM 1
>>> +#define SWAP_READ_ERROR     (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
>>> +			     SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \
>>> +			     SWP_PTE_MARKER_NUM)
>>
>> Does anything speak against reusing the hwpoison marker? At least from a
>> program POV it's similar "the previously well defined content at this
>> user space address is no longer readable/writable".
> 
> Looks like a good idea. :)
> 
>>
>> I recall that we can just set the pfn to 0 for the hwpoison marker.
>>
>> There is e.g., check_hwpoisoned_entry() and it just stops if it finds
>> "pfn=0".
> 
> Do you mean that we should set the pfn to 0 for the hwpoison marker so that we can
> distinguish swapin error case from real hwpoison case?

I am not sure if we really have to distinguish. However, "0" seems to
make sense to indicate "this is not an actual problematic PFN, the
information is simply no longer around due to a hardware issue.
Miaohe Lin April 19, 2022, noon UTC | #12
On 2022/4/19 19:46, David Hildenbrand wrote:
...
>> Do you mean that we should set the pfn to 0 for the hwpoison marker so that we can
>> distinguish swapin error case from real hwpoison case?
> 
> I am not sure if we really have to distinguish. However, "0" seems to
> make sense to indicate "this is not an actual problematic PFN, the
> information is simply no longer around due to a hardware issue.
> 

IMHO, we have to distinguish. For example, we might need to return VM_FAULT_SIGBUS
instead of VM_FAULT_HWPOISON when user accesses the error page. Or should we simply
return VM_FAULT_HWPOISON to simplify the handling?

Thanks!
David Hildenbrand April 19, 2022, 12:12 p.m. UTC | #13
On 19.04.22 14:00, Miaohe Lin wrote:
> On 2022/4/19 19:46, David Hildenbrand wrote:
> ...
>>> Do you mean that we should set the pfn to 0 for the hwpoison marker so that we can
>>> distinguish swapin error case from real hwpoison case?
>>
>> I am not sure if we really have to distinguish. However, "0" seems to
>> make sense to indicate "this is not an actual problematic PFN, the
>> information is simply no longer around due to a hardware issue.
>>
> 
> IMHO, we have to distinguish. For example, we might need to return VM_FAULT_SIGBUS
> instead of VM_FAULT_HWPOISON when user accesses the error page. Or should we simply
> return VM_FAULT_HWPOISON to simplify the handling?

Hm, you're right. In e.g., x86 do_sigbus() we would send an BUS_MCEERR_AR.

So yes, if we reuse is_hwpoison_entry() we'd have to convert to either
VM_FAULT_HWPOISON or VM_FAULT_SIGBUS.

Something like "is_error_entry()" that can further be refined to
hwpoison or swapin could make sense. But what you have here is straight
forward to me as well. Whatever you/others prefer.

Acked-by: David Hildenbrand <david@redhat.com>


NIT: I'd make the terminology make_swapin_error_entry() consistent with
SWAP_READ_ERROR and especially existing SWP_.

For example, calling the latter SWP_SWAPIN_ERROR
Miaohe Lin April 19, 2022, 12:45 p.m. UTC | #14
On 2022/4/19 20:12, David Hildenbrand wrote:
> On 19.04.22 14:00, Miaohe Lin wrote:
>> On 2022/4/19 19:46, David Hildenbrand wrote:
>> ...
>>>> Do you mean that we should set the pfn to 0 for the hwpoison marker so that we can
>>>> distinguish swapin error case from real hwpoison case?
>>>
>>> I am not sure if we really have to distinguish. However, "0" seems to
>>> make sense to indicate "this is not an actual problematic PFN, the
>>> information is simply no longer around due to a hardware issue.
>>>
>>
>> IMHO, we have to distinguish. For example, we might need to return VM_FAULT_SIGBUS
>> instead of VM_FAULT_HWPOISON when user accesses the error page. Or should we simply
>> return VM_FAULT_HWPOISON to simplify the handling?
> 
> Hm, you're right. In e.g., x86 do_sigbus() we would send an BUS_MCEERR_AR.
> 
> So yes, if we reuse is_hwpoison_entry() we'd have to convert to either
> VM_FAULT_HWPOISON or VM_FAULT_SIGBUS.
> 
> Something like "is_error_entry()" that can further be refined to
> hwpoison or swapin could make sense. But what you have here is straight
> forward to me as well. Whatever you/others prefer.

IMHO, I prefer to use the separated swapin error maker because reusing hwpoison entry
might make things more complicated and looks somewhat obscure.

> 
> Acked-by: David Hildenbrand <david@redhat.com>

Many thanks for your Acked-by tag!

> 
> 
> NIT: I'd make the terminology make_swapin_error_entry() consistent with
> SWAP_READ_ERROR and especially existing SWP_.
> 
> For example, calling the latter SWP_SWAPIN_ERROR

This looks better. Thanks! Will change to rename the relevant terminology in next
version if no one has an objection. :)

>
Peter Xu April 19, 2022, 4:16 p.m. UTC | #15
On Tue, Apr 19, 2022 at 01:14:29PM +0200, David Hildenbrand wrote:
> On 19.04.22 10:08, Alistair Popple wrote:
> > David Hildenbrand <david@redhat.com> writes:
> > 
> >> On 19.04.22 09:29, Miaohe Lin wrote:
> >>> On 2022/4/19 11:51, Alistair Popple wrote:
> >>>> Miaohe Lin <linmiaohe@huawei.com> writes:
> >>>>
> >>>>> There is a bug in unuse_pte(): when swap page happens to be unreadable,
> >>>>> page filled with random data is mapped into user address space. In case
> >>>>> of error, a special swap entry indicating swap read fails is set to the
> >>>>> page table. So the swapcache page can be freed and the user won't end up
> >>>>> with a permanently mounted swap because a sector is bad. And if the page
> >>>>> is accessed later, the user process will be killed so that corrupted data
> >>>>> is never consumed. On the other hand, if the page is never accessed, the
> >>>>> user won't even notice it.
> >>>>
> >>>> Hi Miaohe,
> >>>>> It seems we're not actually using the pfn that gets stored in the special swap
> >>>> entry here. Is my understanding correct? If so I think it would be better to use
> >>>
> >>> Yes, you're right. The pfn is not used now. What we need here is a special swap entry
> >>> to do the right things. I think we can change to store some debugging information instead
> >>> of pfn if needed in the future.
> >>>
> >>>> the new PTE markers Peter introduced[1] rather than adding another swap entry
> >>>> type.
> >>>
> >>> IIUC, we should not reuse that swap entry here. From definition:
> >>>
> >>> PTE markers
> >>> `========='
> >>> ...
> >>> PTE marker is a new type of swap entry that is ony applicable to file
> >>> backed memories like shmem and hugetlbfs.  It's used to persist some
> >>> pte-level information even if the original present ptes in pgtable are
> >>> zapped.
> >>>
> >>> It's designed for file backed memories while swapin error entry is for anonymous
> >>> memories. And there has some differences in processing. So it's not a good idea
> >>> to reuse pte markers. Or am I miss something?
> >>
> >> I tend to agree. As raised in my other reply, maybe we can simply reuse
> >> hwpoison entries and update the documentation of them accordingly.
> > 
> > Unless I've missed something I don't think PTE markers should be restricted
> > solely to file backed memory. It's true that the only user of them at the moment
> > is UFFD-WP for file backed memory, but PTE markers are just a special swap entry
> > same as what is added here.
> 
> There is a difference.
> 
> What we want here is "there used to be something mapped but it's not
> readable anymore. Please fail hard when userspace tries accessing
> this.". Just like with hwpoison entries.
> 
> What a pte marker expresses is that "here is nothing mapped right now
> but we have additional metadata available here. For file-backed memory,
> it translates to: If we ever touch this page, lookup the pagecache what
> to map here."
> 
> In the anonymous memory world, this would map to "populate the zeropage
> or a fresh anonymous page on access." and keep the metadata around.

So far it's defined like that, but it does not necessarily need to.  IMHO
PTE marker could work here for the anonymous use case as Alistair stated.
Say, it's fairly simple to not go into anonymous page handling at all if we
see this pte marker with the new bit set.  It's indeed just tailored for
such use case where we don't need to store special data like pfn.

Hwpoison entry looks good to me too, but as discussed we may need to
reserve pfn=0 or -1 or anything we're sure an invalid value, and then we'll
also need to cover the rest hwpoison related code (carefully, as rightfully
pointed out by Miaohe on the difference of VM_FAULT_* fields being
returned) to not faultly treat the "swp device read error" with general
MCEs.

From that POV it seems pte markers would be slightly cleaner, we'll need to
touch up existing pte markers code path to start accept anonymous vmas,
though.  No strong opinion on this.

Btw, is there an error dumped into dmesg when the read error happens (e.g.,
would block IO path trigger some warning already)?  I'm wondering whether
we should report it to the user somehow so that the user should know even
earlier than when the bad page is accessed, then the user could potentially
do something useful.
Peter Xu April 19, 2022, 9:36 p.m. UTC | #16
On Sat, Apr 16, 2022 at 11:05:49AM +0800, Miaohe Lin wrote:
> @@ -1797,6 +1797,17 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
>  		goto out;
>  	}
>  
> +	if (unlikely(!PageUptodate(page))) {
> +		pte_t pteval;
> +
> +		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
> +		pteval = swp_entry_to_pte(make_swapin_error_entry(page));
> +		set_pte_at(vma->vm_mm, addr, pte, pteval);
> +		swap_free(entry);
> +		ret = 0;
> +		goto out;
> +	}
> +
>  	/* See do_swap_page() */
>  	BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
>  	BUG_ON(PageAnon(page) && PageAnonExclusive(page));

Totally off-topic, but.. today when I was looking at the unuse path I just
found that the swp bits could have got lost for either soft-dirty and
uffd-wp here?  A quick patch attached.

Maybe at some point we should start to have some special helpers for
set_pte_at() when we're converting between present/non-present ptes, so as
to make sure all these will always be taken care of properly.
Alistair Popple April 20, 2022, 12:25 a.m. UTC | #17
Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2022/4/19 15:53, Alistair Popple wrote:
>> Also in madvise_free_pte_range() you could just remove the swap entry as it's no
>> longer needed.
>>
>
> This swap entry will be removed in madvise_dontneed_single_vma().
> And in madvise_free_pte_range(), we may need to keep it as same as
> hwpoison entry. Or am I supposed to remove it even if hwpoison entry
> is reused later?

Why would we need to keep it for MADV_FREE though? It only works on private
anonymous memory, and once the MADV_FREE operation has succeeded callers can
expect they might get zero-fill pages if accessing the memory again. Therefore
it should be safe to delete the entry. I think that applies equally to a
hwpoison entry too - there's no reason to kill the process if it has called
MADV_FREE on the range.

>
> Thanks!
>
>> Alistair Popple <apopple@nvidia.com> writes:
>>
>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>
> ...
kernel test robot April 20, 2022, 5:56 a.m. UTC | #18
Hi Peter,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on hnaz-mm/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Peter-Xu/mm-swap-Fix-lost-swap-bits-in-unuse_pte/20220420-053845
base:   https://github.com/hnaz/linux-mm master
config: s390-randconfig-r023-20220420 (https://download.01.org/0day-ci/archive/20220420/202204201313.QYiDBRbL-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project bac6cd5bf85669e3376610cfc4c4f9ca015e7b9b)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install s390 cross compiling tool for clang build
        # apt-get install binutils-s390x-linux-gnu
        # https://github.com/intel-lab-lkp/linux/commit/355ac3eb45402f7aab25b76af029d4390af05238
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Peter-Xu/mm-swap-Fix-lost-swap-bits-in-unuse_pte/20220420-053845
        git checkout 355ac3eb45402f7aab25b76af029d4390af05238
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> mm/swapfile.c:1824:2: error: use of undeclared identifier 'new_pte'; did you mean 'newpte'?
           new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
           ^~~~~~~
           newpte
   mm/swapfile.c:1786:14: note: 'newpte' declared here
           pte_t *pte, newpte;
                       ^
   mm/swapfile.c:1826:26: error: use of undeclared identifier 'new_pte'
                   pte = pte_mksoft_dirty(new_pte);
                                          ^
   mm/swapfile.c:1828:23: error: use of undeclared identifier 'new_pte'
                   pte = pte_mkuffd_wp(new_pte);
                                       ^
   mm/swapfile.c:1829:36: error: use of undeclared identifier 'new_pte'; did you mean 'newpte'?
           set_pte_at(vma->vm_mm, addr, pte, new_pte);
                                             ^~~~~~~
                                             newpte
   mm/swapfile.c:1786:14: note: 'newpte' declared here
           pte_t *pte, newpte;
                       ^
   4 errors generated.


vim +1824 mm/swapfile.c

  1775	
  1776	/*
  1777	 * No need to decide whether this PTE shares the swap entry with others,
  1778	 * just let do_wp_page work it out if a write is requested later - to
  1779	 * force COW, vm_page_prot omits write permission from any private vma.
  1780	 */
  1781	static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
  1782			unsigned long addr, swp_entry_t entry, struct page *page)
  1783	{
  1784		struct page *swapcache;
  1785		spinlock_t *ptl;
  1786		pte_t *pte, newpte;
  1787		int ret = 1;
  1788	
  1789		swapcache = page;
  1790		page = ksm_might_need_to_copy(page, vma, addr);
  1791		if (unlikely(!page))
  1792			return -ENOMEM;
  1793	
  1794		pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  1795		if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
  1796			ret = 0;
  1797			goto out;
  1798		}
  1799	
  1800		/* See do_swap_page() */
  1801		BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
  1802		BUG_ON(PageAnon(page) && PageAnonExclusive(page));
  1803	
  1804		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
  1805		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
  1806		get_page(page);
  1807		if (page == swapcache) {
  1808			rmap_t rmap_flags = RMAP_NONE;
  1809	
  1810			/*
  1811			 * See do_swap_page(): PageWriteback() would be problematic.
  1812			 * However, we do a wait_on_page_writeback() just before this
  1813			 * call and have the page locked.
  1814			 */
  1815			VM_BUG_ON_PAGE(PageWriteback(page), page);
  1816			if (pte_swp_exclusive(*pte))
  1817				rmap_flags |= RMAP_EXCLUSIVE;
  1818	
  1819			page_add_anon_rmap(page, vma, addr, rmap_flags);
  1820		} else { /* ksm created a completely new copy */
  1821			page_add_new_anon_rmap(page, vma, addr);
  1822			lru_cache_add_inactive_or_unevictable(page, vma);
  1823		}
> 1824		new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
  1825		if (pte_swp_soft_dirty(*pte))
  1826			pte = pte_mksoft_dirty(new_pte);
  1827		if (pte_swp_uffd_wp(*pte))
  1828			pte = pte_mkuffd_wp(new_pte);
  1829		set_pte_at(vma->vm_mm, addr, pte, new_pte);
  1830		swap_free(entry);
  1831	out:
  1832		pte_unmap_unlock(pte, ptl);
  1833		if (page != swapcache) {
  1834			unlock_page(page);
  1835			put_page(page);
  1836		}
  1837		return ret;
  1838	}
  1839
Miaohe Lin April 20, 2022, 6:15 a.m. UTC | #19
On 2022/4/20 8:25, Alistair Popple wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> On 2022/4/19 15:53, Alistair Popple wrote:
>>> Also in madvise_free_pte_range() you could just remove the swap entry as it's no
>>> longer needed.
>>>
>>
>> This swap entry will be removed in madvise_dontneed_single_vma().
>> And in madvise_free_pte_range(), we may need to keep it as same as
>> hwpoison entry. Or am I supposed to remove it even if hwpoison entry
>> is reused later?
> 
> Why would we need to keep it for MADV_FREE though? It only works on private
> anonymous memory, and once the MADV_FREE operation has succeeded callers can
> expect they might get zero-fill pages if accessing the memory again. Therefore
> it should be safe to delete the entry. I think that applies equally to a
> hwpoison entry too - there's no reason to kill the process if it has called
> MADV_FREE on the range.

I tend to agree. We can drop the swapin error entry and hwpoison entry when MADV_FREE
is called. Should I squash these into the current patch or a separate one is preferred?

Thanks for your suggestion!

> 
>>
>> Thanks!
>>
>>> Alistair Popple <apopple@nvidia.com> writes:
>>>
>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>
>> ...
Miaohe Lin April 20, 2022, 6:21 a.m. UTC | #20
On 2022/4/20 5:36, Peter Xu wrote:
> On Sat, Apr 16, 2022 at 11:05:49AM +0800, Miaohe Lin wrote:
>> @@ -1797,6 +1797,17 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
>>  		goto out;
>>  	}
>>  
>> +	if (unlikely(!PageUptodate(page))) {
>> +		pte_t pteval;
>> +
>> +		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
>> +		pteval = swp_entry_to_pte(make_swapin_error_entry(page));
>> +		set_pte_at(vma->vm_mm, addr, pte, pteval);
>> +		swap_free(entry);
>> +		ret = 0;
>> +		goto out;
>> +	}
>> +
>>  	/* See do_swap_page() */
>>  	BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
>>  	BUG_ON(PageAnon(page) && PageAnonExclusive(page));
> 
> Totally off-topic, but.. today when I was looking at the unuse path I just
> found that the swp bits could have got lost for either soft-dirty and
> uffd-wp here?  A quick patch attached.

Am I supposed to test-and-send this patch? The patch looks good to me except the
build error pointed out by kernel test robot.

> 
> Maybe at some point we should start to have some special helpers for
> set_pte_at() when we're converting between present/non-present ptes, so as
> to make sure all these will always be taken care of properly.

That will be helpful. There are many places doing the similar thing.

> 

Thanks!
Miaohe Lin April 20, 2022, 6:23 a.m. UTC | #21
On 2022/4/20 13:56, kernel test robot wrote:
> Hi Peter,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on hnaz-mm/master]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Peter-Xu/mm-swap-Fix-lost-swap-bits-in-unuse_pte/20220420-053845
> base:   https://github.com/hnaz/linux-mm master
> config: s390-randconfig-r023-20220420 (https://download.01.org/0day-ci/archive/20220420/202204201313.QYiDBRbL-lkp@intel.com/config)
> compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project bac6cd5bf85669e3376610cfc4c4f9ca015e7b9b)
> reproduce (this is a W=1 build):
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # install s390 cross compiling tool for clang build
>         # apt-get install binutils-s390x-linux-gnu
>         # https://github.com/intel-lab-lkp/linux/commit/355ac3eb45402f7aab25b76af029d4390af05238
>         git remote add linux-review https://github.com/intel-lab-lkp/linux
>         git fetch --no-tags linux-review Peter-Xu/mm-swap-Fix-lost-swap-bits-in-unuse_pte/20220420-053845
>         git checkout 355ac3eb45402f7aab25b76af029d4390af05238
>         # save the config file
>         mkdir build_dir && cp config build_dir/.config
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash
> 
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>
> 

The variable name is newpte. But it's written as new_pte latter. Many thanks for report!

BTW: Since this is not a formal patch (no compile tested). Is Reported-by tag still needed?

Thanks!

> All errors (new ones prefixed by >>):
> 
>>> mm/swapfile.c:1824:2: error: use of undeclared identifier 'new_pte'; did you mean 'newpte'?
>            new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
>            ^~~~~~~
>            newpte
>    mm/swapfile.c:1786:14: note: 'newpte' declared here
>            pte_t *pte, newpte;
>                        ^
>    mm/swapfile.c:1826:26: error: use of undeclared identifier 'new_pte'
>                    pte = pte_mksoft_dirty(new_pte);
>                                           ^
>    mm/swapfile.c:1828:23: error: use of undeclared identifier 'new_pte'
>                    pte = pte_mkuffd_wp(new_pte);
>                                        ^
>    mm/swapfile.c:1829:36: error: use of undeclared identifier 'new_pte'; did you mean 'newpte'?
>            set_pte_at(vma->vm_mm, addr, pte, new_pte);
>                                              ^~~~~~~
>                                              newpte
>    mm/swapfile.c:1786:14: note: 'newpte' declared here
>            pte_t *pte, newpte;
>                        ^
>    4 errors generated.
...
>   1839	
>
Philip Li April 20, 2022, 6:39 a.m. UTC | #22
On Wed, Apr 20, 2022 at 02:23:27PM +0800, Miaohe Lin wrote:
> On 2022/4/20 13:56, kernel test robot wrote:
> > Hi Peter,
> > 
> > Thank you for the patch! Yet something to improve:
> > 
> > [auto build test ERROR on hnaz-mm/master]
> > 
> > url:    https://github.com/intel-lab-lkp/linux/commits/Peter-Xu/mm-swap-Fix-lost-swap-bits-in-unuse_pte/20220420-053845
> > base:   https://github.com/hnaz/linux-mm master
> > config: s390-randconfig-r023-20220420 (https://download.01.org/0day-ci/archive/20220420/202204201313.QYiDBRbL-lkp@intel.com/config)
> > compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project bac6cd5bf85669e3376610cfc4c4f9ca015e7b9b)
> > reproduce (this is a W=1 build):
> >         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
> >         chmod +x ~/bin/make.cross
> >         # install s390 cross compiling tool for clang build
> >         # apt-get install binutils-s390x-linux-gnu
> >         # https://github.com/intel-lab-lkp/linux/commit/355ac3eb45402f7aab25b76af029d4390af05238
> >         git remote add linux-review https://github.com/intel-lab-lkp/linux
> >         git fetch --no-tags linux-review Peter-Xu/mm-swap-Fix-lost-swap-bits-in-unuse_pte/20220420-053845
> >         git checkout 355ac3eb45402f7aab25b76af029d4390af05238
> >         # save the config file
> >         mkdir build_dir && cp config build_dir/.config
> >         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash
> > 
> > If you fix the issue, kindly add following tag as appropriate
> > Reported-by: kernel test robot <lkp@intel.com>
> > 
> 
> The variable name is newpte. But it's written as new_pte latter. Many thanks for report!
> 
> BTW: Since this is not a formal patch (no compile tested). Is Reported-by tag still needed?

thanks, this is not needed. It mostly needed for already upstreamed one
and actually not mandatory from 0-day ci perspective.

> 
> Thanks!
> 
> > All errors (new ones prefixed by >>):
> > 
> >>> mm/swapfile.c:1824:2: error: use of undeclared identifier 'new_pte'; did you mean 'newpte'?
> >            new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
> >            ^~~~~~~
> >            newpte
> >    mm/swapfile.c:1786:14: note: 'newpte' declared here
> >            pte_t *pte, newpte;
> >                        ^
> >    mm/swapfile.c:1826:26: error: use of undeclared identifier 'new_pte'
> >                    pte = pte_mksoft_dirty(new_pte);
> >                                           ^
> >    mm/swapfile.c:1828:23: error: use of undeclared identifier 'new_pte'
> >                    pte = pte_mkuffd_wp(new_pte);
> >                                        ^
> >    mm/swapfile.c:1829:36: error: use of undeclared identifier 'new_pte'; did you mean 'newpte'?
> >            set_pte_at(vma->vm_mm, addr, pte, new_pte);
> >                                              ^~~~~~~
> >                                              newpte
> >    mm/swapfile.c:1786:14: note: 'newpte' declared here
> >            pte_t *pte, newpte;
> >                        ^
> >    4 errors generated.
> ...
> >   1839	
> > 
> _______________________________________________
> kbuild-all mailing list -- kbuild-all@lists.01.org
> To unsubscribe send an email to kbuild-all-leave@lists.01.org
Chen, Rong A April 20, 2022, 6:48 a.m. UTC | #23
On 4/20/2022 2:23 PM, Miaohe Lin wrote:
> On 2022/4/20 13:56, kernel test robot wrote:
>> Hi Peter,
>>
>> Thank you for the patch! Yet something to improve:
>>
>> [auto build test ERROR on hnaz-mm/master]
>>
>> url:    https://github.com/intel-lab-lkp/linux/commits/Peter-Xu/mm-swap-Fix-lost-swap-bits-in-unuse_pte/20220420-053845
>> base:   https://github.com/hnaz/linux-mm master
>> config: s390-randconfig-r023-20220420 (https://download.01.org/0day-ci/archive/20220420/202204201313.QYiDBRbL-lkp@intel.com/config)
>> compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project bac6cd5bf85669e3376610cfc4c4f9ca015e7b9b)
>> reproduce (this is a W=1 build):
>>          wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>>          chmod +x ~/bin/make.cross
>>          # install s390 cross compiling tool for clang build
>>          # apt-get install binutils-s390x-linux-gnu
>>          # https://github.com/intel-lab-lkp/linux/commit/355ac3eb45402f7aab25b76af029d4390af05238
>>          git remote add linux-review https://github.com/intel-lab-lkp/linux
>>          git fetch --no-tags linux-review Peter-Xu/mm-swap-Fix-lost-swap-bits-in-unuse_pte/20220420-053845
>>          git checkout 355ac3eb45402f7aab25b76af029d4390af05238
>>          # save the config file
>>          mkdir build_dir && cp config build_dir/.config
>>          COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash
>>
>> If you fix the issue, kindly add following tag as appropriate
>> Reported-by: kernel test robot <lkp@intel.com>
>>
> 
> The variable name is newpte. But it's written as new_pte latter. Many thanks for report!
> 
> BTW: Since this is not a formal patch (no compile tested). Is Reported-by tag still needed?

Hi Miaohe,

Please ignore the tag, it's only a suggestion, the bot doesn't
know the entire picture.

Best Regards,
Rong Chen

> 
> Thanks!
> 
>> All errors (new ones prefixed by >>):
>>
>>>> mm/swapfile.c:1824:2: error: use of undeclared identifier 'new_pte'; did you mean 'newpte'?
>>             new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
>>             ^~~~~~~
>>             newpte
>>     mm/swapfile.c:1786:14: note: 'newpte' declared here
>>             pte_t *pte, newpte;
>>                         ^
>>     mm/swapfile.c:1826:26: error: use of undeclared identifier 'new_pte'
>>                     pte = pte_mksoft_dirty(new_pte);
>>                                            ^
>>     mm/swapfile.c:1828:23: error: use of undeclared identifier 'new_pte'
>>                     pte = pte_mkuffd_wp(new_pte);
>>                                         ^
>>     mm/swapfile.c:1829:36: error: use of undeclared identifier 'new_pte'; did you mean 'newpte'?
>>             set_pte_at(vma->vm_mm, addr, pte, new_pte);
>>                                               ^~~~~~~
>>                                               newpte
>>     mm/swapfile.c:1786:14: note: 'newpte' declared here
>>             pte_t *pte, newpte;
>>                         ^
>>     4 errors generated.
> ...
>>    1839	
>>
> _______________________________________________
> kbuild-all mailing list -- kbuild-all@lists.01.org
> To unsubscribe send an email to kbuild-all-leave@lists.01.org
>
Miaohe Lin April 20, 2022, 6:52 a.m. UTC | #24
On 2022/4/20 14:39, Philip Li wrote:
> On Wed, Apr 20, 2022 at 02:23:27PM +0800, Miaohe Lin wrote:
>> On 2022/4/20 13:56, kernel test robot wrote:
>>> Hi Peter,
>>>
>>> Thank you for the patch! Yet something to improve:
>>>
>>> [auto build test ERROR on hnaz-mm/master]
>>>
>>> url:    https://github.com/intel-lab-lkp/linux/commits/Peter-Xu/mm-swap-Fix-lost-swap-bits-in-unuse_pte/20220420-053845
>>> base:   https://github.com/hnaz/linux-mm master
>>> config: s390-randconfig-r023-20220420 (https://download.01.org/0day-ci/archive/20220420/202204201313.QYiDBRbL-lkp@intel.com/config)
>>> compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project bac6cd5bf85669e3376610cfc4c4f9ca015e7b9b)
>>> reproduce (this is a W=1 build):
>>>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>>>         chmod +x ~/bin/make.cross
>>>         # install s390 cross compiling tool for clang build
>>>         # apt-get install binutils-s390x-linux-gnu
>>>         # https://github.com/intel-lab-lkp/linux/commit/355ac3eb45402f7aab25b76af029d4390af05238
>>>         git remote add linux-review https://github.com/intel-lab-lkp/linux
>>>         git fetch --no-tags linux-review Peter-Xu/mm-swap-Fix-lost-swap-bits-in-unuse_pte/20220420-053845
>>>         git checkout 355ac3eb45402f7aab25b76af029d4390af05238
>>>         # save the config file
>>>         mkdir build_dir && cp config build_dir/.config
>>>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash
>>>
>>> If you fix the issue, kindly add following tag as appropriate
>>> Reported-by: kernel test robot <lkp@intel.com>
>>>
>>
>> The variable name is newpte. But it's written as new_pte latter. Many thanks for report!
>>
>> BTW: Since this is not a formal patch (no compile tested). Is Reported-by tag still needed?
> 
> thanks, this is not needed. It mostly needed for already upstreamed one
> and actually not mandatory from 0-day ci perspective.

I see. Many thanks for your clarifying and meaningful work! :)

>
Miaohe Lin April 20, 2022, 6:56 a.m. UTC | #25
On 2022/4/20 14:48, Chen, Rong A wrote:
> 
> 
> On 4/20/2022 2:23 PM, Miaohe Lin wrote:
>> On 2022/4/20 13:56, kernel test robot wrote:
>>> Hi Peter,
>>>
>>> Thank you for the patch! Yet something to improve:
>>>
>>> [auto build test ERROR on hnaz-mm/master]
>>>
>>> url:    https://github.com/intel-lab-lkp/linux/commits/Peter-Xu/mm-swap-Fix-lost-swap-bits-in-unuse_pte/20220420-053845
>>> base:   https://github.com/hnaz/linux-mm master
>>> config: s390-randconfig-r023-20220420 (https://download.01.org/0day-ci/archive/20220420/202204201313.QYiDBRbL-lkp@intel.com/config)
>>> compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project bac6cd5bf85669e3376610cfc4c4f9ca015e7b9b)
>>> reproduce (this is a W=1 build):
>>>          wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>>>          chmod +x ~/bin/make.cross
>>>          # install s390 cross compiling tool for clang build
>>>          # apt-get install binutils-s390x-linux-gnu
>>>          # https://github.com/intel-lab-lkp/linux/commit/355ac3eb45402f7aab25b76af029d4390af05238
>>>          git remote add linux-review https://github.com/intel-lab-lkp/linux
>>>          git fetch --no-tags linux-review Peter-Xu/mm-swap-Fix-lost-swap-bits-in-unuse_pte/20220420-053845
>>>          git checkout 355ac3eb45402f7aab25b76af029d4390af05238
>>>          # save the config file
>>>          mkdir build_dir && cp config build_dir/.config
>>>          COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash
>>>
>>> If you fix the issue, kindly add following tag as appropriate
>>> Reported-by: kernel test robot <lkp@intel.com>
>>>
>>
>> The variable name is newpte. But it's written as new_pte latter. Many thanks for report!
>>
>> BTW: Since this is not a formal patch (no compile tested). Is Reported-by tag still needed?
> 
> Hi Miaohe,
> 
> Please ignore the tag, it's only a suggestion, the bot doesn't
> know the entire picture.
> 

I see. Many thanks for your clarifying again! :)

> Best Regards,
> Rong Chen
> 
>>
...
>> _______________________________________________
>> kbuild-all mailing list -- kbuild-all@lists.01.org
>> To unsubscribe send an email to kbuild-all-leave@lists.01.org
>>
> .
David Hildenbrand April 20, 2022, 7:07 a.m. UTC | #26
On 20.04.22 08:15, Miaohe Lin wrote:
> On 2022/4/20 8:25, Alistair Popple wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>
>>> On 2022/4/19 15:53, Alistair Popple wrote:
>>>> Also in madvise_free_pte_range() you could just remove the swap entry as it's no
>>>> longer needed.
>>>>
>>>
>>> This swap entry will be removed in madvise_dontneed_single_vma().
>>> And in madvise_free_pte_range(), we may need to keep it as same as
>>> hwpoison entry. Or am I supposed to remove it even if hwpoison entry
>>> is reused later?
>>
>> Why would we need to keep it for MADV_FREE though? It only works on private
>> anonymous memory, and once the MADV_FREE operation has succeeded callers can
>> expect they might get zero-fill pages if accessing the memory again. Therefore
>> it should be safe to delete the entry. I think that applies equally to a
>> hwpoison entry too - there's no reason to kill the process if it has called
>> MADV_FREE on the range.
> 
> I tend to agree. We can drop the swapin error entry and hwpoison entry when MADV_FREE
> is called. Should I squash these into the current patch or a separate one is preferred?
> 

That should go into a separate patch.
Miaohe Lin April 20, 2022, 8:37 a.m. UTC | #27
On 2022/4/20 15:07, David Hildenbrand wrote:
> On 20.04.22 08:15, Miaohe Lin wrote:
>> On 2022/4/20 8:25, Alistair Popple wrote:
>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>
>>>> On 2022/4/19 15:53, Alistair Popple wrote:
>>>>> Also in madvise_free_pte_range() you could just remove the swap entry as it's no
>>>>> longer needed.
>>>>>
>>>>
>>>> This swap entry will be removed in madvise_dontneed_single_vma().
>>>> And in madvise_free_pte_range(), we may need to keep it as same as
>>>> hwpoison entry. Or am I supposed to remove it even if hwpoison entry
>>>> is reused later?
>>>
>>> Why would we need to keep it for MADV_FREE though? It only works on private
>>> anonymous memory, and once the MADV_FREE operation has succeeded callers can
>>> expect they might get zero-fill pages if accessing the memory again. Therefore
>>> it should be safe to delete the entry. I think that applies equally to a
>>> hwpoison entry too - there's no reason to kill the process if it has called
>>> MADV_FREE on the range.
>>
>> I tend to agree. We can drop the swapin error entry and hwpoison entry when MADV_FREE
>> is called. Should I squash these into the current patch or a separate one is preferred?
>>
> 
> That should go into a separate patch.

Will do. Many thanks for your suggestion.

>
Peter Xu April 20, 2022, 1:32 p.m. UTC | #28
On Wed, Apr 20, 2022 at 02:21:27PM +0800, Miaohe Lin wrote:
> On 2022/4/20 5:36, Peter Xu wrote:
> > On Sat, Apr 16, 2022 at 11:05:49AM +0800, Miaohe Lin wrote:
> >> @@ -1797,6 +1797,17 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
> >>  		goto out;
> >>  	}
> >>  
> >> +	if (unlikely(!PageUptodate(page))) {
> >> +		pte_t pteval;
> >> +
> >> +		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
> >> +		pteval = swp_entry_to_pte(make_swapin_error_entry(page));
> >> +		set_pte_at(vma->vm_mm, addr, pte, pteval);
> >> +		swap_free(entry);
> >> +		ret = 0;
> >> +		goto out;
> >> +	}
> >> +
> >>  	/* See do_swap_page() */
> >>  	BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
> >>  	BUG_ON(PageAnon(page) && PageAnonExclusive(page));
> > 
> > Totally off-topic, but.. today when I was looking at the unuse path I just
> > found that the swp bits could have got lost for either soft-dirty and
> > uffd-wp here?  A quick patch attached.
> 
> Am I supposed to test-and-send this patch? The patch looks good to me except the
> build error pointed out by kernel test robot.

I was planning to post a patch after yours since they're touching the same
function, but yeah it'll be great if you could also take that over, thanks!
Miaohe Lin April 21, 2022, 1:50 a.m. UTC | #29
On 2022/4/20 21:32, Peter Xu wrote:
> On Wed, Apr 20, 2022 at 02:21:27PM +0800, Miaohe Lin wrote:
>> On 2022/4/20 5:36, Peter Xu wrote:
>>> On Sat, Apr 16, 2022 at 11:05:49AM +0800, Miaohe Lin wrote:
>>>> @@ -1797,6 +1797,17 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
>>>>  		goto out;
>>>>  	}
>>>>  
>>>> +	if (unlikely(!PageUptodate(page))) {
>>>> +		pte_t pteval;
>>>> +
>>>> +		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
>>>> +		pteval = swp_entry_to_pte(make_swapin_error_entry(page));
>>>> +		set_pte_at(vma->vm_mm, addr, pte, pteval);
>>>> +		swap_free(entry);
>>>> +		ret = 0;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>>  	/* See do_swap_page() */
>>>>  	BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
>>>>  	BUG_ON(PageAnon(page) && PageAnonExclusive(page));
>>>
>>> Totally off-topic, but.. today when I was looking at the unuse path I just
>>> found that the swp bits could have got lost for either soft-dirty and
>>> uffd-wp here?  A quick patch attached.
>>
>> Am I supposed to test-and-send this patch? The patch looks good to me except the
>> build error pointed out by kernel test robot.
> 
> I was planning to post a patch after yours since they're touching the same
> function, but yeah it'll be great if you could also take that over, thanks!

Sure, I will take this in next version. Thanks a lot! :)

>
diff mbox series

Patch

diff --git a/include/linux/swap.h b/include/linux/swap.h
index d112434f85df..03c576111737 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -55,6 +55,10 @@  static inline int current_is_kswapd(void)
  * actions on faults.
  */
 
+#define SWAP_READ_ERROR_NUM 1
+#define SWAP_READ_ERROR     (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
+			     SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \
+			     SWP_PTE_MARKER_NUM)
 /*
  * PTE markers are used to persist information onto PTEs that are mapped with
  * file-backed memories.  As its name "PTE" hints, it should only be applied to
@@ -120,7 +124,8 @@  static inline int current_is_kswapd(void)
 
 #define MAX_SWAPFILES \
 	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
-	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_PTE_MARKER_NUM)
+	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \
+	SWP_PTE_MARKER_NUM - SWAP_READ_ERROR_NUM)
 
 /*
  * Magic header for a swap area. The first part of the union is
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index fffbba0036f6..d1093384de9f 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -108,6 +108,16 @@  static inline void *swp_to_radix_entry(swp_entry_t entry)
 	return xa_mk_value(entry.val);
 }
 
+static inline swp_entry_t make_swapin_error_entry(struct page *page)
+{
+	return swp_entry(SWAP_READ_ERROR, page_to_pfn(page));
+}
+
+static inline int is_swapin_error_entry(swp_entry_t entry)
+{
+	return swp_type(entry) == SWAP_READ_ERROR;
+}
+
 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
 static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
 {
diff --git a/mm/memory.c b/mm/memory.c
index e6434b824009..34d1d66a05bd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1476,7 +1476,8 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			/* Only drop the uffd-wp marker if explicitly requested */
 			if (!zap_drop_file_uffd_wp(details))
 				continue;
-		} else if (is_hwpoison_entry(entry)) {
+		} else if (is_hwpoison_entry(entry) ||
+			   is_swapin_error_entry(entry)) {
 			if (!should_zap_cows(details))
 				continue;
 		} else {
@@ -3724,6 +3725,8 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
 		} else if (is_hwpoison_entry(entry)) {
 			ret = VM_FAULT_HWPOISON;
+		} else if (is_swapin_error_entry(entry)) {
+			ret = VM_FAULT_SIGBUS;
 		} else if (is_pte_marker_entry(entry)) {
 			ret = handle_pte_marker(vmf);
 		} else {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9398e915b36b..95b63f69f388 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1797,6 +1797,17 @@  static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 		goto out;
 	}
 
+	if (unlikely(!PageUptodate(page))) {
+		pte_t pteval;
+
+		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+		pteval = swp_entry_to_pte(make_swapin_error_entry(page));
+		set_pte_at(vma->vm_mm, addr, pte, pteval);
+		swap_free(entry);
+		ret = 0;
+		goto out;
+	}
+
 	/* See do_swap_page() */
 	BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
 	BUG_ON(PageAnon(page) && PageAnonExclusive(page));