diff mbox series

[mm-unstable,v4,5/9] mm, hwpoison: make unpoison aware of raw error info in hwpoisoned hugepage

Message ID 20220704013312.2415700-6-naoya.horiguchi@linux.dev (mailing list archive)
State New
Headers show
Series mm, hwpoison: enable 1GB hugepage support (v4) | expand

Commit Message

Naoya Horiguchi July 4, 2022, 1:33 a.m. UTC
From: Naoya Horiguchi <naoya.horiguchi@nec.com>

Raw error info list needs to be removed when hwpoisoned hugetlb is
unpoisoned.  And unpoison handler needs to know how many errors there
are in the target hugepage. So add them.

Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
---
 include/linux/swapops.h |  9 +++++++++
 mm/memory-failure.c     | 31 +++++++++++++++++++++++++------
 2 files changed, 34 insertions(+), 6 deletions(-)

Comments

Miaohe Lin July 6, 2022, 2:58 a.m. UTC | #1
On 2022/7/4 9:33, Naoya Horiguchi wrote:
> From: Naoya Horiguchi <naoya.horiguchi@nec.com>
> 
> Raw error info list needs to be removed when hwpoisoned hugetlb is
> unpoisoned.  And unpoison handler needs to know how many errors there
> are in the target hugepage. So add them.
> 
> Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
> ---
>  include/linux/swapops.h |  9 +++++++++
>  mm/memory-failure.c     | 31 +++++++++++++++++++++++++------
>  2 files changed, 34 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index a01aeb3fcc0b..ddc98f96ad2c 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -498,6 +498,11 @@ static inline void num_poisoned_pages_dec(void)
>  	atomic_long_dec(&num_poisoned_pages);
>  }
>  
> +static inline void num_poisoned_pages_sub(long i)
> +{
> +	atomic_long_sub(i, &num_poisoned_pages);
> +}
> +
>  #else
>  
>  static inline swp_entry_t make_hwpoison_entry(struct page *page)
> @@ -518,6 +523,10 @@ static inline struct page *hwpoison_entry_to_page(swp_entry_t entry)
>  static inline void num_poisoned_pages_inc(void)
>  {
>  }
> +
> +static inline void num_poisoned_pages_sub(long i)
> +{
> +}
>  #endif
>  
>  static inline int non_swap_entry(swp_entry_t entry)
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 53bf7486a245..6af2096d8ea0 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1722,22 +1722,33 @@ static inline int hugetlb_set_page_hwpoison(struct page *hpage,
>  	return ret;
>  }
>  
> -inline int hugetlb_clear_page_hwpoison(struct page *hpage)
> +static inline long free_raw_hwp_pages(struct page *hpage, bool move_flag)
>  {
>  	struct llist_head *head;
>  	struct llist_node *t, *tnode;
> +	long count = 0;
>  
> -	if (!HPageRawHwpUnreliable(hpage))
> -		ClearPageHWPoison(hpage);
>  	head = raw_hwp_list_head(hpage);
>  	llist_for_each_safe(tnode, t, head->first) {
>  		struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
>  
> -		SetPageHWPoison(p->page);
> +		if (move_flag)
> +			SetPageHWPoison(p->page);
>  		kfree(p);
> +		count++;
>  	}
>  	llist_del_all(head);
> -	return 0;
> +	return count;
> +}
> +
> +inline int hugetlb_clear_page_hwpoison(struct page *hpage)
> +{
> +	int ret = -EBUSY;
> +
> +	if (!HPageRawHwpUnreliable(hpage))
> +		ret = !TestClearPageHWPoison(hpage);
> +	free_raw_hwp_pages(hpage, true);
> +	return ret;
>  }
>  
>  /*
> @@ -1882,6 +1893,9 @@ static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *
>  	return 0;
>  }
>  
> +static inline void free_raw_hwp_pages(struct page *hpage, bool move_flag)
> +{
> +}
>  #endif	/* CONFIG_HUGETLB_PAGE */
>  
>  static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
> @@ -2287,6 +2301,7 @@ int unpoison_memory(unsigned long pfn)

Is it safe to unpoison a hugepage when HPageRawHwpUnreliable is set? I'm concerned because
some raw error info is missing in that case.

Thanks.

>  	struct page *p;
>  	int ret = -EBUSY;
>  	int freeit = 0;
> +	long count = 1;
>  	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
>  					DEFAULT_RATELIMIT_BURST);
>  
> @@ -2334,6 +2349,8 @@ int unpoison_memory(unsigned long pfn)
>  
>  	ret = get_hwpoison_page(p, MF_UNPOISON);
>  	if (!ret) {
> +		if (PageHuge(p))
> +			count = free_raw_hwp_pages(page, false);
>  		ret = TestClearPageHWPoison(page) ? 0 : -EBUSY;
>  	} else if (ret < 0) {
>  		if (ret == -EHWPOISON) {
> @@ -2342,6 +2359,8 @@ int unpoison_memory(unsigned long pfn)
>  			unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
>  					 pfn, &unpoison_rs);
>  	} else {
> +		if (PageHuge(p))
> +			count = free_raw_hwp_pages(page, false);
>  		freeit = !!TestClearPageHWPoison(p);
>  
>  		put_page(page);
> @@ -2354,7 +2373,7 @@ int unpoison_memory(unsigned long pfn)
>  unlock_mutex:
>  	mutex_unlock(&mf_mutex);
>  	if (!ret || freeit) {
> -		num_poisoned_pages_dec();
> +		num_poisoned_pages_sub(count);
>  		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
>  				 page_to_pfn(p), &unpoison_rs);
>  	}
>
HORIGUCHI NAOYA(堀口 直也) July 6, 2022, 11:06 p.m. UTC | #2
On Wed, Jul 06, 2022 at 10:58:53AM +0800, Miaohe Lin wrote:
> On 2022/7/4 9:33, Naoya Horiguchi wrote:
> > From: Naoya Horiguchi <naoya.horiguchi@nec.com>
> > 
> > Raw error info list needs to be removed when hwpoisoned hugetlb is
> > unpoisoned.  And unpoison handler needs to know how many errors there
> > are in the target hugepage. So add them.
> > 
> > Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
> > ---
> > @@ -2287,6 +2301,7 @@ int unpoison_memory(unsigned long pfn)
> 
> Is it safe to unpoison hugepage when HPageRawHwpUnreliable? I'm afraid because
> some raw error info is missing..

Ah, right. We need to prevent it.  I'll fix it by inserting the check.

 static inline long free_raw_hwp_pages(struct page *hpage, bool move_flag)
 {
         struct llist_head *head;
         struct llist_node *t, *tnode;
         long count = 0;
 
+        if (!HPageRawHwpUnreliable(hpage))
+                return 0;

Thanks,
Naoya Horiguchi

> Thanks.
> 
> >  	struct page *p;
> >  	int ret = -EBUSY;
> >  	int freeit = 0;
> > +	long count = 1;
> >  	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
> >  					DEFAULT_RATELIMIT_BURST);
> >  
> > @@ -2334,6 +2349,8 @@ int unpoison_memory(unsigned long pfn)
> >  
> >  	ret = get_hwpoison_page(p, MF_UNPOISON);
> >  	if (!ret) {
> > +		if (PageHuge(p))
> > +			count = free_raw_hwp_pages(page, false);
> >  		ret = TestClearPageHWPoison(page) ? 0 : -EBUSY;
> >  	} else if (ret < 0) {
> >  		if (ret == -EHWPOISON) {
> > @@ -2342,6 +2359,8 @@ int unpoison_memory(unsigned long pfn)
> >  			unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
> >  					 pfn, &unpoison_rs);
> >  	} else {
> > +		if (PageHuge(p))
> > +			count = free_raw_hwp_pages(page, false);
> >  		freeit = !!TestClearPageHWPoison(p);
> >  
> >  		put_page(page);
> > @@ -2354,7 +2373,7 @@ int unpoison_memory(unsigned long pfn)
> >  unlock_mutex:
> >  	mutex_unlock(&mf_mutex);
> >  	if (!ret || freeit) {
> > -		num_poisoned_pages_dec();
> > +		num_poisoned_pages_sub(count);
> >  		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
> >  				 page_to_pfn(p), &unpoison_rs);
> >  	}
> >
HORIGUCHI NAOYA(堀口 直也) July 7, 2022, 1:35 a.m. UTC | #3
On Wed, Jul 06, 2022 at 11:06:28PM +0000, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Wed, Jul 06, 2022 at 10:58:53AM +0800, Miaohe Lin wrote:
> > On 2022/7/4 9:33, Naoya Horiguchi wrote:
> > > From: Naoya Horiguchi <naoya.horiguchi@nec.com>
> > > 
> > > Raw error info list needs to be removed when hwpoisoned hugetlb is
> > > unpoisoned.  And unpoison handler needs to know how many errors there
> > > are in the target hugepage. So add them.
> > > 
> > > Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
> > > ---
> > > @@ -2287,6 +2301,7 @@ int unpoison_memory(unsigned long pfn)
> > 
> > Is it safe to unpoison hugepage when HPageRawHwpUnreliable? I'm afraid because
> > some raw error info is missing..
> 
> Ah, right. We need prevent it.  I'll fix it by inserting the check.
> 
>  static inline long free_raw_hwp_pages(struct page *hpage, bool move_flag)
>  {
>          struct llist_head *head;
>          struct llist_node *t, *tnode;
>          long count = 0;
>  
> +        if (!HPageRawHwpUnreliable(hpage))
> +                return 0;

No, I meant "if (HPageRawHwpUnreliable(hpage))", sorry for the noise :(

- Naoya Horiguchi
Miaohe Lin July 7, 2022, 3:08 a.m. UTC | #4
On 2022/7/7 9:35, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Wed, Jul 06, 2022 at 11:06:28PM +0000, HORIGUCHI NAOYA(堀口 直也) wrote:
>> On Wed, Jul 06, 2022 at 10:58:53AM +0800, Miaohe Lin wrote:
>>> On 2022/7/4 9:33, Naoya Horiguchi wrote:
>>>> From: Naoya Horiguchi <naoya.horiguchi@nec.com>
>>>>
>>>> Raw error info list needs to be removed when hwpoisoned hugetlb is
>>>> unpoisoned.  And unpoison handler needs to know how many errors there
>>>> are in the target hugepage. So add them.
>>>>
>>>> Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
>>>> ---
>>>> @@ -2287,6 +2301,7 @@ int unpoison_memory(unsigned long pfn)
>>>
>>> Is it safe to unpoison hugepage when HPageRawHwpUnreliable? I'm afraid because
>>> some raw error info is missing..
>>
>> Ah, right. We need prevent it.  I'll fix it by inserting the check.
>>
>>  static inline long free_raw_hwp_pages(struct page *hpage, bool move_flag)
>>  {
>>          struct llist_head *head;
>>          struct llist_node *t, *tnode;
>>          long count = 0;
>>  
>> +        if (!HPageRawHwpUnreliable(hpage))
>> +                return 0;

IIUC, even if we return 0 here, the caller will still do TestClearPageHWPoison (please see the
code diff below) and succeed in unpoisoning the page. Or am I missing something?

@@ -2334,6 +2349,8 @@ int unpoison_memory(unsigned long pfn)

 	ret = get_hwpoison_page(p, MF_UNPOISON);
 	if (!ret) {
+		if (PageHuge(p))
+			count = free_raw_hwp_pages(page, false);
 		ret = TestClearPageHWPoison(page) ? 0 : -EBUSY;
 	} else if (ret < 0) {
 		if (ret == -EHWPOISON) {

> 
> No, I meant "if (HPageRawHwpUnreliable(hpage))", sorry for the noise :(

No, thanks for your hard work!

> 
> - Naoya Horiguchi

Thanks.

>
diff mbox series

Patch

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index a01aeb3fcc0b..ddc98f96ad2c 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -498,6 +498,11 @@  static inline void num_poisoned_pages_dec(void)
 	atomic_long_dec(&num_poisoned_pages);
 }
 
+static inline void num_poisoned_pages_sub(long i)
+{
+	atomic_long_sub(i, &num_poisoned_pages);
+}
+
 #else
 
 static inline swp_entry_t make_hwpoison_entry(struct page *page)
@@ -518,6 +523,10 @@  static inline struct page *hwpoison_entry_to_page(swp_entry_t entry)
 static inline void num_poisoned_pages_inc(void)
 {
 }
+
+static inline void num_poisoned_pages_sub(long i)
+{
+}
 #endif
 
 static inline int non_swap_entry(swp_entry_t entry)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 53bf7486a245..6af2096d8ea0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1722,22 +1722,33 @@  static inline int hugetlb_set_page_hwpoison(struct page *hpage,
 	return ret;
 }
 
-inline int hugetlb_clear_page_hwpoison(struct page *hpage)
+static inline long free_raw_hwp_pages(struct page *hpage, bool move_flag)
 {
 	struct llist_head *head;
 	struct llist_node *t, *tnode;
+	long count = 0;
 
-	if (!HPageRawHwpUnreliable(hpage))
-		ClearPageHWPoison(hpage);
 	head = raw_hwp_list_head(hpage);
 	llist_for_each_safe(tnode, t, head->first) {
 		struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
 
-		SetPageHWPoison(p->page);
+		if (move_flag)
+			SetPageHWPoison(p->page);
 		kfree(p);
+		count++;
 	}
 	llist_del_all(head);
-	return 0;
+	return count;
+}
+
+inline int hugetlb_clear_page_hwpoison(struct page *hpage)
+{
+	int ret = -EBUSY;
+
+	if (!HPageRawHwpUnreliable(hpage))
+		ret = !TestClearPageHWPoison(hpage);
+	free_raw_hwp_pages(hpage, true);
+	return ret;
 }
 
 /*
@@ -1882,6 +1893,9 @@  static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *
 	return 0;
 }
 
+static inline void free_raw_hwp_pages(struct page *hpage, bool move_flag)
+{
+}
 #endif	/* CONFIG_HUGETLB_PAGE */
 
 static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
@@ -2287,6 +2301,7 @@  int unpoison_memory(unsigned long pfn)
 	struct page *p;
 	int ret = -EBUSY;
 	int freeit = 0;
+	long count = 1;
 	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
 					DEFAULT_RATELIMIT_BURST);
 
@@ -2334,6 +2349,8 @@  int unpoison_memory(unsigned long pfn)
 
 	ret = get_hwpoison_page(p, MF_UNPOISON);
 	if (!ret) {
+		if (PageHuge(p))
+			count = free_raw_hwp_pages(page, false);
 		ret = TestClearPageHWPoison(page) ? 0 : -EBUSY;
 	} else if (ret < 0) {
 		if (ret == -EHWPOISON) {
@@ -2342,6 +2359,8 @@  int unpoison_memory(unsigned long pfn)
 			unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
 					 pfn, &unpoison_rs);
 	} else {
+		if (PageHuge(p))
+			count = free_raw_hwp_pages(page, false);
 		freeit = !!TestClearPageHWPoison(p);
 
 		put_page(page);
@@ -2354,7 +2373,7 @@  int unpoison_memory(unsigned long pfn)
 unlock_mutex:
 	mutex_unlock(&mf_mutex);
 	if (!ret || freeit) {
-		num_poisoned_pages_dec();
+		num_poisoned_pages_sub(count);
 		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
 				 page_to_pfn(p), &unpoison_rs);
 	}