Message ID | 20210614021212.223326-2-nao.horiguchi@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | mm/hwpoison: fix unpoison_memory() | expand |
On 2021/6/14 10:12, Naoya Horiguchi wrote: > > @@ -2171,6 +2177,8 @@ int soft_offline_page(unsigned long pfn, int flags) > return -EIO; > } > > + mutex_lock(&mf_mutex); > + > if (PageHWPoison(page)) { > pr_info("%s: %#lx page already poisoned\n", __func__, pfn); > put_ref_page(ref_page); Did you miss mutex_unlock() here when page already poisoned ? > @@ -2194,5 +2202,7 @@ int soft_offline_page(unsigned long pfn, int flags) > __func__, pfn, page->flags, &page->flags); > } > > + mutex_unlock(&mf_mutex); > + > return ret; > } > -- Thanks, - Ding Hui
On Tue, Jun 15, 2021 at 07:41:34PM +0800, Ding Hui wrote: > On 2021/6/14 10:12, Naoya Horiguchi wrote: > > > @@ -2171,6 +2177,8 @@ int soft_offline_page(unsigned long pfn, int flags) > > return -EIO; > > } > > + mutex_lock(&mf_mutex); > > + > > if (PageHWPoison(page)) { > > pr_info("%s: %#lx page already poisoned\n", __func__, pfn); > > put_ref_page(ref_page); > > Did you miss mutex_unlock() here when page already poisoned ? Thanks, you're right. I'll fix it. - Naoya > > > @@ -2194,5 +2202,7 @@ int soft_offline_page(unsigned long pfn, int flags) > > __func__, pfn, page->flags, &page->flags); > > } > > + mutex_unlock(&mf_mutex); > > + > > return ret; > > } > > > > -- > Thanks, > - Ding Hui >
Hi: On 2021/6/14 10:12, Naoya Horiguchi wrote: > From: Naoya Horiguchi <naoya.horiguchi@nec.com> > > Originally mf_mutex is introduced to serialize multiple MCE events, but > it's also helpful to exclude races among soft_offline_page() and > unpoison_memory(). So apply mf_mutex to them. > When I was investigating the memory-failure code, I realized these possible races too. It's very kind of you to fix these races! Many thanks! > Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com> > --- > mm/memory-failure.c | 26 ++++++++++++++++++-------- > 1 file changed, 18 insertions(+), 8 deletions(-) > > diff --git v5.13-rc5/mm/memory-failure.c v5.13-rc5_patched/mm/memory-failure.c > index ae30fd6d575a..280eb6d6dd15 100644 > --- v5.13-rc5/mm/memory-failure.c > +++ v5.13-rc5_patched/mm/memory-failure.c > @@ -1583,6 +1583,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, > return rc; > } > > +static DEFINE_MUTEX(mf_mutex); > + > /** > * memory_failure - Handle memory failure of a page. > * @pfn: Page Number of the corrupted page > @@ -1609,7 +1611,6 @@ int memory_failure(unsigned long pfn, int flags) > int res = 0; > unsigned long page_flags; > bool retry = true; > - static DEFINE_MUTEX(mf_mutex); > > if (!sysctl_memory_failure_recovery) > panic("Memory failure on page %lx", pfn); > @@ -1918,6 +1919,7 @@ int unpoison_memory(unsigned long pfn) > struct page *page; > struct page *p; > int freeit = 0; > + int ret = 0; > unsigned long flags = 0; > static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, > DEFAULT_RATELIMIT_BURST); > @@ -1928,28 +1930,30 @@ int unpoison_memory(unsigned long pfn) > p = pfn_to_page(pfn); > page = compound_head(p); > > + mutex_lock(&mf_mutex); > + > if (!PageHWPoison(p)) { > unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", > pfn, &unpoison_rs); > - return 0; > + goto unlock_mutex; > } > > if (page_count(page) > 1) { > unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", > pfn, &unpoison_rs); > - return 0; > + goto unlock_mutex; > } > > if (page_mapped(page)) { > unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", > pfn, &unpoison_rs); > - return 0; > + goto unlock_mutex; > } > > if (page_mapping(page)) { > unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", > pfn, &unpoison_rs); > - return 0; > + goto unlock_mutex; > } > > /* > @@ -1960,7 +1964,7 @@ int unpoison_memory(unsigned long pfn) > if (!PageHuge(page) && PageTransHuge(page)) { > unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n", > pfn, &unpoison_rs); > - return 0; > + goto unlock_mutex; > } > Maybe it's more appropriate to start mutex_lock(&mf_mutex) here? I think these races start here. > if (!get_hwpoison_page(p, flags)) { > @@ -1968,7 +1972,7 @@ int unpoison_memory(unsigned long pfn) > num_poisoned_pages_dec(); > unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", > pfn, &unpoison_rs); > - return 0; > + goto unlock_mutex; > } > > lock_page(page); > @@ -1990,7 +1994,9 @@ int unpoison_memory(unsigned long pfn) > if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) > put_page(page); > > - return 0; > +unlock_mutex: > + mutex_unlock(&mf_mutex); > + return ret; > } > EXPORT_SYMBOL(unpoison_memory); > > @@ -2171,6 +2177,8 @@ int soft_offline_page(unsigned long pfn, int flags) > return -EIO; > } > > + mutex_lock(&mf_mutex); > + > if (PageHWPoison(page)) { > pr_info("%s: %#lx page already poisoned\n", __func__, pfn); > put_ref_page(ref_page); > @@ -2194,5 +2202,7 @@ int soft_offline_page(unsigned long pfn, int flags) > __func__, pfn, page->flags, &page->flags); > } > > + mutex_unlock(&mf_mutex); > + > return ret; > } >
On Tue, Jun 15, 2021 at 08:42:23PM +0800, Miaohe Lin wrote: ... > > @@ -1960,7 +1964,7 @@ int unpoison_memory(unsigned long pfn) > > if (!PageHuge(page) && PageTransHuge(page)) { > > unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n", > > pfn, &unpoison_rs); > > - return 0; > > + goto unlock_mutex; > > } > > > > Maybe it's more appropriate to start mutex_lock(&mf_mutex) here? I think these races start here. Hi Miaohe, Thank your for the review. Consider that we put mutex_lock() here, and let's think about two concurrent calls of unpoison_memory(), then these events could be processed like below: CPU 0 CPU 1 unpoison_memory check PageHWPoison // true unpoison_memory check PageHWPoison // true mutex_lock get_hwpoison_page TestClearPageHWPoison put_page put_page // freed mutex_unlock // the unpoisoned page can be used for allocation mutex_lock get_hwpoison_page // succeeds ... // unpoison the !PageHWPoison page !? So I thought that we had better do the prechecks in mf_mutex. Maybe the 2nd unpoison_memory() just get and put the page refcount by 1 even in this race, so the impact is not so big, but I feel like avoiding "unpoison the !PageHWPoison page" situation. Does it make sense for you? Thanks, Naoya Horiguchi
On 2021/6/16 8:41, HORIGUCHI NAOYA(堀口 直也) wrote: > On Tue, Jun 15, 2021 at 08:42:23PM +0800, Miaohe Lin wrote: > ... >>> @@ -1960,7 +1964,7 @@ int unpoison_memory(unsigned long pfn) >>> if (!PageHuge(page) && PageTransHuge(page)) { >>> unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n", >>> pfn, &unpoison_rs); >>> - return 0; >>> + goto unlock_mutex; >>> } >>> >> >> Maybe it's more appropriate to start mutex_lock(&mf_mutex) here? I think these races start here. > > Hi Miaohe, > > Thank your for the review. > > Consider that we put mutex_lock() here, and let's think about two concurrent > calls of unpoison_memory(), then these events could be processed like below: > > CPU 0 CPU 1 > unpoison_memory > check PageHWPoison // true > unpoison_memory > check PageHWPoison // true > mutex_lock > get_hwpoison_page > TestClearPageHWPoison > put_page > put_page // freed > mutex_unlock > > // the unpoisoned page can be used for allocation > > mutex_lock > get_hwpoison_page // succeeds > ... // unpoison the !PageHWPoison page !? > > > So I thought that we had better do the prechecks in mf_mutex. Maybe the 2nd > unpoison_memory() just get and put the page refcount by 1 even in this race, > so the impact is not so big, but I feel like avoiding "unpoison the > !PageHWPoison page" situation. > > Does it make sense for you? Yes, I missed the races inside unpoison_memory() itself. Many thanks for your detailed explanation! > > Thanks, > Naoya Horiguchi >
diff --git v5.13-rc5/mm/memory-failure.c v5.13-rc5_patched/mm/memory-failure.c index ae30fd6d575a..280eb6d6dd15 100644 --- v5.13-rc5/mm/memory-failure.c +++ v5.13-rc5_patched/mm/memory-failure.c @@ -1583,6 +1583,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, return rc; } +static DEFINE_MUTEX(mf_mutex); + /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@ -1609,7 +1611,6 @@ int memory_failure(unsigned long pfn, int flags) int res = 0; unsigned long page_flags; bool retry = true; - static DEFINE_MUTEX(mf_mutex); if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -1918,6 +1919,7 @@ int unpoison_memory(unsigned long pfn) struct page *page; struct page *p; int freeit = 0; + int ret = 0; unsigned long flags = 0; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1928,28 +1930,30 @@ int unpoison_memory(unsigned long pfn) p = pfn_to_page(pfn); page = compound_head(p); + mutex_lock(&mf_mutex); + if (!PageHWPoison(p)) { unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_count(page) > 1) { unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_mapped(page)) { unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_mapping(page)) { unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } /* @@ -1960,7 +1964,7 @@ int unpoison_memory(unsigned long pfn) if (!PageHuge(page) && PageTransHuge(page)) { unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (!get_hwpoison_page(p, flags)) { @@ -1968,7 +1972,7 @@ int unpoison_memory(unsigned long pfn) num_poisoned_pages_dec(); unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } lock_page(page); @@ -1990,7 +1994,9 @@ int unpoison_memory(unsigned long pfn) if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) put_page(page); - return 0; +unlock_mutex: + mutex_unlock(&mf_mutex); + return ret; } EXPORT_SYMBOL(unpoison_memory); @@ -2171,6 +2177,8 @@ int soft_offline_page(unsigned long pfn, int flags) return -EIO; } + mutex_lock(&mf_mutex); + if (PageHWPoison(page)) { pr_info("%s: %#lx page already poisoned\n", __func__, pfn); put_ref_page(ref_page); @@ -2194,5 +2202,7 @@ int soft_offline_page(unsigned long pfn, int flags) __func__, pfn, page->flags, &page->flags); } + mutex_unlock(&mf_mutex); + return ret; }