Message ID | 20240314083921.1146937-1-zhaoyang.huang@unisoc.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | mm: fix a race scenario in folio_isolate_lru | expand |
On Thu, Mar 14, 2024 at 4:39 PM zhaoyang.huang <zhaoyang.huang@unisoc.com> wrote: > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com> > > Panic[1] reported which is caused by lruvec->list break. Fix the race > between folio_isolate_lru and release_pages. > > race condition: > release_pages could meet a non-refered folio which escaped from being > deleted from LRU but add to another list_head > #0 folio_isolate_lru #1 release_pages > if (folio_test_clear_lru()) > if (folio_put_testzero()) > if (!folio_test_lru()) > <failed to delete folio from LRU> > folio_get(folio) > list_add(&folio->lru,) > list_del(&folio->lru,) > > fix action 1: > have folio_get prior to folio_test_clear_lru is not enough as there > could be concurrent folio_put(filemap_remove_folios) to make > release_pages pass refcnt check and failed in delete from LRU > > #0 folio_isolate_lru #1 release_pages > folio_get(folio) > if (folio_test_clear_lru()) > if (folio_put_testzero()) > if (!folio_test_lru()) > <failed to delete folio from LRU> > list_add(&folio->lru,) > list_del(&folio->lru,) correct the timing description of fix action 1, that is moving folio_get prior to folio_test_clear_lru, which can't guarantee it happens before folio_put_testzero of release_pages #0 folio_isolate_lru #1 release_pages BUG_ON(!folio_refcnt) if (folio_put_testzero()) folio_get(folio) if (folio_test_clear_lru()) if (!folio_test_lru()) <failed to delete folio from LRU> list_add(&folio->lru,) list_del(&folio->lru,) > > fix action 2: > folio_test_clear_lru should be considered as part of critical section of > lruvec which require be within lruvec->lock. 
> > #0 folio_isolate_lru #1 release_pages > spin_lock(lruvec->lock) > folio_get(folio) > if (folio_test_clear_lru()) > list_del(&folio->lru,) > <delete folio from LRU> > spin_unlock(lruvec->lock) spin_lock(lruvec->lock) > if (folio_put_testzero()) > if (!folio_test_lru()) > list_add(&folio->lru,) > > [1] > [ 37.562326] pc : __list_del_entry_valid_or_report+0xec/0xf0 > [ 37.562344] lr : __list_del_entry_valid_or_report+0xec/0xf0 > [ 37.562351] sp : ffffffc085953990 > [ 37.562355] x29: ffffffc0859539d0 x28: ffffffc082144000 x27: 000000000000000f > [ 37.562367] x26: 000000000000000d x25: 000000000000000d x24: 00000000ffffffff > [ 37.562377] x23: ffffffc085953a08 x22: ffffff8080389000 x21: ffffff8080389000 > [ 37.562388] x20: fffffffe05c54180 x19: ffffffc085953b30 x18: ffffffc085989098 > [ 37.562399] x17: 20747562202c3838 x16: ffffffffffffffff x15: 0000000000000004 > [ 37.562409] x14: ffffff8176980000 x13: 0000000000007fff x12: 0000000000000003 > [ 37.562420] x11: 00000000ffff7fff x10: ffffffc0820f51c4 x9 : 53b71233d5d50e00 > [ 37.562431] x8 : 53b71233d5d50e00 x7 : ffffffc081161ff0 x6 : 0000000000000000 > [ 37.562441] x5 : 0000000000000001 x4 : 0000000000000001 x3 : 0000000000000000 > [ 37.562451] x2 : ffffff817f2c4178 x1 : ffffff817f2b71c8 x0 : 000000000000006d > [ 37.562461] Call trace: > [ 37.562465] __list_del_entry_valid_or_report+0xec/0xf0 > [ 37.562472] release_pages+0x410/0x4c0 > [ 37.562482] __folio_batch_release+0x34/0x4c > [ 37.562490] truncate_inode_pages_range+0x368/0x63c > [ 37.562497] truncate_inode_pages+0x14/0x24 > [ 37.562504] blkdev_flush_mapping+0x60/0x120 > [ 37.562513] blkdev_put+0x114/0x298 > [ 37.562520] blkdev_release+0x28/0x40 > [ 37.562526] __fput+0xf8/0x2a8 > [ 37.562533] ____fput+0x10/0x20 > [ 37.562539] task_work_run+0xc4/0xec > [ 37.562546] do_exit+0x32c/0xa3c > [ 37.562554] do_group_exit+0x98/0x9c > [ 37.562561] __arm64_sys_exit_group+0x18/0x1c > [ 37.562568] invoke_syscall+0x58/0x114 > [ 37.562575] el0_svc_common+0xac/0xe0 > [ 
37.562582] do_el0_svc+0x1c/0x28 > [ 37.562588] el0_svc+0x50/0xe4 > [ 37.562593] el0t_64_sync_handler+0x68/0xbc > [ 37.562599] el0t_64_sync+0x1a8/0x1ac > > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com> > --- > mm/swap.c | 25 +++++++++++++++++-------- > mm/vmscan.c | 25 +++++++++++++++++++------ > 2 files changed, 36 insertions(+), 14 deletions(-) > > diff --git a/mm/swap.c b/mm/swap.c > index cd8f0150ba3a..287cf7379927 100644 > --- a/mm/swap.c > +++ b/mm/swap.c > @@ -968,6 +968,7 @@ void release_pages(release_pages_arg arg, int nr) > > for (i = 0; i < nr; i++) { > struct folio *folio; > + struct lruvec *prev_lruvec; > > /* Turn any of the argument types into a folio */ > folio = page_folio(encoded_page_ptr(encoded[i])); > @@ -996,9 +997,24 @@ void release_pages(release_pages_arg arg, int nr) > free_zone_device_page(&folio->page); > continue; > } > + /* > + * lruvec->lock need to be prior to folio_put_testzero to > + * prevent race with folio_isolate_lru > + */ > + prev_lruvec = lruvec; > + lruvec = folio_lruvec_relock_irqsave(folio, lruvec, > + &flags); > + > + if (prev_lruvec != lruvec) > + lock_batch = 0; > > - if (!folio_put_testzero(folio)) > + if (!folio_put_testzero(folio)) { > + if (lruvec) { > + unlock_page_lruvec_irqrestore(lruvec, flags); > + lruvec = NULL; > + } > continue; > + } > > if (folio_test_large(folio)) { > if (lruvec) { > @@ -1010,13 +1026,6 @@ void release_pages(release_pages_arg arg, int nr) > } > > if (folio_test_lru(folio)) { > - struct lruvec *prev_lruvec = lruvec; > - > - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, > - &flags); > - if (prev_lruvec != lruvec) > - lock_batch = 0; > - > lruvec_del_folio(lruvec, folio); > __folio_clear_lru_flags(folio); > } > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 4255619a1a31..13a4a716c67a 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -1721,18 +1721,31 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, > bool folio_isolate_lru(struct folio *folio) > { > bool 
ret = false; > + struct lruvec *lruvec; > > VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); > > - if (folio_test_clear_lru(folio)) { > - struct lruvec *lruvec; > + /* > + * The folio_get needs to be prior to clear lru for list integrity. > + * Otherwise: > + * #0 folio_isolate_lru #1 release_pages > + * if (folio_test_clear_lru()) > + * if (folio_put_testzero()) > + * if (!folio_test_lru()) > + * <failed to del folio from LRU> > + * folio_get(folio) > + * list_add(&folio->lru,) > + * list_del(&folio->lru,) > + */ > + lruvec = folio_lruvec_lock_irq(folio); > + folio_get(folio); > > - folio_get(folio); > - lruvec = folio_lruvec_lock_irq(folio); > + if (folio_test_clear_lru(folio)) { > lruvec_del_folio(lruvec, folio); > - unlock_page_lruvec_irq(lruvec); > ret = true; > - } > + } else > + folio_put(folio); > + unlock_page_lruvec_irq(lruvec); > > return ret; > } > -- > 2.25.1 >
On Thu, Mar 14, 2024 at 04:39:21PM +0800, zhaoyang.huang wrote: > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com> > > Panic[1] reported which is caused by lruvec->list break. Fix the race > between folio_isolate_lru and release_pages. > > race condition: > release_pages could meet a non-referred folio which escaped from being > deleted from LRU but add to another list_head I don't think the bug is in folio_isolate_lru() but rather in its caller. * Context: * * (1) Must be called with an elevated refcount on the folio. This is a * fundamental difference from isolate_lru_folios() (which is called * without a stable reference). So when release_pages() runs, it must not see a refcount decremented to zero, because the caller of folio_isolate_lru() is supposed to hold one. Your stack trace is for the thread which is calling release_pages(), not the one calling folio_isolate_lru(), so I can't help you debug further.
On Fri, Mar 15, 2024 at 8:46 PM Matthew Wilcox <willy@infradead.org> wrote: > > On Thu, Mar 14, 2024 at 04:39:21PM +0800, zhaoyang.huang wrote: > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com> > > > > Panic[1] reported which is caused by lruvec->list break. Fix the race > > between folio_isolate_lru and release_pages. > > > > race condition: > > release_pages could meet a non-referred folio which escaped from being > > deleted from LRU but add to another list_head > > I don't think the bug is in folio_isolate_lru() but rather in its > caller. > > * Context: > * > * (1) Must be called with an elevated refcount on the folio. This is a > * fundamental difference from isolate_lru_folios() (which is called > * without a stable reference). > > So when release_pages() runs, it must not see a refcount decremented to > zero, because the caller of folio_isolate_lru() is supposed to hold one. > > Your stack trace is for the thread which is calling release_pages(), not > the one calling folio_isolate_lru(), so I can't help you debug further. Thanks for the comments. According to my understanding, folio_put_testzero does the decrement before the test, which makes it possible to have release_pages see refcnt equal zero and proceed further (folio_get in folio_isolate_lru has not run yet). #0 folio_isolate_lru #1 release_pages BUG_ON(!folio_refcnt) if (folio_put_testzero()) folio_get(folio) if (folio_test_clear_lru())
On Sat, Mar 16, 2024 at 04:53:09PM +0800, Zhaoyang Huang wrote: > On Fri, Mar 15, 2024 at 8:46 PM Matthew Wilcox <willy@infradead.org> wrote: > > > > On Thu, Mar 14, 2024 at 04:39:21PM +0800, zhaoyang.huang wrote: > > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com> > > > > > > Panic[1] reported which is caused by lruvec->list break. Fix the race > > > between folio_isolate_lru and release_pages. > > > > > > race condition: > > > release_pages could meet a non-refered folio which escaped from being > > > deleted from LRU but add to another list_head > > > > I don't think the bug is in folio_isolate_lru() but rather in its > > caller. > > > > * Context: > > * > > * (1) Must be called with an elevated refcount on the folio. This is a > > * fundamental difference from isolate_lru_folios() (which is called > > * without a stable reference). > > > > So when release_pages() runs, it must not see a refcount decremented to > > zero, because the caller of folio_isolate_lru() is supposed to hold one. > > > > Your stack trace is for the thread which is calling release_pages(), not > > the one calling folio_isolate_lru(), so I can't help you debug further. > Thanks for the comments. According to my understanding, > folio_put_testzero does the decrement before test which makes it > possible to have release_pages see refcnt equal zero and proceed > further(folio_get in folio_isolate_lru has not run yet). No, that's not possible. In the scenario below, at entry to folio_isolate_lru(), the folio has refcount 2. It has one refcount from thread 0 (because it must own one before calling folio_isolate_lru()) and it has one refcount from thread 1 (because it's about to call release_pages()). If release_pages() were not running, the folio would have refcount 3 when folio_isolate_lru() returned. > #0 folio_isolate_lru #1 release_pages > BUG_ON(!folio_refcnt) > if (folio_put_testzero()) > folio_get(folio) > if (folio_test_clear_lru())
On Sat, Mar 16, 2024 at 10:59 PM Matthew Wilcox <willy@infradead.org> wrote: > > On Sat, Mar 16, 2024 at 04:53:09PM +0800, Zhaoyang Huang wrote: > > On Fri, Mar 15, 2024 at 8:46 PM Matthew Wilcox <willy@infradead.org> wrote: > > > > > > On Thu, Mar 14, 2024 at 04:39:21PM +0800, zhaoyang.huang wrote: > > > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com> > > > > > > > > Panic[1] reported which is caused by lruvec->list break. Fix the race > > > > between folio_isolate_lru and release_pages. > > > > > > > > race condition: > > > > release_pages could meet a non-refered folio which escaped from being > > > > deleted from LRU but add to another list_head > > > > > > I don't think the bug is in folio_isolate_lru() but rather in its > > > caller. > > > > > > * Context: > > > * > > > * (1) Must be called with an elevated refcount on the folio. This is a > > > * fundamental difference from isolate_lru_folios() (which is called > > > * without a stable reference). > > > > > > So when release_pages() runs, it must not see a refcount decremented to > > > zero, because the caller of folio_isolate_lru() is supposed to hold one. > > > > > > Your stack trace is for the thread which is calling release_pages(), not > > > the one calling folio_isolate_lru(), so I can't help you debug further. > > Thanks for the comments. According to my understanding, > > folio_put_testzero does the decrement before test which makes it > > possible to have release_pages see refcnt equal zero and proceed > > further(folio_get in folio_isolate_lru has not run yet). > > No, that's not possible. > > In the scenario below, at entry to folio_isolate_lru(), the folio has > refcount 2. It has one refcount from thread 0 (because it must own one > before calling folio_isolate_lru()) and it has one refcount from thread 1 > (because it's about to call release_pages()). If release_pages() were > not running, the folio would have refcount 3 when folio_isolate_lru() > returned. 
Could it be this scenario, where folio comes from pte(thread 0), local fbatch(thread 1) and page cache(thread 2) concurrently and proceed intermixed without lock's protection? Actually, IMO, thread 1 also could see the folio with refcnt==1 since it doesn't care if the page is on the page cache or not. madivise_cold_and_pageout does no explicit folio_get thing since the folio comes from pte which implies it has one refcnt from pagecache #thread 0(madivise_cold_and_pageout) #1 (lru_add_drain->fbatch_release_pages) #2(read_pages->filemap_remove_folios) refcnt == 1(represent page cache) refcnt==2(another one represent LRU) folio comes from page cache folio_isolate_lru release_pages filemap_free_folio refcnt==1(decrease the one of page cache) folio_put_testzero == true <No lruvec_del_folio> list_add(folio->lru, pages_to_free) //current folio will break LRU's integrity since it has not been deleted In case of gmail's wrap, split above chart to two parts #thread 0(madivise_cold_and_pageout) #1 (lru_add_drain->fbatch_release_pages) refcnt == 1(represent page cache) refcnt==2(another one represent LRU) folio_isolate_lru release_pages folio_put_testzero == true <No lruvec_del_folio> list_add(folio->lru, pages_to_free) //current folio will break LRU's integrity since it has not been deleted #1 (lru_add_drain->fbatch_release_pages) #2(read_pages->filemap_remove_folios) refcnt==2(another one represent LRU) folio comes from page cache release_pages filemap_free_folio refcnt==1(decrease the one of page cache) folio_put_testzero == true <No lruvec_del_folio> list_add(folio->lru, pages_to_free) //current folio will break LRU's integrity since it has not been deleted > > > #0 folio_isolate_lru #1 release_pages > > BUG_ON(!folio_refcnt) > > if (folio_put_testzero()) > > folio_get(folio) > > if (folio_test_clear_lru())
On Sun, Mar 17, 2024 at 12:07:40PM +0800, Zhaoyang Huang wrote: > Could it be this scenario, where folio comes from pte(thread 0), local > fbatch(thread 1) and page cache(thread 2) concurrently and proceed > intermixed without lock's protection? Actually, IMO, thread 1 also > could see the folio with refcnt==1 since it doesn't care if the page > is on the page cache or not. > > madivise_cold_and_pageout does no explicit folio_get thing since the > folio comes from pte which implies it has one refcnt from pagecache Mmm, no. It's implicit, but madvise_cold_or_pageout_pte_range() does guarantee that the folio has at least one refcount. Since we get the folio from vm_normal_folio(vma, addr, ptent); we know that there is at least one mapcount on the folio. refcount is always >= mapcount. Since we hold pte_offset_map_lock(), we know that mapcount (and therefore refcount) cannot be decremented until we call pte_unmap_unlock(), which we don't do until we have called folio_isolate_lru(). Good try though, took me a few minutes of looking at it to convince myself that it was safe. Something to bear in mind is that if the race you outline is real, failing to hold a refcount on the folio leaves the caller susceptible to the VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); if the other thread calls folio_put(). I can't understand any of the scenarios you outline below. Please try again without relying on indentation. 
> #thread 0(madivise_cold_and_pageout) #1 > (lru_add_drain->fbatch_release_pages) > #2(read_pages->filemap_remove_folios) > refcnt == 1(represent page cache) > > refcnt==2(another one represent LRU) > folio comes from page cache > folio_isolate_lru > release_pages > filemap_free_folio > > > refcnt==1(decrease the one of page cache) > > folio_put_testzero == true > > <No lruvec_del_folio> > > list_add(folio->lru, pages_to_free) //current folio will break LRU's > integrity since it has not been deleted > > In case of gmail's wrap, split above chart to two parts > > #thread 0(madivise_cold_and_pageout) #1 > (lru_add_drain->fbatch_release_pages) > refcnt == 1(represent page cache) > > refcnt==2(another one represent LRU) > folio_isolate_lru release_pages > > folio_put_testzero == true > > <No lruvec_del_folio> > > list_add(folio->lru, pages_to_free) > > //current folio will break LRU's integrity since it has not been > deleted > > #1 (lru_add_drain->fbatch_release_pages) > #2(read_pages->filemap_remove_folios) > refcnt==2(another one represent LRU) > folio comes from page cache > release_pages > filemap_free_folio > > refcnt==1(decrease the one of page cache) > folio_put_testzero == true > <No lruvec_del_folio> > list_add(folio->lru, pages_to_free) > //current folio will break LRU's integrity since it has not been deleted > > > > > #0 folio_isolate_lru #1 release_pages > > > BUG_ON(!folio_refcnt) > > > if (folio_put_testzero()) > > > folio_get(folio) > > > if (folio_test_clear_lru())
diff --git a/mm/swap.c b/mm/swap.c index cd8f0150ba3a..287cf7379927 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -968,6 +968,7 @@ void release_pages(release_pages_arg arg, int nr) for (i = 0; i < nr; i++) { struct folio *folio; + struct lruvec *prev_lruvec; /* Turn any of the argument types into a folio */ folio = page_folio(encoded_page_ptr(encoded[i])); @@ -996,9 +997,24 @@ void release_pages(release_pages_arg arg, int nr) free_zone_device_page(&folio->page); continue; } + /* + * lruvec->lock need to be prior to folio_put_testzero to + * prevent race with folio_isolate_lru + */ + prev_lruvec = lruvec; + lruvec = folio_lruvec_relock_irqsave(folio, lruvec, + &flags); + + if (prev_lruvec != lruvec) + lock_batch = 0; - if (!folio_put_testzero(folio)) + if (!folio_put_testzero(folio)) { + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; + } continue; + } if (folio_test_large(folio)) { if (lruvec) { @@ -1010,13 +1026,6 @@ void release_pages(release_pages_arg arg, int nr) } if (folio_test_lru(folio)) { - struct lruvec *prev_lruvec = lruvec; - - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, - &flags); - if (prev_lruvec != lruvec) - lock_batch = 0; - lruvec_del_folio(lruvec, folio); __folio_clear_lru_flags(folio); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 4255619a1a31..13a4a716c67a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1721,18 +1721,31 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, bool folio_isolate_lru(struct folio *folio) { bool ret = false; + struct lruvec *lruvec; VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); - if (folio_test_clear_lru(folio)) { - struct lruvec *lruvec; + /* + * The folio_get needs to be prior to clear lru for list integrity. 
+ * Otherwise: + * #0 folio_isolate_lru #1 release_pages + * if (folio_test_clear_lru()) + * if (folio_put_testzero()) + * if (!folio_test_lru()) + * <failed to del folio from LRU> + * folio_get(folio) + * list_add(&folio->lru,) + * list_del(&folio->lru,) + */ + lruvec = folio_lruvec_lock_irq(folio); + folio_get(folio); - folio_get(folio); - lruvec = folio_lruvec_lock_irq(folio); + if (folio_test_clear_lru(folio)) { lruvec_del_folio(lruvec, folio); - unlock_page_lruvec_irq(lruvec); ret = true; - } + } else + folio_put(folio); + unlock_page_lruvec_irq(lruvec); return ret; }