diff mbox series

[v4] fs/proc: task_mmu.c: don't read mapcount for migration entry

Message ID 20220203182641.824731-1-shy828301@gmail.com (mailing list archive)
State New
Headers show
Series [v4] fs/proc: task_mmu.c: don't read mapcount for migration entry | expand

Commit Message

Yang Shi Feb. 3, 2022, 6:26 p.m. UTC
The syzbot reported the below BUG:

kernel BUG at include/linux/page-flags.h:785!
invalid opcode: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 4392 Comm: syz-executor560 Not tainted 5.16.0-rc6-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <TASK>
 page_mapcount include/linux/mm.h:837 [inline]
 smaps_account+0x470/0xb10 fs/proc/task_mmu.c:466
 smaps_pte_entry fs/proc/task_mmu.c:538 [inline]
 smaps_pte_range+0x611/0x1250 fs/proc/task_mmu.c:601
 walk_pmd_range mm/pagewalk.c:128 [inline]
 walk_pud_range mm/pagewalk.c:205 [inline]
 walk_p4d_range mm/pagewalk.c:240 [inline]
 walk_pgd_range mm/pagewalk.c:277 [inline]
 __walk_page_range+0xe23/0x1ea0 mm/pagewalk.c:379
 walk_page_vma+0x277/0x350 mm/pagewalk.c:530
 smap_gather_stats.part.0+0x148/0x260 fs/proc/task_mmu.c:768
 smap_gather_stats fs/proc/task_mmu.c:741 [inline]
 show_smap+0xc6/0x440 fs/proc/task_mmu.c:822
 seq_read_iter+0xbb0/0x1240 fs/seq_file.c:272
 seq_read+0x3e0/0x5b0 fs/seq_file.c:162
 vfs_read+0x1b5/0x600 fs/read_write.c:479
 ksys_read+0x12d/0x250 fs/read_write.c:619
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7faa2af6c969
Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 11 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007faa2aefd288 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 00007faa2aff4418 RCX: 00007faa2af6c969
RDX: 0000000000002025 RSI: 0000000020000100 RDI: 0000000000000003
RBP: 00007faa2aff4410 R08: 00007faa2aefd700 R09: 0000000000000000
R10: 00007faa2aefd700 R11: 0000000000000246 R12: 00007faa2afc20ac
R13: 00007fff7e6632bf R14: 00007faa2aefd400 R15: 0000000000022000
 </TASK>
Modules linked in:
---[ end trace 24ec93ff95e4ac3d ]---
RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400

The reproducer was trying to reading /proc/$PID/smaps when calling
MADV_FREE at the mean time.  MADV_FREE may split THPs if it is called
for partial THP.  It may trigger the below race:

         CPU A                         CPU B
         -----                         -----
smaps walk:                      MADV_FREE:
page_mapcount()
  PageCompound()
                                 split_huge_page()
  page = compound_head(page)
  PageDoubleMap(page)

When calling PageDoubleMap() this page is not a tail page of THP anymore
so the BUG is triggered.

This could be fixed by elevated refcount of the page before calling
mapcount, but it prevents from counting migration entries, and it seems
overkilling because the race just could happen when PMD is split so all
PTE entries of tail pages are actually migration entries, and
smaps_account() does treat migration entries as mapcount == 1 as Kirill
pointed out.

Add a new parameter for smaps_account() to tell this entry is migration
entry then skip calling page_mapcount().  Don't skip getting mapcount for
device private entries since they do track references with mapcount.

Pagemap also has the similar issue although it was not reported.  Fixed
it as well.

Fixes: e9b61f19858a ("thp: reintroduce split_huge_page()")
Reported-by: syzbot+1f52b3a18d5633fa7f82@syzkaller.appspotmail.com
Acked-by: David Hildenbrand <david@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Yang Shi <shy828301@gmail.com>
---
v4: * s/Treated/Treat per David
    * Collected acked-by tag from David
v3: * Fixed the fix tag, the one used by v2 was not accurate
    * Added comment about the risk calling page_mapcount() per David
    * Fix pagemap
v2: * Added proper fix tag per Jann Horn
    * Rebased to the latest linus's tree

 fs/proc/task_mmu.c | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

Comments

Andrew Morton Feb. 3, 2022, 10:12 p.m. UTC | #1
On Thu,  3 Feb 2022 10:26:41 -0800 Yang Shi <shy828301@gmail.com> wrote:

> v4: * s/Treated/Treat per David
>     * Collected acked-by tag from David
> v3: * Fixed the fix tag, the one used by v2 was not accurate
>     * Added comment about the risk calling page_mapcount() per David
>     * Fix pagemap
> v2: * Added proper fix tag per Jann Horn
>     * Rebased to the latest linus's tree

The v2->v4 delta shows changes which aren't described above?

--- a/fs/proc/task_mmu.c~fs-proc-task_mmuc-dont-read-mapcount-for-migration-entry-v4
+++ a/fs/proc/task_mmu.c
@@ -469,9 +469,12 @@ static void smaps_account(struct mem_siz
 	 * If any subpage of the compound page mapped with PTE it would elevate
 	 * page_count().
 	 *
-	 * Treated regular migration entries as mapcount == 1 without reading
-	 * mapcount since calling page_mapcount() for migration entries is
-	 * racy against THP splitting.
+	 * The page_mapcount() is called to get a snapshot of the mapcount.
+	 * Without holding the page lock this snapshot can be slightly wrong as
+	 * we cannot always read the mapcount atomically.  It is not safe to
+	 * call page_mapcount() even with PTL held if the page is not mapped,
+	 * especially for migration entries.  Treat regular migration entries
+	 * as mapcount == 1.
 	 */
 	if ((page_count(page) == 1) || migration) {
 		smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
@@ -1393,6 +1396,7 @@ static pagemap_entry_t pte_to_pagemap_en
 {
 	u64 frame = 0, flags = 0;
 	struct page *page = NULL;
+	bool migration = false;
 
 	if (pte_present(pte)) {
 		if (pm->show_pfn)
@@ -1414,13 +1418,14 @@ static pagemap_entry_t pte_to_pagemap_en
 			frame = swp_type(entry) |
 				(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
 		flags |= PM_SWAP;
+		migration = is_migration_entry(entry);
 		if (is_pfn_swap_entry(entry))
 			page = pfn_swap_entry_to_page(entry);
 	}
 
 	if (page && !PageAnon(page))
 		flags |= PM_FILE;
-	if (page && page_mapcount(page) == 1)
+	if (page && !migration && page_mapcount(page) == 1)
 		flags |= PM_MMAP_EXCLUSIVE;
 	if (vma->vm_flags & VM_SOFTDIRTY)
 		flags |= PM_SOFT_DIRTY;
@@ -1436,6 +1441,7 @@ static int pagemap_pmd_range(pmd_t *pmdp
 	spinlock_t *ptl;
 	pte_t *pte, *orig_pte;
 	int err = 0;
+	bool migration = false;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	ptl = pmd_trans_huge_lock(pmdp, vma);
@@ -1476,11 +1482,12 @@ static int pagemap_pmd_range(pmd_t *pmdp
 			if (pmd_swp_uffd_wp(pmd))
 				flags |= PM_UFFD_WP;
 			VM_BUG_ON(!is_pmd_migration_entry(pmd));
+			migration = is_migration_entry(entry);
 			page = pfn_swap_entry_to_page(entry);
 		}
 #endif
 
-		if (page && page_mapcount(page) == 1)
+		if (page && !migration && page_mapcount(page) == 1)
 			flags |= PM_MMAP_EXCLUSIVE;
 
 		for (; addr != end; addr += PAGE_SIZE) {
Yang Shi Feb. 3, 2022, 10:18 p.m. UTC | #2
On Thu, Feb 3, 2022 at 2:12 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Thu,  3 Feb 2022 10:26:41 -0800 Yang Shi <shy828301@gmail.com> wrote:
>
> > v4: * s/Treated/Treat per David
> >     * Collected acked-by tag from David
> > v3: * Fixed the fix tag, the one used by v2 was not accurate
> >     * Added comment about the risk calling page_mapcount() per David
> >     * Fix pagemap
> > v2: * Added proper fix tag per Jann Horn
> >     * Rebased to the latest linus's tree
>
> The v2->v4 delta shows changes which aren't described above?

They are.

v4: * s/Treated/Treat per David
      * Collected acked-by tag from David
v3: * Fixed the fix tag, the one used by v2 was not accurate
      * Added comment about the risk calling page_mapcount() per David
      * Fix pagemap

>
> --- a/fs/proc/task_mmu.c~fs-proc-task_mmuc-dont-read-mapcount-for-migration-entry-v4
> +++ a/fs/proc/task_mmu.c
> @@ -469,9 +469,12 @@ static void smaps_account(struct mem_siz
>          * If any subpage of the compound page mapped with PTE it would elevate
>          * page_count().
>          *
> -        * Treated regular migration entries as mapcount == 1 without reading
> -        * mapcount since calling page_mapcount() for migration entries is
> -        * racy against THP splitting.
> +        * The page_mapcount() is called to get a snapshot of the mapcount.
> +        * Without holding the page lock this snapshot can be slightly wrong as
> +        * we cannot always read the mapcount atomically.  It is not safe to
> +        * call page_mapcount() even with PTL held if the page is not mapped,
> +        * especially for migration entries.  Treat regular migration entries
> +        * as mapcount == 1.
>          */
>         if ((page_count(page) == 1) || migration) {
>                 smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
> @@ -1393,6 +1396,7 @@ static pagemap_entry_t pte_to_pagemap_en
>  {
>         u64 frame = 0, flags = 0;
>         struct page *page = NULL;
> +       bool migration = false;
>
>         if (pte_present(pte)) {
>                 if (pm->show_pfn)
> @@ -1414,13 +1418,14 @@ static pagemap_entry_t pte_to_pagemap_en
>                         frame = swp_type(entry) |
>                                 (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
>                 flags |= PM_SWAP;
> +               migration = is_migration_entry(entry);
>                 if (is_pfn_swap_entry(entry))
>                         page = pfn_swap_entry_to_page(entry);
>         }
>
>         if (page && !PageAnon(page))
>                 flags |= PM_FILE;
> -       if (page && page_mapcount(page) == 1)
> +       if (page && !migration && page_mapcount(page) == 1)
>                 flags |= PM_MMAP_EXCLUSIVE;
>         if (vma->vm_flags & VM_SOFTDIRTY)
>                 flags |= PM_SOFT_DIRTY;
> @@ -1436,6 +1441,7 @@ static int pagemap_pmd_range(pmd_t *pmdp
>         spinlock_t *ptl;
>         pte_t *pte, *orig_pte;
>         int err = 0;
> +       bool migration = false;
>
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>         ptl = pmd_trans_huge_lock(pmdp, vma);
> @@ -1476,11 +1482,12 @@ static int pagemap_pmd_range(pmd_t *pmdp
>                         if (pmd_swp_uffd_wp(pmd))
>                                 flags |= PM_UFFD_WP;
>                         VM_BUG_ON(!is_pmd_migration_entry(pmd));
> +                       migration = is_migration_entry(entry);
>                         page = pfn_swap_entry_to_page(entry);
>                 }
>  #endif
>
> -               if (page && page_mapcount(page) == 1)
> +               if (page && !migration && page_mapcount(page) == 1)
>                         flags |= PM_MMAP_EXCLUSIVE;
>
>                 for (; addr != end; addr += PAGE_SIZE) {
> _
>
Vlastimil Babka March 23, 2023, 9:52 a.m. UTC | #3
On 2/3/22 19:26, Yang Shi wrote:
> The syzbot reported the below BUG:
> 
> kernel BUG at include/linux/page-flags.h:785!
> invalid opcode: 0000 [#1] PREEMPT SMP KASAN
> CPU: 1 PID: 4392 Comm: syz-executor560 Not tainted 5.16.0-rc6-syzkaller #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
> RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
> Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
> RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
> RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
> RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
> R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
> R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
> FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
>  <TASK>
>  page_mapcount include/linux/mm.h:837 [inline]
>  smaps_account+0x470/0xb10 fs/proc/task_mmu.c:466
>  smaps_pte_entry fs/proc/task_mmu.c:538 [inline]
>  smaps_pte_range+0x611/0x1250 fs/proc/task_mmu.c:601
>  walk_pmd_range mm/pagewalk.c:128 [inline]
>  walk_pud_range mm/pagewalk.c:205 [inline]
>  walk_p4d_range mm/pagewalk.c:240 [inline]
>  walk_pgd_range mm/pagewalk.c:277 [inline]
>  __walk_page_range+0xe23/0x1ea0 mm/pagewalk.c:379
>  walk_page_vma+0x277/0x350 mm/pagewalk.c:530
>  smap_gather_stats.part.0+0x148/0x260 fs/proc/task_mmu.c:768
>  smap_gather_stats fs/proc/task_mmu.c:741 [inline]
>  show_smap+0xc6/0x440 fs/proc/task_mmu.c:822
>  seq_read_iter+0xbb0/0x1240 fs/seq_file.c:272
>  seq_read+0x3e0/0x5b0 fs/seq_file.c:162
>  vfs_read+0x1b5/0x600 fs/read_write.c:479
>  ksys_read+0x12d/0x250 fs/read_write.c:619
>  do_syscall_x64 arch/x86/entry/common.c:50 [inline]
>  do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
>  entry_SYSCALL_64_after_hwframe+0x44/0xae
> RIP: 0033:0x7faa2af6c969
> Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 11 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
> RSP: 002b:00007faa2aefd288 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
> RAX: ffffffffffffffda RBX: 00007faa2aff4418 RCX: 00007faa2af6c969
> RDX: 0000000000002025 RSI: 0000000020000100 RDI: 0000000000000003
> RBP: 00007faa2aff4410 R08: 00007faa2aefd700 R09: 0000000000000000
> R10: 00007faa2aefd700 R11: 0000000000000246 R12: 00007faa2afc20ac
> R13: 00007fff7e6632bf R14: 00007faa2aefd400 R15: 0000000000022000
>  </TASK>
> Modules linked in:
> ---[ end trace 24ec93ff95e4ac3d ]---
> RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
> RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
> Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
> RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
> RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
> RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
> R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
> R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
> FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> 
> The reproducer was trying to reading /proc/$PID/smaps when calling
> MADV_FREE at the mean time.  MADV_FREE may split THPs if it is called
> for partial THP.  It may trigger the below race:
> 
>          CPU A                         CPU B
>          -----                         -----
> smaps walk:                      MADV_FREE:
> page_mapcount()
>   PageCompound()
>                                  split_huge_page()
>   page = compound_head(page)
>   PageDoubleMap(page)
> 
> When calling PageDoubleMap() this page is not a tail page of THP anymore
> so the BUG is triggered.
> 
> This could be fixed by elevated refcount of the page before calling
> mapcount, but it prevents from counting migration entries, and it seems
> overkilling because the race just could happen when PMD is split so all
> PTE entries of tail pages are actually migration entries, and
> smaps_account() does treat migration entries as mapcount == 1 as Kirill
> pointed out.
> 
> Add a new parameter for smaps_account() to tell this entry is migration
> entry then skip calling page_mapcount().  Don't skip getting mapcount for
> device private entries since they do track references with mapcount.
> 
> Pagemap also has the similar issue although it was not reported.  Fixed
> it as well.
> 
> Fixes: e9b61f19858a ("thp: reintroduce split_huge_page()")
> Reported-by: syzbot+1f52b3a18d5633fa7f82@syzkaller.appspotmail.com
> Acked-by: David Hildenbrand <david@redhat.com>
> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> Cc: Jann Horn <jannh@google.com>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Yang Shi <shy828301@gmail.com>
> ---
> v4: * s/Treated/Treat per David
>     * Collected acked-by tag from David
> v3: * Fixed the fix tag, the one used by v2 was not accurate
>     * Added comment about the risk calling page_mapcount() per David
>     * Fix pagemap
> v2: * Added proper fix tag per Jann Horn
>     * Rebased to the latest linus's tree
> 
>  fs/proc/task_mmu.c | 38 ++++++++++++++++++++++++++++++--------
>  1 file changed, 30 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 18f8c3acbb85..bc2f46033231 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -440,7 +440,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
>  }
>  
>  static void smaps_account(struct mem_size_stats *mss, struct page *page,
> -		bool compound, bool young, bool dirty, bool locked)
> +		bool compound, bool young, bool dirty, bool locked,
> +		bool migration)
>  {
>  	int i, nr = compound ? compound_nr(page) : 1;
>  	unsigned long size = nr * PAGE_SIZE;
> @@ -467,8 +468,15 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
>  	 * page_count(page) == 1 guarantees the page is mapped exactly once.
>  	 * If any subpage of the compound page mapped with PTE it would elevate
>  	 * page_count().
> +	 *
> +	 * The page_mapcount() is called to get a snapshot of the mapcount.
> +	 * Without holding the page lock this snapshot can be slightly wrong as
> +	 * we cannot always read the mapcount atomically.  It is not safe to
> +	 * call page_mapcount() even with PTL held if the page is not mapped,
> +	 * especially for migration entries.  Treat regular migration entries
> +	 * as mapcount == 1.
>  	 */
> -	if (page_count(page) == 1) {
> +	if ((page_count(page) == 1) || migration) {

Since this is now apparently a CVE-2023-1582 for whatever RHeasons...

wonder if the patch actually works as intended when
(page_count() || migration) is in this particular order and not the other one?

>  		smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
>  			locked, true);
>  		return;
> @@ -517,6 +525,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
>  	struct vm_area_struct *vma = walk->vma;
>  	bool locked = !!(vma->vm_flags & VM_LOCKED);
>  	struct page *page = NULL;
> +	bool migration = false;
>  
>  	if (pte_present(*pte)) {
>  		page = vm_normal_page(vma, addr, *pte);
> @@ -536,8 +545,11 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
>  			} else {
>  				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
>  			}
> -		} else if (is_pfn_swap_entry(swpent))
> +		} else if (is_pfn_swap_entry(swpent)) {
> +			if (is_migration_entry(swpent))
> +				migration = true;
>  			page = pfn_swap_entry_to_page(swpent);
> +		}
>  	} else {
>  		smaps_pte_hole_lookup(addr, walk);
>  		return;
> @@ -546,7 +558,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
>  	if (!page)
>  		return;
>  
> -	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
> +	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
> +		      locked, migration);
>  }
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> @@ -557,6 +570,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
>  	struct vm_area_struct *vma = walk->vma;
>  	bool locked = !!(vma->vm_flags & VM_LOCKED);
>  	struct page *page = NULL;
> +	bool migration = false;
>  
>  	if (pmd_present(*pmd)) {
>  		/* FOLL_DUMP will return -EFAULT on huge zero page */
> @@ -564,8 +578,10 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
>  	} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
>  		swp_entry_t entry = pmd_to_swp_entry(*pmd);
>  
> -		if (is_migration_entry(entry))
> +		if (is_migration_entry(entry)) {
> +			migration = true;
>  			page = pfn_swap_entry_to_page(entry);
> +		}
>  	}
>  	if (IS_ERR_OR_NULL(page))
>  		return;
> @@ -577,7 +593,9 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
>  		/* pass */;
>  	else
>  		mss->file_thp += HPAGE_PMD_SIZE;
> -	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
> +
> +	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
> +		      locked, migration);
>  }
>  #else
>  static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
> @@ -1378,6 +1396,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
>  {
>  	u64 frame = 0, flags = 0;
>  	struct page *page = NULL;
> +	bool migration = false;
>  
>  	if (pte_present(pte)) {
>  		if (pm->show_pfn)
> @@ -1399,13 +1418,14 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
>  			frame = swp_type(entry) |
>  				(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
>  		flags |= PM_SWAP;
> +		migration = is_migration_entry(entry);
>  		if (is_pfn_swap_entry(entry))
>  			page = pfn_swap_entry_to_page(entry);
>  	}
>  
>  	if (page && !PageAnon(page))
>  		flags |= PM_FILE;
> -	if (page && page_mapcount(page) == 1)
> +	if (page && !migration && page_mapcount(page) == 1)
>  		flags |= PM_MMAP_EXCLUSIVE;
>  	if (vma->vm_flags & VM_SOFTDIRTY)
>  		flags |= PM_SOFT_DIRTY;
> @@ -1421,6 +1441,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
>  	spinlock_t *ptl;
>  	pte_t *pte, *orig_pte;
>  	int err = 0;
> +	bool migration = false;
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  	ptl = pmd_trans_huge_lock(pmdp, vma);
> @@ -1461,11 +1482,12 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
>  			if (pmd_swp_uffd_wp(pmd))
>  				flags |= PM_UFFD_WP;
>  			VM_BUG_ON(!is_pmd_migration_entry(pmd));
> +			migration = is_migration_entry(entry);
>  			page = pfn_swap_entry_to_page(entry);
>  		}
>  #endif
>  
> -		if (page && page_mapcount(page) == 1)
> +		if (page && !migration && page_mapcount(page) == 1)
>  			flags |= PM_MMAP_EXCLUSIVE;
>  
>  		for (; addr != end; addr += PAGE_SIZE) {
David Hildenbrand March 23, 2023, 10:08 a.m. UTC | #4
On 23.03.23 10:52, Vlastimil Babka wrote:
> On 2/3/22 19:26, Yang Shi wrote:
>> The syzbot reported the below BUG:
>>
>> kernel BUG at include/linux/page-flags.h:785!
>> invalid opcode: 0000 [#1] PREEMPT SMP KASAN
>> CPU: 1 PID: 4392 Comm: syz-executor560 Not tainted 5.16.0-rc6-syzkaller #0
>> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
>> RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
>> RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
>> Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
>> RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
>> RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
>> RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
>> RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
>> R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
>> R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
>> FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
>> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
>> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>> Call Trace:
>>   <TASK>
>>   page_mapcount include/linux/mm.h:837 [inline]
>>   smaps_account+0x470/0xb10 fs/proc/task_mmu.c:466
>>   smaps_pte_entry fs/proc/task_mmu.c:538 [inline]
>>   smaps_pte_range+0x611/0x1250 fs/proc/task_mmu.c:601
>>   walk_pmd_range mm/pagewalk.c:128 [inline]
>>   walk_pud_range mm/pagewalk.c:205 [inline]
>>   walk_p4d_range mm/pagewalk.c:240 [inline]
>>   walk_pgd_range mm/pagewalk.c:277 [inline]
>>   __walk_page_range+0xe23/0x1ea0 mm/pagewalk.c:379
>>   walk_page_vma+0x277/0x350 mm/pagewalk.c:530
>>   smap_gather_stats.part.0+0x148/0x260 fs/proc/task_mmu.c:768
>>   smap_gather_stats fs/proc/task_mmu.c:741 [inline]
>>   show_smap+0xc6/0x440 fs/proc/task_mmu.c:822
>>   seq_read_iter+0xbb0/0x1240 fs/seq_file.c:272
>>   seq_read+0x3e0/0x5b0 fs/seq_file.c:162
>>   vfs_read+0x1b5/0x600 fs/read_write.c:479
>>   ksys_read+0x12d/0x250 fs/read_write.c:619
>>   do_syscall_x64 arch/x86/entry/common.c:50 [inline]
>>   do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
>>   entry_SYSCALL_64_after_hwframe+0x44/0xae
>> RIP: 0033:0x7faa2af6c969
>> Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 11 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
>> RSP: 002b:00007faa2aefd288 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
>> RAX: ffffffffffffffda RBX: 00007faa2aff4418 RCX: 00007faa2af6c969
>> RDX: 0000000000002025 RSI: 0000000020000100 RDI: 0000000000000003
>> RBP: 00007faa2aff4410 R08: 00007faa2aefd700 R09: 0000000000000000
>> R10: 00007faa2aefd700 R11: 0000000000000246 R12: 00007faa2afc20ac
>> R13: 00007fff7e6632bf R14: 00007faa2aefd400 R15: 0000000000022000
>>   </TASK>
>> Modules linked in:
>> ---[ end trace 24ec93ff95e4ac3d ]---
>> RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
>> RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
>> Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
>> RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
>> RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
>> RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
>> RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
>> R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
>> R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
>> FS:  00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
>> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
>> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>
>> The reproducer was trying to reading /proc/$PID/smaps when calling
>> MADV_FREE at the mean time.  MADV_FREE may split THPs if it is called
>> for partial THP.  It may trigger the below race:
>>
>>           CPU A                         CPU B
>>           -----                         -----
>> smaps walk:                      MADV_FREE:
>> page_mapcount()
>>    PageCompound()
>>                                   split_huge_page()
>>    page = compound_head(page)
>>    PageDoubleMap(page)
>>
>> When calling PageDoubleMap() this page is not a tail page of THP anymore
>> so the BUG is triggered.
>>
>> This could be fixed by elevated refcount of the page before calling
>> mapcount, but it prevents from counting migration entries, and it seems
>> overkilling because the race just could happen when PMD is split so all
>> PTE entries of tail pages are actually migration entries, and
>> smaps_account() does treat migration entries as mapcount == 1 as Kirill
>> pointed out.
>>
>> Add a new parameter for smaps_account() to tell this entry is migration
>> entry then skip calling page_mapcount().  Don't skip getting mapcount for
>> device private entries since they do track references with mapcount.
>>
>> Pagemap also has the similar issue although it was not reported.  Fixed
>> it as well.
>>
>> Fixes: e9b61f19858a ("thp: reintroduce split_huge_page()")
>> Reported-by: syzbot+1f52b3a18d5633fa7f82@syzkaller.appspotmail.com
>> Acked-by: David Hildenbrand <david@redhat.com>
>> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
>> Cc: Jann Horn <jannh@google.com>
>> Cc: Matthew Wilcox <willy@infradead.org>
>> Cc: <stable@vger.kernel.org>
>> Signed-off-by: Yang Shi <shy828301@gmail.com>
>> ---
>> v4: * s/Treated/Treat per David
>>      * Collected acked-by tag from David
>> v3: * Fixed the fix tag, the one used by v2 was not accurate
>>      * Added comment about the risk calling page_mapcount() per David
>>      * Fix pagemap
>> v2: * Added proper fix tag per Jann Horn
>>      * Rebased to the latest linus's tree
>>
>>   fs/proc/task_mmu.c | 38 ++++++++++++++++++++++++++++++--------
>>   1 file changed, 30 insertions(+), 8 deletions(-)
>>
>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>> index 18f8c3acbb85..bc2f46033231 100644
>> --- a/fs/proc/task_mmu.c
>> +++ b/fs/proc/task_mmu.c
>> @@ -440,7 +440,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
>>   }
>>   
>>   static void smaps_account(struct mem_size_stats *mss, struct page *page,
>> -		bool compound, bool young, bool dirty, bool locked)
>> +		bool compound, bool young, bool dirty, bool locked,
>> +		bool migration)
>>   {
>>   	int i, nr = compound ? compound_nr(page) : 1;
>>   	unsigned long size = nr * PAGE_SIZE;
>> @@ -467,8 +468,15 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
>>   	 * page_count(page) == 1 guarantees the page is mapped exactly once.
>>   	 * If any subpage of the compound page mapped with PTE it would elevate
>>   	 * page_count().
>> +	 *
>> +	 * The page_mapcount() is called to get a snapshot of the mapcount.
>> +	 * Without holding the page lock this snapshot can be slightly wrong as
>> +	 * we cannot always read the mapcount atomically.  It is not safe to
>> +	 * call page_mapcount() even with PTL held if the page is not mapped,
>> +	 * especially for migration entries.  Treat regular migration entries
>> +	 * as mapcount == 1.
>>   	 */
>> -	if (page_count(page) == 1) {
>> +	if ((page_count(page) == 1) || migration) {
> 
> Since this is now apparently a CVE-2023-1582 for whatever RHeasons...
> 
> wonder if the patch actually works as intended when
> (page_count() || migration) is in this particular order and not the other one?

Only the page_mapcount() call to a page that should be problematic, not 
the page_count() call. There might be the rare chance of the page 
getting remove due to memory offlining... but we're still holding the 
page table lock with the migration entry, so we should be protected 
against that.

Regarding the CVE, IIUC the main reason for the CVE should be 
RHEL-specific -- which behaves differently than other code bases; for 
other code bases, it's just a way to trigger a BUG_ON as described here.
Vlastimil Babka March 23, 2023, 10:11 a.m. UTC | #5
On 3/23/23 11:08, David Hildenbrand wrote:
> On 23.03.23 10:52, Vlastimil Babka wrote:
>> On 2/3/22 19:26, Yang Shi wrote:
>>> --- a/fs/proc/task_mmu.c
>>> +++ b/fs/proc/task_mmu.c
>>> @@ -440,7 +440,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
>>>   }
>>>   
>>>   static void smaps_account(struct mem_size_stats *mss, struct page *page,
>>> -		bool compound, bool young, bool dirty, bool locked)
>>> +		bool compound, bool young, bool dirty, bool locked,
>>> +		bool migration)
>>>   {
>>>   	int i, nr = compound ? compound_nr(page) : 1;
>>>   	unsigned long size = nr * PAGE_SIZE;
>>> @@ -467,8 +468,15 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
>>>   	 * page_count(page) == 1 guarantees the page is mapped exactly once.
>>>   	 * If any subpage of the compound page mapped with PTE it would elevate
>>>   	 * page_count().
>>> +	 *
>>> +	 * The page_mapcount() is called to get a snapshot of the mapcount.
>>> +	 * Without holding the page lock this snapshot can be slightly wrong as
>>> +	 * we cannot always read the mapcount atomically.  It is not safe to
>>> +	 * call page_mapcount() even with PTL held if the page is not mapped,
>>> +	 * especially for migration entries.  Treat regular migration entries
>>> +	 * as mapcount == 1.
>>>   	 */
>>> -	if (page_count(page) == 1) {
>>> +	if ((page_count(page) == 1) || migration) {
>> 
>> Since this is now apparently a CVE-2023-1582 for whatever RHeasons...
>> 
>> wonder if the patch actually works as intended when
>> (page_count() || migration) is in this particular order and not the other one?
> 
> Only the page_mapcount() call to a page that should be problematic, not 
> the page_count() call. There might be the rare chance of the page 

Oh right, page_mapcount() vs page_count(), I need more coffee.

> getting remove due to memory offlining... but we're still holding the 
> page table lock with the migration entry, so we should be protected 
> against that.
> 
> Regarding the CVE, IIUC the main reason for the CVE should be 
> RHEL-specific -- which behaves differently than other code bases; for 
> other code bases, it's just a way to trigger a BUG_ON as described here.

That's good to know so at least my bogus mail was useful for that, thanks!
Yang Shi March 23, 2023, 8:45 p.m. UTC | #6
On Thu, Mar 23, 2023 at 3:11 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 3/23/23 11:08, David Hildenbrand wrote:
> > On 23.03.23 10:52, Vlastimil Babka wrote:
> >> On 2/3/22 19:26, Yang Shi wrote:
> >>> --- a/fs/proc/task_mmu.c
> >>> +++ b/fs/proc/task_mmu.c
> >>> @@ -440,7 +440,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
> >>>   }
> >>>
> >>>   static void smaps_account(struct mem_size_stats *mss, struct page *page,
> >>> -           bool compound, bool young, bool dirty, bool locked)
> >>> +           bool compound, bool young, bool dirty, bool locked,
> >>> +           bool migration)
> >>>   {
> >>>     int i, nr = compound ? compound_nr(page) : 1;
> >>>     unsigned long size = nr * PAGE_SIZE;
> >>> @@ -467,8 +468,15 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
> >>>      * page_count(page) == 1 guarantees the page is mapped exactly once.
> >>>      * If any subpage of the compound page mapped with PTE it would elevate
> >>>      * page_count().
> >>> +    *
> >>> +    * The page_mapcount() is called to get a snapshot of the mapcount.
> >>> +    * Without holding the page lock this snapshot can be slightly wrong as
> >>> +    * we cannot always read the mapcount atomically.  It is not safe to
> >>> +    * call page_mapcount() even with PTL held if the page is not mapped,
> >>> +    * especially for migration entries.  Treat regular migration entries
> >>> +    * as mapcount == 1.
> >>>      */
> >>> -   if (page_count(page) == 1) {
> >>> +   if ((page_count(page) == 1) || migration) {
> >>
> >> Since this is now apparently a CVE-2023-1582 for whatever RHeasons...
> >>
> >> wonder if the patch actually works as intended when
> >> (page_count() || migration) is in this particular order and not the other one?
> >
> > Only the page_mapcount() call to a page that should be problematic, not
> > the page_count() call. There might be the rare chance of the page
>
> Oh right, page_mapcount() vs page_count(), I need more coffee.
>
> > getting remove due to memory offlining... but we're still holding the
> > page table lock with the migration entry, so we should be protected
> > against that.
> >
> > Regarding the CVE, IIUC the main reason for the CVE should be
> > RHEL-specific -- which behaves differently than other code bases; for
> > other code bases, it's just a way to trigger a BUG_ON as described here.

Out of curiosity, is there any public link for this CVE? Google search
can't find it.

>
> That's good to know so at least my bogus mail was useful for that, thanks!
Vlastimil Babka March 24, 2023, 11:25 a.m. UTC | #7
On 3/23/23 21:45, Yang Shi wrote:
> On Thu, Mar 23, 2023 at 3:11 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> 
> Out of curiosity, is there any public link for this CVE? Google search
> can't find it.

Only this one is live so far, AFAIK

https://bugzilla.redhat.com/show_bug.cgi?id=2180936

>>
>> That's good to know so at least my bogus mail was useful for that, thanks!
Yang Shi March 24, 2023, 8:12 p.m. UTC | #8
On Fri, Mar 24, 2023 at 4:25 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 3/23/23 21:45, Yang Shi wrote:
> > On Thu, Mar 23, 2023 at 3:11 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> >
> > Out of curiosity, is there any public link for this CVE? Google search
> > can't find it.
>
> Only this one is live so far, AFAIK
>
> https://bugzilla.redhat.com/show_bug.cgi?id=2180936

Thank you.

>
> >>
> >> That's good to know so at least my bogus mail was useful for that, thanks!
>
David Hildenbrand April 3, 2023, 7:29 a.m. UTC | #9
On 24.03.23 21:12, Yang Shi wrote:
> On Fri, Mar 24, 2023 at 4:25 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>>
>> On 3/23/23 21:45, Yang Shi wrote:
>>> On Thu, Mar 23, 2023 at 3:11 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>>>
>>> Out of curiosity, is there any public link for this CVE? Google search
>>> can't find it.
>>
>> Only this one is live so far, AFAIK
>>
>> https://bugzilla.redhat.com/show_bug.cgi?id=2180936
> 
> Thank you.

There is now

https://access.redhat.com/security/cve/cve-2023-1582
Yang Shi April 4, 2023, 12:50 a.m. UTC | #10
On Mon, Apr 3, 2023 at 12:30 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 24.03.23 21:12, Yang Shi wrote:
> > On Fri, Mar 24, 2023 at 4:25 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> >>
> >> On 3/23/23 21:45, Yang Shi wrote:
> >>> On Thu, Mar 23, 2023 at 3:11 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> >>>
> >>> Out of curiosity, is there any public link for this CVE? Google search
> >>> can't find it.
> >>
> >> Only this one is live so far, AFAIK
> >>
> >> https://bugzilla.redhat.com/show_bug.cgi?id=2180936
> >
> > Thank you.
>
> There is now
>
> https://access.redhat.com/security/cve/cve-2023-1582

Thank you.

>
> --
> Thanks,
>
> David / dhildenb
>
David Rientjes April 13, 2023, 11:58 p.m. UTC | #11
On Mon, 3 Apr 2023, Yang Shi wrote:

> On Mon, Apr 3, 2023 at 12:30 AM David Hildenbrand <david@redhat.com> wrote:
> >
> > On 24.03.23 21:12, Yang Shi wrote:
> > > On Fri, Mar 24, 2023 at 4:25 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> > >>
> > >> On 3/23/23 21:45, Yang Shi wrote:
> > >>> On Thu, Mar 23, 2023 at 3:11 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> > >>>
> > >>> Out of curiosity, is there any public link for this CVE? Google search
> > >>> can't find it.
> > >>
> > >> Only this one is live so far, AFAIK
> > >>
> > >> https://bugzilla.redhat.com/show_bug.cgi?id=2180936
> > >
> > > Thank you.
> >
> > There is now
> >
> > https://access.redhat.com/security/cve/cve-2023-1582
> 
> Thank you.
> 

Hi Yang,

commit 24d7275ce2791829953ed4e72f68277ceb2571c6
Author: Yang Shi <shy828301@gmail.com>
Date:   Fri Feb 11 16:32:26 2022 -0800

    fs/proc: task_mmu.c: don't read mapcount for migration entry

is backported to 5.10 stable but not to 5.4 or earlier stable trees.  The 
commit advertises to fix a commit from 4.5.

Do we need stable backports for earlier trees or are they not affected?

Thanks!
Yang Shi April 18, 2023, 9:17 p.m. UTC | #12
On Thu, Apr 13, 2023 at 4:58 PM David Rientjes <rientjes@google.com> wrote:
>
> On Mon, 3 Apr 2023, Yang Shi wrote:
>
> > On Mon, Apr 3, 2023 at 12:30 AM David Hildenbrand <david@redhat.com> wrote:
> > >
> > > On 24.03.23 21:12, Yang Shi wrote:
> > > > On Fri, Mar 24, 2023 at 4:25 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> > > >>
> > > >> On 3/23/23 21:45, Yang Shi wrote:
> > > >>> On Thu, Mar 23, 2023 at 3:11 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> > > >>>
> > > >>> Out of curiosity, is there any public link for this CVE? Google search
> > > >>> can't find it.
> > > >>
> > > >> Only this one is live so far, AFAIK
> > > >>
> > > >> https://bugzilla.redhat.com/show_bug.cgi?id=2180936
> > > >
> > > > Thank you.
> > >
> > > There is now
> > >
> > > https://access.redhat.com/security/cve/cve-2023-1582
> >
> > Thank you.
> >
>
> Hi Yang,
>
> commit 24d7275ce2791829953ed4e72f68277ceb2571c6
> Author: Yang Shi <shy828301@gmail.com>
> Date:   Fri Feb 11 16:32:26 2022 -0800
>
>     fs/proc: task_mmu.c: don't read mapcount for migration entry
>
> is backported to 5.10 stable but not to 5.4 or earlier stable trees.  The
> commit advertises to fix a commit from 4.5.
>
> Do we need stable backports for earlier trees or are they not affected?

IIRC, it is needed. But I can't remember why it wasn't. Maybe just
because the patch failed to apply to those trees and needs to backport
the patch manually.

>
> Thanks!
diff mbox series

Patch

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 18f8c3acbb85..bc2f46033231 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -440,7 +440,8 @@  static void smaps_page_accumulate(struct mem_size_stats *mss,
 }
 
 static void smaps_account(struct mem_size_stats *mss, struct page *page,
-		bool compound, bool young, bool dirty, bool locked)
+		bool compound, bool young, bool dirty, bool locked,
+		bool migration)
 {
 	int i, nr = compound ? compound_nr(page) : 1;
 	unsigned long size = nr * PAGE_SIZE;
@@ -467,8 +468,15 @@  static void smaps_account(struct mem_size_stats *mss, struct page *page,
 	 * page_count(page) == 1 guarantees the page is mapped exactly once.
 	 * If any subpage of the compound page mapped with PTE it would elevate
 	 * page_count().
+	 *
+	 * The page_mapcount() is called to get a snapshot of the mapcount.
+	 * Without holding the page lock this snapshot can be slightly wrong as
+	 * we cannot always read the mapcount atomically.  It is not safe to
+	 * call page_mapcount() even with PTL held if the page is not mapped,
+	 * especially for migration entries.  Treat regular migration entries
+	 * as mapcount == 1.
 	 */
-	if (page_count(page) == 1) {
+	if ((page_count(page) == 1) || migration) {
 		smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
 			locked, true);
 		return;
@@ -517,6 +525,7 @@  static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 	struct vm_area_struct *vma = walk->vma;
 	bool locked = !!(vma->vm_flags & VM_LOCKED);
 	struct page *page = NULL;
+	bool migration = false;
 
 	if (pte_present(*pte)) {
 		page = vm_normal_page(vma, addr, *pte);
@@ -536,8 +545,11 @@  static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 			} else {
 				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
 			}
-		} else if (is_pfn_swap_entry(swpent))
+		} else if (is_pfn_swap_entry(swpent)) {
+			if (is_migration_entry(swpent))
+				migration = true;
 			page = pfn_swap_entry_to_page(swpent);
+		}
 	} else {
 		smaps_pte_hole_lookup(addr, walk);
 		return;
@@ -546,7 +558,8 @@  static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 	if (!page)
 		return;
 
-	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
+	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
+		      locked, migration);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -557,6 +570,7 @@  static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 	struct vm_area_struct *vma = walk->vma;
 	bool locked = !!(vma->vm_flags & VM_LOCKED);
 	struct page *page = NULL;
+	bool migration = false;
 
 	if (pmd_present(*pmd)) {
 		/* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -564,8 +578,10 @@  static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 	} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
 		swp_entry_t entry = pmd_to_swp_entry(*pmd);
 
-		if (is_migration_entry(entry))
+		if (is_migration_entry(entry)) {
+			migration = true;
 			page = pfn_swap_entry_to_page(entry);
+		}
 	}
 	if (IS_ERR_OR_NULL(page))
 		return;
@@ -577,7 +593,9 @@  static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 		/* pass */;
 	else
 		mss->file_thp += HPAGE_PMD_SIZE;
-	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
+
+	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
+		      locked, migration);
 }
 #else
 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -1378,6 +1396,7 @@  static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 {
 	u64 frame = 0, flags = 0;
 	struct page *page = NULL;
+	bool migration = false;
 
 	if (pte_present(pte)) {
 		if (pm->show_pfn)
@@ -1399,13 +1418,14 @@  static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 			frame = swp_type(entry) |
 				(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
 		flags |= PM_SWAP;
+		migration = is_migration_entry(entry);
 		if (is_pfn_swap_entry(entry))
 			page = pfn_swap_entry_to_page(entry);
 	}
 
 	if (page && !PageAnon(page))
 		flags |= PM_FILE;
-	if (page && page_mapcount(page) == 1)
+	if (page && !migration && page_mapcount(page) == 1)
 		flags |= PM_MMAP_EXCLUSIVE;
 	if (vma->vm_flags & VM_SOFTDIRTY)
 		flags |= PM_SOFT_DIRTY;
@@ -1421,6 +1441,7 @@  static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 	spinlock_t *ptl;
 	pte_t *pte, *orig_pte;
 	int err = 0;
+	bool migration = false;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	ptl = pmd_trans_huge_lock(pmdp, vma);
@@ -1461,11 +1482,12 @@  static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 			if (pmd_swp_uffd_wp(pmd))
 				flags |= PM_UFFD_WP;
 			VM_BUG_ON(!is_pmd_migration_entry(pmd));
+			migration = is_migration_entry(entry);
 			page = pfn_swap_entry_to_page(entry);
 		}
 #endif
 
-		if (page && page_mapcount(page) == 1)
+		if (page && !migration && page_mapcount(page) == 1)
 			flags |= PM_MMAP_EXCLUSIVE;
 
 		for (; addr != end; addr += PAGE_SIZE) {