mm: fix possible OOB in numa_rebuild_large_mapping()

Message ID 20240607103241.1298388-1-wangkefeng.wang@huawei.com (mailing list archive)
State New
Series mm: fix possible OOB in numa_rebuild_large_mapping()

Commit Message

Kefeng Wang June 7, 2024, 10:32 a.m. UTC
A large folio is mapped at a folio-size-aligned virtual address during
the page fault, e.g. 'addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE)'
in do_anonymous_page(). But after mremap(), the virtual address is only
required to be PAGE_SIZE aligned, and the ptes are moved to the new
location in move_page_tables(), so traversing the ptes in
numa_rebuild_large_mapping() will hit the following issue:

   Unable to handle kernel paging request at virtual address 00000a80c021a788
   Mem abort info:
     ESR = 0x0000000096000004
     EC = 0x25: DABT (current EL), IL = 32 bits
     SET = 0, FnV = 0
     EA = 0, S1PTW = 0
     FSC = 0x04: level 0 translation fault
   Data abort info:
     ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000
     CM = 0, WnR = 0, TnD = 0, TagAccess = 0
     GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
   user pgtable: 4k pages, 48-bit VAs, pgdp=00002040341a6000
   [00000a80c021a788] pgd=0000000000000000, p4d=0000000000000000
   Internal error: Oops: 0000000096000004 [#1] SMP
   ...
   CPU: 76 PID: 15187 Comm: git Kdump: loaded Tainted: G        W          6.10.0-rc2+ #209
   Hardware name: Huawei TaiShan 2280 V2/BC82AMDD, BIOS 1.79 08/21/2021
   pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
   pc : numa_rebuild_large_mapping+0x338/0x638
   lr : numa_rebuild_large_mapping+0x320/0x638
   sp : ffff8000b41c3b00
   x29: ffff8000b41c3b30 x28: ffff8000812a0000 x27: 00000000000a8000
   x26: 00000000000000a8 x25: 0010000000000001 x24: ffff20401c7170f0
   x23: 0000ffff33a1e000 x22: 0000ffff33a76000 x21: ffff20400869eca0
   x20: 0000ffff33976000 x19: 00000000000000a8 x18: ffffffffffffffff
   x17: 0000000000000000 x16: 0000000000000020 x15: ffff8000b41c36a8
   x14: 0000000000000000 x13: 205d373831353154 x12: 5b5d333331363732
   x11: 000000000011ff78 x10: 000000000011ff10 x9 : ffff800080273f30
   x8 : 000000320400869e x7 : c0000000ffffd87f x6 : 00000000001e6ba8
   x5 : ffff206f3fb5af88 x4 : 0000000000000000 x3 : 0000000000000000
   x2 : 0000000000000000 x1 : fffffdffc0000000 x0 : 00000a80c021a780
   Call trace:
    numa_rebuild_large_mapping+0x338/0x638
    do_numa_page+0x3e4/0x4e0
    handle_pte_fault+0x1bc/0x238
    __handle_mm_fault+0x20c/0x400
    handle_mm_fault+0xa8/0x288
    do_page_fault+0x124/0x498
    do_translation_fault+0x54/0x80
    do_mem_abort+0x4c/0xa8
    el0_da+0x40/0x110
    el0t_64_sync_handler+0xe4/0x158
    el0t_64_sync+0x188/0x190

Fix it by correcting the start and end. This may result in only part of
the large mapping being rebuilt in one NUMA page fault, which is fine,
since the remaining part can be rebuilt by another page fault.

Fixes: d2136d749d76 ("mm: support multi-size THP numa balancing")
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 mm/memory.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)
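
To make the failure mode concrete, here is an illustrative userspace
sketch (not part of the patch; every address below is made up, assuming
4K pages so that one page-table page covers 2MiB). It models the old
bounds arithmetic for a 16-page folio that mremap() has left merely
PAGE_SIZE aligned, straddling a 2MiB boundary:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PMD_SIZE  (512 * PAGE_SIZE)	/* one page-table page covers 2MiB */

int main(void)
{
	unsigned long folio_start = 0x1ff000;	/* hypothetical, unaligned VA */
	unsigned long fault_addr  = 0x205000;	/* NUMA hint fault on folio page 6 */

	/* mirrors: nr = pte_pfn(fault_pte) - folio_pfn(folio) */
	unsigned long nr = (fault_addr - folio_start) / PAGE_SIZE;
	/* mirrors the old start computation, which has no page-table clamp */
	unsigned long start = fault_addr - nr * PAGE_SIZE;

	/* index of vmf->pte within its page-table page, and the index the
	 * old code would derive for start_ptep */
	long pte_index   = (long)((fault_addr % PMD_SIZE) / PAGE_SIZE);
	long start_index = pte_index - (long)((fault_addr - start) / PAGE_SIZE);

	printf("nr=%lu start=%#lx pte_index=%ld start_index=%ld\n",
	       nr, start, pte_index, start_index);
	return 0;
}

This prints start_index=-1: start_ptep would point one entry before the
PTE array that vmf->pte lives in, and the restore loop then reads PTEs
outside the faulting page-table page, which is how an oops like the one
above comes about.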

Comments

David Hildenbrand June 7, 2024, 10:37 a.m. UTC | #1
On 07.06.24 12:32, Kefeng Wang wrote:
> A large folio is mapped at a folio-size-aligned virtual address during
> the page fault, e.g. 'addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE)'
> in do_anonymous_page(). But after mremap(), the virtual address is only
> required to be PAGE_SIZE aligned, and the ptes are moved to the new
> location in move_page_tables(), so traversing the ptes in
> numa_rebuild_large_mapping() will hit the following issue:
> 
> [...]

Do you have an easy reproducer that we can use to reproduce+verify this 
issue? The description above indicates to me that this should not be too 
complicated to write :)
Dan Carpenter June 9, 2024, 4:03 p.m. UTC | #2
Hi Kefeng,

kernel test robot noticed the following build warnings:

url:    https://github.com/intel-lab-lkp/linux/commits/Kefeng-Wang/mm-fix-possible-OOB-in-numa_rebuild_large_mapping/20240607-183609
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20240607103241.1298388-1-wangkefeng.wang%40huawei.com
patch subject: [PATCH] mm: fix possible OOB in numa_rebuild_large_mapping()
config: mips-randconfig-r081-20240609 (https://download.01.org/0day-ci/archive/20240609/202406092325.eDrcikT8-lkp@intel.com/config)
compiler: mips-linux-gcc (GCC) 13.2.0

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
| Closes: https://lore.kernel.org/r/202406092325.eDrcikT8-lkp@intel.com/

smatch warnings:
mm/memory.c:5370 do_numa_page() error: uninitialized symbol 'nr_pages'.

vim +/nr_pages +5370 mm/memory.c

2b7403035459c7 Souptick Joarder  2018-08-23  5265  static vm_fault_t do_numa_page(struct vm_fault *vmf)
d10e63f29488b0 Mel Gorman        2012-10-25  5266  {
82b0f8c39a3869 Jan Kara          2016-12-14  5267  	struct vm_area_struct *vma = vmf->vma;
6695cf68b15c21 Kefeng Wang       2023-09-21  5268  	struct folio *folio = NULL;
6695cf68b15c21 Kefeng Wang       2023-09-21  5269  	int nid = NUMA_NO_NODE;
d2136d749d76af Baolin Wang       2024-03-29  5270  	bool writable = false, ignore_writable = false;
d2136d749d76af Baolin Wang       2024-03-29  5271  	bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
90572890d20252 Peter Zijlstra    2013-10-07  5272  	int last_cpupid;
cbee9f88ec1b8d Peter Zijlstra    2012-10-25  5273  	int target_nid;
04a8645304500b Aneesh Kumar K.V  2019-03-05  5274  	pte_t pte, old_pte;
d2136d749d76af Baolin Wang       2024-03-29  5275  	int flags = 0, nr_pages;
d10e63f29488b0 Mel Gorman        2012-10-25  5276  
d10e63f29488b0 Mel Gorman        2012-10-25  5277  	/*
6c1b748ebf27be John Hubbard      2024-02-27  5278  	 * The pte cannot be used safely until we verify, while holding the page
6c1b748ebf27be John Hubbard      2024-02-27  5279  	 * table lock, that its contents have not changed during fault handling.
d10e63f29488b0 Mel Gorman        2012-10-25  5280  	 */
82b0f8c39a3869 Jan Kara          2016-12-14  5281  	spin_lock(vmf->ptl);
6c1b748ebf27be John Hubbard      2024-02-27  5282  	/* Read the live PTE from the page tables: */
6c1b748ebf27be John Hubbard      2024-02-27  5283  	old_pte = ptep_get(vmf->pte);
6c1b748ebf27be John Hubbard      2024-02-27  5284  
6c1b748ebf27be John Hubbard      2024-02-27  5285  	if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
82b0f8c39a3869 Jan Kara          2016-12-14  5286  		pte_unmap_unlock(vmf->pte, vmf->ptl);
4daae3b4b9e49b Mel Gorman        2012-11-02  5287  		goto out;
4daae3b4b9e49b Mel Gorman        2012-11-02  5288  	}
4daae3b4b9e49b Mel Gorman        2012-11-02  5289  
04a8645304500b Aneesh Kumar K.V  2019-03-05  5290  	pte = pte_modify(old_pte, vma->vm_page_prot);
d10e63f29488b0 Mel Gorman        2012-10-25  5291  
6a56ccbcf6c695 David Hildenbrand 2022-11-08  5292  	/*
6a56ccbcf6c695 David Hildenbrand 2022-11-08  5293  	 * Detect now whether the PTE could be writable; this information
6a56ccbcf6c695 David Hildenbrand 2022-11-08  5294  	 * is only valid while holding the PT lock.
6a56ccbcf6c695 David Hildenbrand 2022-11-08  5295  	 */
6a56ccbcf6c695 David Hildenbrand 2022-11-08  5296  	writable = pte_write(pte);
d2136d749d76af Baolin Wang       2024-03-29  5297  	if (!writable && pte_write_upgrade &&
6a56ccbcf6c695 David Hildenbrand 2022-11-08  5298  	    can_change_pte_writable(vma, vmf->address, pte))
6a56ccbcf6c695 David Hildenbrand 2022-11-08  5299  		writable = true;
6a56ccbcf6c695 David Hildenbrand 2022-11-08  5300  
6695cf68b15c21 Kefeng Wang       2023-09-21  5301  	folio = vm_normal_folio(vma, vmf->address, pte);
6695cf68b15c21 Kefeng Wang       2023-09-21  5302  	if (!folio || folio_is_zone_device(folio))
b99a342d4f11a5 Huang Ying        2021-04-29  5303  		goto out_map;

nr_pages not initialized

d10e63f29488b0 Mel Gorman        2012-10-25  5304  
6688cc05473b36 Peter Zijlstra    2013-10-07  5305  	/*
bea66fbd11af1c Mel Gorman        2015-03-25  5306  	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
bea66fbd11af1c Mel Gorman        2015-03-25  5307  	 * much anyway since they can be in shared cache state. This misses
bea66fbd11af1c Mel Gorman        2015-03-25  5308  	 * the case where a mapping is writable but the process never writes
bea66fbd11af1c Mel Gorman        2015-03-25  5309  	 * to it but pte_write gets cleared during protection updates and
bea66fbd11af1c Mel Gorman        2015-03-25  5310  	 * pte_dirty has unpredictable behaviour between PTE scan updates,
bea66fbd11af1c Mel Gorman        2015-03-25  5311  	 * background writeback, dirty balancing and application behaviour.
bea66fbd11af1c Mel Gorman        2015-03-25  5312  	 */
6a56ccbcf6c695 David Hildenbrand 2022-11-08  5313  	if (!writable)
6688cc05473b36 Peter Zijlstra    2013-10-07  5314  		flags |= TNF_NO_GROUP;
6688cc05473b36 Peter Zijlstra    2013-10-07  5315  
dabe1d992414a6 Rik van Riel      2013-10-07  5316  	/*
6695cf68b15c21 Kefeng Wang       2023-09-21  5317  	 * Flag if the folio is shared between multiple address spaces. This
dabe1d992414a6 Rik van Riel      2013-10-07  5318  	 * is later used when determining whether to group tasks together
dabe1d992414a6 Rik van Riel      2013-10-07  5319  	 */
ebb34f78d72c23 David Hildenbrand 2024-02-27  5320  	if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
dabe1d992414a6 Rik van Riel      2013-10-07  5321  		flags |= TNF_SHARED;
dabe1d992414a6 Rik van Riel      2013-10-07  5322  
6695cf68b15c21 Kefeng Wang       2023-09-21  5323  	nid = folio_nid(folio);
d2136d749d76af Baolin Wang       2024-03-29  5324  	nr_pages = folio_nr_pages(folio);
33024536bafd91 Huang Ying        2022-07-13  5325  	/*
33024536bafd91 Huang Ying        2022-07-13  5326  	 * For memory tiering mode, cpupid of slow memory page is used
33024536bafd91 Huang Ying        2022-07-13  5327  	 * to record page access time.  So use default value.
33024536bafd91 Huang Ying        2022-07-13  5328  	 */
33024536bafd91 Huang Ying        2022-07-13  5329  	if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
6695cf68b15c21 Kefeng Wang       2023-09-21  5330  	    !node_is_toptier(nid))
33024536bafd91 Huang Ying        2022-07-13  5331  		last_cpupid = (-1 & LAST_CPUPID_MASK);
33024536bafd91 Huang Ying        2022-07-13  5332  	else
67b33e3ff58374 Kefeng Wang       2023-10-18  5333  		last_cpupid = folio_last_cpupid(folio);
f8fd525ba3a298 Donet Tom         2024-03-08  5334  	target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags);
98fa15f34cb379 Anshuman Khandual 2019-03-05  5335  	if (target_nid == NUMA_NO_NODE) {
6695cf68b15c21 Kefeng Wang       2023-09-21  5336  		folio_put(folio);
b99a342d4f11a5 Huang Ying        2021-04-29  5337  		goto out_map;
4daae3b4b9e49b Mel Gorman        2012-11-02  5338  	}
b99a342d4f11a5 Huang Ying        2021-04-29  5339  	pte_unmap_unlock(vmf->pte, vmf->ptl);
6a56ccbcf6c695 David Hildenbrand 2022-11-08  5340  	writable = false;
d2136d749d76af Baolin Wang       2024-03-29  5341  	ignore_writable = true;
4daae3b4b9e49b Mel Gorman        2012-11-02  5342  
4daae3b4b9e49b Mel Gorman        2012-11-02  5343  	/* Migrate to the requested node */
6695cf68b15c21 Kefeng Wang       2023-09-21  5344  	if (migrate_misplaced_folio(folio, vma, target_nid)) {
6695cf68b15c21 Kefeng Wang       2023-09-21  5345  		nid = target_nid;
6688cc05473b36 Peter Zijlstra    2013-10-07  5346  		flags |= TNF_MIGRATED;
b99a342d4f11a5 Huang Ying        2021-04-29  5347  	} else {
074c238177a75f Mel Gorman        2015-03-25  5348  		flags |= TNF_MIGRATE_FAIL;
c7ad08804fae5b Hugh Dickins      2023-06-08  5349  		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
c7ad08804fae5b Hugh Dickins      2023-06-08  5350  					       vmf->address, &vmf->ptl);
c7ad08804fae5b Hugh Dickins      2023-06-08  5351  		if (unlikely(!vmf->pte))
c7ad08804fae5b Hugh Dickins      2023-06-08  5352  			goto out;
c33c794828f212 Ryan Roberts      2023-06-12  5353  		if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
b99a342d4f11a5 Huang Ying        2021-04-29  5354  			pte_unmap_unlock(vmf->pte, vmf->ptl);
b99a342d4f11a5 Huang Ying        2021-04-29  5355  			goto out;
b99a342d4f11a5 Huang Ying        2021-04-29  5356  		}
b99a342d4f11a5 Huang Ying        2021-04-29  5357  		goto out_map;
b99a342d4f11a5 Huang Ying        2021-04-29  5358  	}
4daae3b4b9e49b Mel Gorman        2012-11-02  5359  
4daae3b4b9e49b Mel Gorman        2012-11-02  5360  out:
6695cf68b15c21 Kefeng Wang       2023-09-21  5361  	if (nid != NUMA_NO_NODE)
d2136d749d76af Baolin Wang       2024-03-29  5362  		task_numa_fault(last_cpupid, nid, nr_pages, flags);
d10e63f29488b0 Mel Gorman        2012-10-25  5363  	return 0;
b99a342d4f11a5 Huang Ying        2021-04-29  5364  out_map:
b99a342d4f11a5 Huang Ying        2021-04-29  5365  	/*
b99a342d4f11a5 Huang Ying        2021-04-29  5366  	 * Make it present again, depending on how arch implements
b99a342d4f11a5 Huang Ying        2021-04-29  5367  	 * non-accessible ptes, some can allow access by kernel mode.
b99a342d4f11a5 Huang Ying        2021-04-29  5368  	 */
d2136d749d76af Baolin Wang       2024-03-29  5369  	if (folio && folio_test_large(folio))

Are folio_test_large() and folio_is_zone_device() mutually exclusive?
If so then this is a false positive.  Just ignore the warning in that
case.

8d27aa5be8ed93 Kefeng Wang       2024-06-07 @5370  		numa_rebuild_large_mapping(vmf, vma, folio, nr_pages, pte,
8d27aa5be8ed93 Kefeng Wang       2024-06-07  5371  					   ignore_writable, pte_write_upgrade);
d2136d749d76af Baolin Wang       2024-03-29  5372  	else
d2136d749d76af Baolin Wang       2024-03-29  5373  		numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
d2136d749d76af Baolin Wang       2024-03-29  5374  					    writable);
b99a342d4f11a5 Huang Ying        2021-04-29  5375  	pte_unmap_unlock(vmf->pte, vmf->ptl);
b99a342d4f11a5 Huang Ying        2021-04-29  5376  	goto out;
d10e63f29488b0 Mel Gorman        2012-10-25  5377  }
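
To see the shape of what smatch is flagging here, a minimal standalone
illustration (plain C, not kernel code; names are invented for the demo):
the early goto skips the assignment, and the code at the label then reads
the variable uninitialized, exactly like nr_pages above when
vm_normal_folio() returns NULL or a zone-device folio.

#include <stdio.h>

static int do_thing(int have_folio)
{
	int nr_pages;			/* not initialized up front */

	if (!have_folio)
		goto out_map;		/* skips the assignment below */

	nr_pages = 16;			/* mirrors nr_pages = folio_nr_pages(folio) */

out_map:
	return nr_pages;		/* read is undefined when have_folio == 0 */
}

int main(void)
{
	printf("%d\n", do_thing(0));	/* the path smatch complains about */
	return 0;
}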
Andrew Morton June 9, 2024, 8:53 p.m. UTC | #3
On Sun, 9 Jun 2024 19:03:50 +0300 Dan Carpenter <dan.carpenter@linaro.org> wrote:

> Hi Kefeng,
> 
> kernel test robot noticed the following build warnings:
> 
> smatch warnings:
> mm/memory.c:5370 do_numa_page() error: uninitialized symbol 'nr_pages'.
> 

Yes, thanks, I dropped this version of the patch.
Baolin Wang June 11, 2024, 7:48 a.m. UTC | #4
Hi Kefeng,

On 2024/6/7 18:32, Kefeng Wang wrote:
> A large folio is mapped at a folio-size-aligned virtual address during
> the page fault, e.g. 'addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE)'
> in do_anonymous_page(). But after mremap(), the virtual address is only
> required to be PAGE_SIZE aligned, and the ptes are moved to the new
> location in move_page_tables(), so traversing the ptes in
> numa_rebuild_large_mapping() will hit the following issue:
> 
> [...]
> 
> Fix it by correcting the start and end. This may result in only part of
> the large mapping being rebuilt in one NUMA page fault, which is fine,
> since the remaining part can be rebuilt by another page fault.
> 
> Fixes: d2136d749d76 ("mm: support multi-size THP numa balancing")
> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>

Thanks for fixing the issue.

But could you help make the issue clearer? e.g., how to reproduce it,
as David suggested? Do you mean 'vmf->address - nr * PAGE_SIZE' can
overflow?

> [...]
David Hildenbrand June 11, 2024, 10:34 a.m. UTC | #5
On 07.06.24 12:32, Kefeng Wang wrote:
> [...]

Could the BUG also explain:

https://lore.kernel.org/all/CA+G9fYvKmr84WzTArmfaypKM9+=Aw0uXCtuUKHQKFCNMGJyOgQ@mail.gmail.com
David Hildenbrand June 11, 2024, 12:32 p.m. UTC | #6
On 07.06.24 12:32, Kefeng Wang wrote:
> [...]
> -static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
> -				       struct folio *folio, pte_t fault_pte,
> -				       bool ignore_writable, bool pte_write_upgrade)
> +static void numa_rebuild_large_mapping(struct vm_fault *vmf,
> +		struct vm_area_struct *vma, struct folio *folio, int nr_pages,
> +		pte_t fault_pte, bool ignore_writable, bool pte_write_upgrade)
>   {
>   	int nr = pte_pfn(fault_pte) - folio_pfn(folio);
> -	unsigned long start = max(vmf->address - nr * PAGE_SIZE, vma->vm_start);
> -	unsigned long end = min(vmf->address + (folio_nr_pages(folio) - nr) * PAGE_SIZE, vma->vm_end);
> -	pte_t *start_ptep = vmf->pte - (vmf->address - start) / PAGE_SIZE;
> -	unsigned long addr;
> +	unsigned long folio_size = nr_pages * PAGE_SIZE;

Just re-read folio_nr_pages() here, it's cheap. Or even better, use 
folio_size();

> +	unsigned long addr = vmf->address;
> +	unsigned long start, end, align_addr;
> +	pte_t *start_ptep;
> +
> +	align_addr = ALIGN_DOWN(addr, folio_size);
> +	start = max3(addr - nr * PAGE_SIZE, align_addr, vma->vm_start);
> +	end = min3(addr + (nr_pages - nr) * PAGE_SIZE, align_addr + folio_size,

Please avoid mixing nr_pages and folio_size.

> +		   vma->vm_end);
> +	start_ptep = vmf->pte - (addr - start) / PAGE_SIZE;

I am not able to convince myself that the old code could not have 
resulted in a vmf->pte that underflows the page table. Am I correct?

Now align_addr would make sure that we are always within one page table 
(as long as our folio size does not exceed PMD size :) ).

Can we use PMD_SIZE instead for that, something like the following?


/* Stay within the VMA and within the page table. */
pt_start = ALIGN_DOWN(addr, PMD_SIZE);
start = max3(addr - (nr << PAGE_SHIFT), pt_start, vma->vm_start);
end = min3(addr + folio_size - (nr << PAGE_SHIFT),
            pt_start + PMD_SIZE, vma->vm_end);

start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT);
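
As a quick check of that clamping, here is a userspace model with the same
made-up layout as the sketch under the commit message (16-page folio
straddling the 2MiB boundary, fault on folio page 6); max3/min3/ALIGN_DOWN
are re-implemented only for the demo:

#include <stdio.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12
#define PMD_SIZE   (512 * PAGE_SIZE)
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

static unsigned long max3(unsigned long a, unsigned long b, unsigned long c)
{
	unsigned long m = a > b ? a : b;
	return m > c ? m : c;
}

static unsigned long min3(unsigned long a, unsigned long b, unsigned long c)
{
	unsigned long m = a < b ? a : b;
	return m < c ? m : c;
}

int main(void)
{
	unsigned long addr = 0x205000, nr = 6;		/* fault on folio page 6 */
	unsigned long folio_size = 16 * PAGE_SIZE;
	unsigned long vm_start = 0x100000, vm_end = 0x400000;

	unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE);
	unsigned long start = max3(addr - (nr << PAGE_SHIFT), pt_start, vm_start);
	unsigned long end = min3(addr - (nr << PAGE_SHIFT) + folio_size,
				 pt_start + PMD_SIZE, vm_end);

	/* index start_ptep would have within the faulting page-table page */
	long idx = (long)((addr % PMD_SIZE) >> PAGE_SHIFT)
		 - (long)((addr - start) >> PAGE_SHIFT);

	printf("start=%#lx end=%#lx start_ptep index=%ld\n", start, end, idx);
	return 0;
}

With these values start clamps to pt_start (0x200000), so the derived
start_ptep index is 0 instead of -1: the walk stays within one PTE array,
and the folio pages that fall in the other page table are simply skipped,
to be handled by a later fault.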
Baolin Wang June 12, 2024, 1:16 a.m. UTC | #7
On 2024/6/11 20:32, David Hildenbrand wrote:
> On 07.06.24 12:32, Kefeng Wang wrote:
> [...]
> 
> I am not able to convince myself that the old code could not have 
> resulted in a vmf->pte that underflows the page table. Am I correct?

Yes, I think you are right, and I realized the problem now.

> 
> Now align_addr would make sure that we are always within one page table 
> (as long as our folio size does not exceed PMD size :) ).
> 
> Can we use PMD_SIZE instead for that, something like the following?
> 
> 
> /* Stay within the VMA and within the page table. */
> pt_start = ALIGN_DOWN(addr, PMD_SIZE);
> start = max3(addr - (nr << PAGE_SHIFT), pt_start, vma->vm_start);
> end = min3(addr + folio_size - (nr << PAGE_SHIFT),
>             pt_start + PMD_SIZE, vma->vm_end);
> 
> start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT);

The changes look good to me. Thanks.
Kefeng Wang June 12, 2024, 2:41 a.m. UTC | #8
Hi David and Baolin,

On 2024/6/7 18:37, David Hildenbrand wrote:
> On 07.06.24 12:32, Kefeng Wang wrote:
>> [...]
> 
> Do you have an easy reproducer that we can use to reproduce+verify this 
> issue? The description above indicates to me that this should not be too 
> complicated to write :)
> 

Sorry for the late reply due to a traditional Chinese festival. The issue
is easily reproduced when mTHP is enabled but "align larger anonymous
mappings on THP boundaries" is dropped on arm64. Here are the steps:

   drop the THP alignment of anon mappings (optional, but without it the
issue is not very easy to reproduce)
   cd /sys/kernel/mm/transparent_hugepage/
   echo never  > hugepages-2048kB/enabled
   echo always > hugepages-1024kB/enabled  (other sizes can reproduce the
issue too)
   git clone root@127.0.0.1:/home/git/linux  (clone the local kernel repo)

With that, numa_rebuild_large_mapping() hits several different errors, but
most are bad accesses at

   if (pfn_folio(pte_pfn(ptent)) != folio)

The page is invalid, so the guess is that ptent/start_ptep goes wrong
during the traversal.
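
If it helps others reproduce this, here is a hypothetical, untested
userspace sketch of the same steps; mremap() to a deliberately misaligned
destination stands in for dropping the THP alignment, and it assumes the
1024kB mTHP setup above plus NUMA balancing enabled (all sizes are
illustrative):

#define _GNU_SOURCE
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>

#define MB (1024UL * 1024)

int main(void)
{
	size_t len = 64 * MB;

	/* reserve a window so we can pick a destination that is page
	 * aligned but deliberately not 1024kB (folio size) aligned */
	char *win = mmap(NULL, len + 2 * MB, PROT_NONE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *src = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (win == MAP_FAILED || src == MAP_FAILED)
		return 1;

	memset(src, 1, len);	/* fault in (hopefully) 1024kB folios */

	/* 1024kB-aligned base plus one page: misaligned on purpose */
	uintptr_t base = ((uintptr_t)win + MB - 1) & ~(MB - 1);
	char *dst = (char *)(base + 4096);
	if (mremap(src, len, len, MREMAP_MAYMOVE | MREMAP_FIXED,
		   dst) == MAP_FAILED)
		return 1;

	/* keep the memory hot so the NUMA balancing scanner turns the
	 * moved folios into hint faults sooner or later */
	for (;;)
		for (size_t i = 0; i < len; i += 4096)
			dst[i]++;
}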
Kefeng Wang June 12, 2024, 2:50 a.m. UTC | #9
On 2024/6/10 0:03, Dan Carpenter wrote:
> [...]
> b99a342d4f11a5 Huang Ying        2021-04-29  5364  out_map:
> b99a342d4f11a5 Huang Ying        2021-04-29  5365  	/*
> b99a342d4f11a5 Huang Ying        2021-04-29  5366  	 * Make it present again, depending on how arch implements
> b99a342d4f11a5 Huang Ying        2021-04-29  5367  	 * non-accessible ptes, some can allow access by kernel mode.
> b99a342d4f11a5 Huang Ying        2021-04-29  5368  	 */
> d2136d749d76af Baolin Wang       2024-03-29  5369  	if (folio && folio_test_large(folio))
> 
> Are folio_test_large() and folio_is_zone_device() mutually exclusive?
> If so then this is a false positive.  Just ignore the warning in that
> case.
> 

The folio in ZONE_DEVICE is not a large folio, so there is no issue for
now, but I will fix it.



Kefeng Wang June 12, 2024, 6:02 a.m. UTC | #10
On 2024/6/11 20:32, David Hildenbrand wrote:
> On 07.06.24 12:32, Kefeng Wang wrote:
>> [...]
>> +    unsigned long folio_size = nr_pages * PAGE_SIZE;
> 
> Just re-read folio_nr_pages() here, it's cheap. Or even better, use 
> folio_size();

OK, will directly use folio_size().

> 
>> +    unsigned long addr = vmf->address;
>> +    unsigned long start, end, align_addr;
>> +    pte_t *start_ptep;
>> +
>> +    align_addr = ALIGN_DOWN(addr, folio_size);
>> +    start = max3(addr - nr * PAGE_SIZE, align_addr, vma->vm_start);
>> +    end = min3(addr + (nr_pages - nr) * PAGE_SIZE, align_addr + 
>> folio_size,
> 
> Please avoid mixing nr_pages and folio_size.
> 
>> +           vma->vm_end);
>> +    start_ptep = vmf->pte - (addr - start) / PAGE_SIZE;
> 
> I am not able to convince myself that the old code could not have 
> resulted in a vmf->pte that underflows the page table. Am I correct?
> 

Yes, it should not happen.

> Now align_addr would make sure that we are always within one page table 
> (as long as our folio size does not exceed PMD size :) ).
> 
> Can we use PMD_SIZE instead for that, something like the following?
> 
> 
> /* Stay within the VMA and within the page table. */
> pt_start = ALIGN_DOWN(addr, PMD_SIZE);
> start = max3(addr - (nr << PAGE_SHIFT), pt_start, vma->vm_start);
> end = min3(addr + folio_size - (nr << PAGE_SHIFT),
>             pt_start + PMD_SIZE, vma->vm_end);
> 
> start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT);
> 

It looks better; I will retest and send a new one, thanks.
Dan Carpenter June 12, 2024, 10:06 a.m. UTC | #11
On Wed, Jun 12, 2024 at 10:50:02AM +0800, Kefeng Wang wrote:
> > b99a342d4f11a5 Huang Ying        2021-04-29  5364  out_map:
> > b99a342d4f11a5 Huang Ying        2021-04-29  5365  	/*
> > b99a342d4f11a5 Huang Ying        2021-04-29  5366  	 * Make it present again, depending on how arch implements
> > b99a342d4f11a5 Huang Ying        2021-04-29  5367  	 * non-accessible ptes, some can allow access by kernel mode.
> > b99a342d4f11a5 Huang Ying        2021-04-29  5368  	 */
> > d2136d749d76af Baolin Wang       2024-03-29  5369  	if (folio && folio_test_large(folio))
> > 
> > Are folio_test_large() and folio_is_zone_device() mutually exclusive?
> > If so then this is a false positive.  Just ignore the warning in that
> > case.
> > 
> 
> The folio in ZONE_DEVICE is not a large folio, so there is no issue for
> now, but I will fix it.
> 

If it's not an issue, then don't feel obligated to do anything.  These
> are a one-time email and then I treat it as addressed.  These warnings
are intended to be useful, not a burden.

regards,
dan carpenter
Kefeng Wang June 12, 2024, 12:32 p.m. UTC | #12
On 2024/6/12 18:06, Dan Carpenter wrote:
> On Wed, Jun 12, 2024 at 10:50:02AM +0800, Kefeng Wang wrote:
>>> [...]
>>
>> The folio in ZONE_DEVICE is not a large folio, so there is no issue for
>> now, but I will fix it.
>>
> 
> If it's not an issue, then don't feel obligated to do anything.  These
> are a one-time email and then I treat it as addressed.  These warnings
> are intended to be useful, not a burden.

The check is great, and the new version avoids passing nr_pages, so
there is no more warning, thanks.


Patch

diff --git a/mm/memory.c b/mm/memory.c
index db9130488231..0ad57b6485ca 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5223,15 +5223,21 @@  static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_str
 	update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1);
 }
 
-static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
-				       struct folio *folio, pte_t fault_pte,
-				       bool ignore_writable, bool pte_write_upgrade)
+static void numa_rebuild_large_mapping(struct vm_fault *vmf,
+		struct vm_area_struct *vma, struct folio *folio, int nr_pages,
+		pte_t fault_pte, bool ignore_writable, bool pte_write_upgrade)
 {
 	int nr = pte_pfn(fault_pte) - folio_pfn(folio);
-	unsigned long start = max(vmf->address - nr * PAGE_SIZE, vma->vm_start);
-	unsigned long end = min(vmf->address + (folio_nr_pages(folio) - nr) * PAGE_SIZE, vma->vm_end);
-	pte_t *start_ptep = vmf->pte - (vmf->address - start) / PAGE_SIZE;
-	unsigned long addr;
+	unsigned long folio_size = nr_pages * PAGE_SIZE;
+	unsigned long addr = vmf->address;
+	unsigned long start, end, align_addr;
+	pte_t *start_ptep;
+
+	align_addr = ALIGN_DOWN(addr, folio_size);
+	start = max3(addr - nr * PAGE_SIZE, align_addr, vma->vm_start);
+	end = min3(addr + (nr_pages - nr) * PAGE_SIZE, align_addr + folio_size,
+		   vma->vm_end);
+	start_ptep = vmf->pte - (addr - start) / PAGE_SIZE;
 
 	/* Restore all PTEs' mapping of the large folio */
 	for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) {
@@ -5361,8 +5367,8 @@  static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	 * non-accessible ptes, some can allow access by kernel mode.
 	 */
 	if (folio && folio_test_large(folio))
-		numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable,
-					   pte_write_upgrade);
+		numa_rebuild_large_mapping(vmf, vma, folio, nr_pages, pte,
+					   ignore_writable, pte_write_upgrade);
 	else
 		numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
 					    writable);