
mm: fix double page fault on arm64 if PTE_AF is cleared

Message ID 20190904005831.153934-1-justin.he@arm.com (mailing list archive)
State New, archived
Series: mm: fix double page fault on arm64 if PTE_AF is cleared

Commit Message

Jia He Sept. 4, 2019, 12:58 a.m. UTC
When we tested the pmdk unit test [1] vmmalloc_fork TEST1 in an arm64
guest, we hit a double page fault in __copy_from_user_inatomic of
cow_user_page.

The call trace below is from arm64 do_page_fault, for debugging purposes:
[  110.016195] Call trace:
[  110.016826]  do_page_fault+0x5a4/0x690
[  110.017812]  do_mem_abort+0x50/0xb0
[  110.018726]  el1_da+0x20/0xc4
[  110.019492]  __arch_copy_from_user+0x180/0x280
[  110.020646]  do_wp_page+0xb0/0x860
[  110.021517]  __handle_mm_fault+0x994/0x1338
[  110.022606]  handle_mm_fault+0xe8/0x180
[  110.023584]  do_page_fault+0x240/0x690
[  110.024535]  do_mem_abort+0x50/0xb0
[  110.025423]  el0_da+0x20/0x24

The pte info before __copy_from_user_inatomic is (PTE_AF is cleared):
[ffff9b007000] pgd=000000023d4f8003, pud=000000023da9b003, pmd=000000023d4b3003, pte=360000298607bd3

The key point is: we don't always have a hardware-managed access flag on
arm64.

The root cause is that copy_one_pte clears the PTE_AF for COW pages.
Generally, when a COW page is accessed by userspace, it is marked
accessed (the PTE_AF bit on arm64) by hardware, if the hardware feature
is supported. But on some arm64 platforms, the PTE_AF needs to be set
by software.

This patch fixes it by calling pte_mkyoung. Also, the parameters of
cow_user_page() are changed so that vmf can be passed in.

[1] https://github.com/pmem/pmdk/tree/master/src/test/vmmalloc_fork

Reported-by: Yibo Cai <Yibo.Cai@arm.com>
Signed-off-by: Jia He <justin.he@arm.com>
---
 mm/memory.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)
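
A minimal userspace sketch of the failing sequence, illustrative only: the
path, file size and fill byte are assumptions, a DAX mount and an arm64 CPU
without hardware access-flag management are required, and the real trigger
was the pmdk vmmalloc_fork test rather than this program.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical DAX-backed file; path, size and fill byte are assumptions. */
	int fd = open("/mnt/pmem/testfile", O_RDWR | O_CREAT, 0600);
	char buf[4096];
	char *p;

	if (fd < 0)
		return 1;
	memset(buf, 0xab, sizeof(buf));
	if (pwrite(fd, buf, sizeof(buf), 0) != (ssize_t)sizeof(buf))
		return 1;

	p = mmap(NULL, sizeof(buf), PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/* Read fault: installs the DAX pfn read-only and marks the pte young. */
	if (p[1] != (char)0xab)
		return 1;

	if (fork() == 0) {
		/*
		 * fork() marked the child's pte old (copy_one_pte).  This
		 * write goes do_wp_page() -> wp_page_copy() -> cow_user_page()
		 * with src == NULL, so the kernel copies the old contents
		 * through the user address.  With the bug, that copy fails
		 * and the rest of the page reads back as zeroes.
		 */
		p[0] = 0xcd;
		printf("p[1] = 0x%02x (expected 0xab)\n", (unsigned char)p[1]);
		_exit(0);
	}
	wait(NULL);
	return 0;
}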

Comments

Anshuman Khandual Sept. 4, 2019, 3:19 a.m. UTC | #1
On 09/04/2019 06:28 AM, Jia He wrote:
> When we tested pmdk unit test [1] vmmalloc_fork TEST1 in arm64 guest, there
> will be a double page fault in __copy_from_user_inatomic of cow_user_page.
> 
> Below call trace is from arm64 do_page_fault for debugging purpose
> [  110.016195] Call trace:
> [  110.016826]  do_page_fault+0x5a4/0x690
> [  110.017812]  do_mem_abort+0x50/0xb0
> [  110.018726]  el1_da+0x20/0xc4
> [  110.019492]  __arch_copy_from_user+0x180/0x280
> [  110.020646]  do_wp_page+0xb0/0x860
> [  110.021517]  __handle_mm_fault+0x994/0x1338
> [  110.022606]  handle_mm_fault+0xe8/0x180
> [  110.023584]  do_page_fault+0x240/0x690
> [  110.024535]  do_mem_abort+0x50/0xb0
> [  110.025423]  el0_da+0x20/0x24
> 
> The pte info before __copy_from_user_inatomic is(PTE_AF is cleared):
> [ffff9b007000] pgd=000000023d4f8003, pud=000000023da9b003, pmd=000000023d4b3003, pte=360000298607bd3
> 
> The keypoint is: we don't always have a hardware-managed access flag on
> arm64.
> 
> The root cause is in copy_one_pte, it will clear the PTE_AF for COW
> pages. Generally, when it is accessed by user, the COW pages will be set
> as accessed(PTE_AF bit on arm64) by hardware if hardware feature is
> supported. But on some arm64 platforms, the PTE_AF needs to be set by
> software.
> 
> This patch fix it by calling pte_mkyoung. Also, the parameter is
> changed because vmf should be passed to cow_user_page()
> 
> [1] https://github.com/pmem/pmdk/tree/master/src/test/vmmalloc_fork
> 
> Reported-by: Yibo Cai <Yibo.Cai@arm.com>
> Signed-off-by: Jia He <justin.he@arm.com>
> ---
>  mm/memory.c | 21 ++++++++++++++++-----
>  1 file changed, 16 insertions(+), 5 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index e2bb51b6242e..b1f9ace2e943 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2140,7 +2140,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
>  	return same;
>  }
>  
> -static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
> +static inline void cow_user_page(struct page *dst, struct page *src,
> +				struct vm_fault *vmf)
>  {
>  	debug_dma_assert_idle(src);
>  
> @@ -2152,20 +2153,30 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
>  	 */
>  	if (unlikely(!src)) {
>  		void *kaddr = kmap_atomic(dst);
> -		void __user *uaddr = (void __user *)(va & PAGE_MASK);
> +		void __user *uaddr = (void __user *)(vmf->address & PAGE_MASK);
> +		pte_t entry;
>  
>  		/*
>  		 * This really shouldn't fail, because the page is there
>  		 * in the page tables. But it might just be unreadable,
>  		 * in which case we just give up and fill the result with
> -		 * zeroes.
> +		 * zeroes. If PTE_AF is cleared on arm64, it might
> +		 * cause double page fault here. so makes pte young here
>  		 */
> +		if (!pte_young(vmf->orig_pte)) {
> +			entry = pte_mkyoung(vmf->orig_pte);
> +			if (ptep_set_access_flags(vmf->vma, vmf->address,
> +				vmf->pte, entry, vmf->flags & FAULT_FLAG_WRITE))
> +				update_mmu_cache(vmf->vma, vmf->address,
> +						vmf->pte);
> +		}
> +
>  		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))

Should not page faults be disabled when doing this? Ideally it should
also have called access_ok() on the user address range first. The point
is that the caller of __copy_from_user_inatomic() must make sure that
there cannot be any page fault while doing the actual copy. But it
should also be done in a generic way, something like access_ok(). The
current proposal here seems very specific to the arm64 case.
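
For reference, such a check would sit just before the copy, roughly like
this (a sketch only, using the two-argument access_ok() of v5.x kernels;
as Catalin notes further down in the thread, the vma has already been
validated, so this turns out to be unnecessary):

		/* Hypothetical placement inside cow_user_page(): */
		if (!access_ok(uaddr, PAGE_SIZE) ||
		    __copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
			clear_page(kaddr);	/* give up and zero-fill */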
Anshuman Khandual Sept. 4, 2019, 4:37 a.m. UTC | #2
On 09/04/2019 08:49 AM, Anshuman Khandual wrote:
>  		/*
>  		 * This really shouldn't fail, because the page is there
>  		 * in the page tables. But it might just be unreadable,
>  		 * in which case we just give up and fill the result with
> -		 * zeroes.
> +		 * zeroes. If PTE_AF is cleared on arm64, it might
> +		 * cause double page fault here. so makes pte young here
>  		 */
> +		if (!pte_young(vmf->orig_pte)) {
> +			entry = pte_mkyoung(vmf->orig_pte);
> +			if (ptep_set_access_flags(vmf->vma, vmf->address,
> +				vmf->pte, entry, vmf->flags & FAULT_FLAG_WRITE))
> +				update_mmu_cache(vmf->vma, vmf->address,
> +						vmf->pte);
> +		}

This looks correct: it updates the pte entry with PTE_AF, which
will prevent a subsequent page fault. But I think what we really need
here is to make sure 'uaddr' is mapped correctly at vma->pte. Probably
a generic function arch_map_pte(), when defined for arm64, should check
the CPU version and ensure continuance of PTE_AF if required. The comment
above also needs to be updated to say that not only should the page be
there in the page table, it needs to be mapped appropriately as well.
Jia He Sept. 4, 2019, 4:57 a.m. UTC | #3
Hi Anshuman, thanks for the comments, see below please

> -----Original Message-----
> From: Anshuman Khandual <anshuman.khandual@arm.com>
> Sent: September 4, 2019 12:38
> To: Justin He (Arm Technology China) <Justin.He@arm.com>; Andrew
> Morton <akpm@linux-foundation.org>; Matthew Wilcox
> <willy@infradead.org>; Jérôme Glisse <jglisse@redhat.com>; Ralph
> Campbell <rcampbell@nvidia.com>; Jason Gunthorpe <jgg@ziepe.ca>;
> Peter Zijlstra <peterz@infradead.org>; Dave Airlie <airlied@redhat.com>;
> Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>; Thomas Hellstrom
> <thellstrom@vmware.com>; Souptick Joarder <jrdr.linux@gmail.com>;
> linux-mm@kvack.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] mm: fix double page fault on arm64 if PTE_AF is
> cleared
>
>
>
> On 09/04/2019 08:49 AM, Anshuman Khandual wrote:
> >             /*
> >              * This really shouldn't fail, because the page is there
> >              * in the page tables. But it might just be unreadable,
> >              * in which case we just give up and fill the result with
> > -            * zeroes.
> > +            * zeroes. If PTE_AF is cleared on arm64, it might
> > +            * cause double page fault here. so makes pte young here
> >              */
> > +           if (!pte_young(vmf->orig_pte)) {
> > +                   entry = pte_mkyoung(vmf->orig_pte);
> > +                   if (ptep_set_access_flags(vmf->vma, vmf->address,
> > +                           vmf->pte, entry, vmf->flags &
> FAULT_FLAG_WRITE))
> > +                           update_mmu_cache(vmf->vma, vmf-
> >address,
> > +                                           vmf->pte);
> > +           }
>
> This looks correct where it updates the pte entry with PTE_AF which
> will prevent a subsequent page fault. But I think what we really need
> here is to make sure 'uaddr' is mapped correctly at vma->pte. Probably
> a generic function arch_map_pte() when defined for arm64 should check
> CPU version and ensure continuance of PTE_AF if required. The comment
> above also need to be updated saying not only the page should be there
> in the page table, it needs to mapped appropriately as well.

I agree that a generic interface is needed here, but not arch_map_pte().
In this case, I thought all the pgd/pud/pmd/pte entries had been set
correctly except for the PTE_AF bit.
How about arch_hw_access_flag()?
If non-arm64, arch_hw_access_flag() == true;
if arm64 with the hardware-managed access flag supported, == true;
else == false?
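
A sketch of that shape (arch_hw_access_flag() is only a proposal at this
point; the name, the boolean convention and the guard below are taken from
this message, nothing here exists in the tree):

/* Generic default, e.g. in a core header: hardware maintains the access flag. */
#ifndef arch_hw_access_flag
static inline bool arch_hw_access_flag(void) { return true; }
#endif

	/* cow_user_page() would then only touch the pte when needed: */
	if (!arch_hw_access_flag() && !pte_young(vmf->orig_pte)) {
		entry = pte_mkyoung(vmf->orig_pte);
		if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte,
					  entry, vmf->flags & FAULT_FLAG_WRITE))
			update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
	}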


--
Cheers,
Justin (Jia He)


Anshuman Khandual Sept. 4, 2019, 5:28 a.m. UTC | #4
On 09/04/2019 10:27 AM, Justin He (Arm Technology China) wrote:
> Hi Anshuman, thanks for the comments, see below please
> 
>> -----Original Message-----
>> From: Anshuman Khandual <anshuman.khandual@arm.com>
>> Sent: September 4, 2019 12:38
>> To: Justin He (Arm Technology China) <Justin.He@arm.com>; Andrew
>> Morton <akpm@linux-foundation.org>; Matthew Wilcox
>> <willy@infradead.org>; Jérôme Glisse <jglisse@redhat.com>; Ralph
>> Campbell <rcampbell@nvidia.com>; Jason Gunthorpe <jgg@ziepe.ca>;
>> Peter Zijlstra <peterz@infradead.org>; Dave Airlie <airlied@redhat.com>;
>> Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>; Thomas Hellstrom
>> <thellstrom@vmware.com>; Souptick Joarder <jrdr.linux@gmail.com>;
>> linux-mm@kvack.org; linux-kernel@vger.kernel.org
>> Subject: Re: [PATCH] mm: fix double page fault on arm64 if PTE_AF is
>> cleared
>>
>>
>>
>> On 09/04/2019 08:49 AM, Anshuman Khandual wrote:
>>>  		/*
>>>  		 * This really shouldn't fail, because the page is there
>>>  		 * in the page tables. But it might just be unreadable,
>>>  		 * in which case we just give up and fill the result with
>>> -		 * zeroes.
>>> +		 * zeroes. If PTE_AF is cleared on arm64, it might
>>> +		 * cause double page fault here. so makes pte young here
>>>  		 */
>>> +		if (!pte_young(vmf->orig_pte)) {
>>> +			entry = pte_mkyoung(vmf->orig_pte);
>>> +			if (ptep_set_access_flags(vmf->vma, vmf->address,
>>> +				vmf->pte, entry, vmf->flags &
>> FAULT_FLAG_WRITE))
>>> +				update_mmu_cache(vmf->vma, vmf-
>>> address,
>>> +						vmf->pte);
>>> +		}
>>
>> This looks correct where it updates the pte entry with PTE_AF which
>> will prevent a subsequent page fault. But I think what we really need
>> here is to make sure 'uaddr' is mapped correctly at vma->pte. Probably
>> a generic function arch_map_pte() when defined for arm64 should check
>> CPU version and ensure continuance of PTE_AF if required. The comment
>> above also need to be updated saying not only the page should be there
>> in the page table, it needs to mapped appropriately as well.
> 
> I agree that a generic interface here is needed but not the arch_map_pte().
> In this case, I thought all the pgd/pud/pmd/pte had been set correctly except
> for the PTE_AF bit.
> How about arch_hw_access_flag()?

Sure, go ahead. I just meant 'map' to include not only the PFN but also
the appropriate HW attributes, so as not to cause a page fault.

> If non-arm64, arch_hw_access_flag() == true

The function does not need to return anything. A dummy default definition
in generic MM will do nothing when the arch does not override it.

> If arm64 with hardware-managed access flag supported, == true
> else == false?

On arm64 with the hardware-managed access flag supported, it will do nothing.
But in case it's not supported, the above-mentioned pte update as in the
current proposal needs to be executed. The details should be hidden in the
arch-specific override.
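
Putting that together, the refined interface would look roughly like this
(a sketch only: arch_hw_access_flag() was never merged, and cpu_has_hw_af()
is assumed here as a stand-in for the arm64 hardware-AF capability check):

/* Generic MM (e.g. mm/memory.c or a core header): dummy default that does
 * nothing unless an architecture overrides it. */
#ifndef arch_hw_access_flag
static inline void arch_hw_access_flag(struct vm_fault *vmf) { }
#endif

/* arm64 header: override that sets the access flag by hand when the CPU
 * does not manage it in hardware (cpu_has_hw_af() is assumed). */
static inline void arch_hw_access_flag(struct vm_fault *vmf)
{
	if (cpu_has_hw_af() || pte_young(vmf->orig_pte))
		return;

	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte,
				  pte_mkyoung(vmf->orig_pte),
				  vmf->flags & FAULT_FLAG_WRITE))
		update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
}
#define arch_hw_access_flag arch_hw_access_flag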
Jia He Sept. 4, 2019, 5:41 a.m. UTC | #5
Hi Anshuman

> -----Original Message-----
> From: Anshuman Khandual <anshuman.khandual@arm.com>
> Sent: September 4, 2019 13:29
> To: Justin He (Arm Technology China) <Justin.He@arm.com>; Andrew
> Morton <akpm@linux-foundation.org>; Matthew Wilcox
> <willy@infradead.org>; Jérôme Glisse <jglisse@redhat.com>; Ralph
> Campbell <rcampbell@nvidia.com>; Jason Gunthorpe <jgg@ziepe.ca>;
> Peter Zijlstra <peterz@infradead.org>; Dave Airlie <airlied@redhat.com>;
> Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>; Thomas Hellstrom
> <thellstrom@vmware.com>; Souptick Joarder <jrdr.linux@gmail.com>;
> linux-mm@kvack.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] mm: fix double page fault on arm64 if PTE_AF is
> cleared
>
>
>
> On 09/04/2019 10:27 AM, Justin He (Arm Technology China) wrote:
> > Hi Anshuman, thanks for the comments, see below please
> >
> >> -----Original Message-----
> >> From: Anshuman Khandual <anshuman.khandual@arm.com>
> >> Sent: September 4, 2019 12:38
> >> To: Justin He (Arm Technology China) <Justin.He@arm.com>; Andrew
> >> Morton <akpm@linux-foundation.org>; Matthew Wilcox
> >> <willy@infradead.org>; Jérôme Glisse <jglisse@redhat.com>; Ralph
> >> Campbell <rcampbell@nvidia.com>; Jason Gunthorpe <jgg@ziepe.ca>;
> >> Peter Zijlstra <peterz@infradead.org>; Dave Airlie <airlied@redhat.com>;
> >> Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>; Thomas Hellstrom
> >> <thellstrom@vmware.com>; Souptick Joarder <jrdr.linux@gmail.com>;
> >> linux-mm@kvack.org; linux-kernel@vger.kernel.org
> >> Subject: Re: [PATCH] mm: fix double page fault on arm64 if PTE_AF is
> >> cleared
> >>
> >>
> >>
> >> On 09/04/2019 08:49 AM, Anshuman Khandual wrote:
> >>>           /*
> >>>            * This really shouldn't fail, because the page is there
> >>>            * in the page tables. But it might just be unreadable,
> >>>            * in which case we just give up and fill the result with
> >>> -          * zeroes.
> >>> +          * zeroes. If PTE_AF is cleared on arm64, it might
> >>> +          * cause double page fault here. so makes pte young here
> >>>            */
> >>> +         if (!pte_young(vmf->orig_pte)) {
> >>> +                 entry = pte_mkyoung(vmf->orig_pte);
> >>> +                 if (ptep_set_access_flags(vmf->vma, vmf->address,
> >>> +                         vmf->pte, entry, vmf->flags &
> >> FAULT_FLAG_WRITE))
> >>> +                         update_mmu_cache(vmf->vma, vmf-
> >>> address,
> >>> +                                         vmf->pte);
> >>> +         }
> >>
> >> This looks correct where it updates the pte entry with PTE_AF which
> >> will prevent a subsequent page fault. But I think what we really need
> >> here is to make sure 'uaddr' is mapped correctly at vma->pte. Probably
> >> a generic function arch_map_pte() when defined for arm64 should check
> >> CPU version and ensure continuance of PTE_AF if required. The
> comment
> >> above also need to be updated saying not only the page should be there
> >> in the page table, it needs to mapped appropriately as well.
> >
> > I agree that a generic interface here is needed but not the arch_map_pte().
> > In this case, I thought all the pgd/pud/pmd/pte had been set correctly
> except
> > for the PTE_AF bit.
> > How about arch_hw_access_flag()?
>
> Sure, go ahead. I just meant 'map' to include not only the PFN but also
> appropriate HW attributes not cause a page fault.
>
> > If non-arm64, arch_hw_access_flag() == true
>
> The function does not need to return anything. Dummy default definition
> in generic MM will do nothing when arch does not override.
>

Ok, got it, thanks

--
Cheers,
Justin (Jia He)



> > If arm64 with hardware-managed access flag supported, == true
> > else == false?
>
> On arm64 with hardware-managed access flag supported, it will do nothing.
> But in case its not supported the above mentioned pte update as in the
> current proposal needs to be executed. The details should hide in arch
> specific override.
Catalin Marinas Sept. 4, 2019, 1:49 p.m. UTC | #6
On Wed, 4 Sep 2019 at 01:59, Jia He <justin.he@arm.com> wrote:
> @@ -2152,20 +2153,30 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
>          */
>         if (unlikely(!src)) {
>                 void *kaddr = kmap_atomic(dst);
> -               void __user *uaddr = (void __user *)(va & PAGE_MASK);
> +               void __user *uaddr = (void __user *)(vmf->address & PAGE_MASK);
> +               pte_t entry;
>
>                 /*
>                  * This really shouldn't fail, because the page is there
>                  * in the page tables. But it might just be unreadable,
>                  * in which case we just give up and fill the result with
> -                * zeroes.
> +                * zeroes. If PTE_AF is cleared on arm64, it might
> +                * cause double page fault here. so makes pte young here
>                  */
> +               if (!pte_young(vmf->orig_pte)) {
> +                       entry = pte_mkyoung(vmf->orig_pte);
> +                       if (ptep_set_access_flags(vmf->vma, vmf->address,
> +                               vmf->pte, entry, vmf->flags & FAULT_FLAG_WRITE))

I think you need to pass dirty = 0 to ptep_set_access_flags() rather
than the vmf->flags & FAULT_FLAG_WRITE. This is copying from the user
address into a kernel mapping and the fault you want to prevent is a
read access on uaddr via __copy_from_user_inatomic(). The pte will be
made writable in the wp_page_copy() function.
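
So the hunk would become something along these lines (a sketch of the
suggested adjustment, not a posted revision):

		if (!pte_young(vmf->orig_pte)) {
			entry = pte_mkyoung(vmf->orig_pte);
			/* dirty = 0: we only need the pte young for the read
			 * through uaddr; wp_page_copy() makes it writable later */
			if (ptep_set_access_flags(vmf->vma, vmf->address,
						  vmf->pte, entry, 0))
				update_mmu_cache(vmf->vma, vmf->address,
						 vmf->pte);
		}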
Catalin Marinas Sept. 4, 2019, 2:22 p.m. UTC | #7
On Wed, 4 Sep 2019 at 04:20, Anshuman Khandual
<anshuman.khandual@arm.com> wrote:
> On 09/04/2019 06:28 AM, Jia He wrote:
> > @@ -2152,20 +2153,30 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
> >        */
> >       if (unlikely(!src)) {
> >               void *kaddr = kmap_atomic(dst);
> > -             void __user *uaddr = (void __user *)(va & PAGE_MASK);
> > +             void __user *uaddr = (void __user *)(vmf->address & PAGE_MASK);
> > +             pte_t entry;
> >
> >               /*
> >                * This really shouldn't fail, because the page is there
> >                * in the page tables. But it might just be unreadable,
> >                * in which case we just give up and fill the result with
> > -              * zeroes.
> > +              * zeroes. If PTE_AF is cleared on arm64, it might
> > +              * cause double page fault here. so makes pte young here
> >                */
> > +             if (!pte_young(vmf->orig_pte)) {
> > +                     entry = pte_mkyoung(vmf->orig_pte);
> > +                     if (ptep_set_access_flags(vmf->vma, vmf->address,
> > +                             vmf->pte, entry, vmf->flags & FAULT_FLAG_WRITE))
> > +                             update_mmu_cache(vmf->vma, vmf->address,
> > +                                             vmf->pte);
> > +             }
> > +
> >               if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
>
> Should not page fault be disabled when doing this ?

Page faults are already disabled by the kmap_atomic(). But that only
means that you don't deadlock trying to take the mmap_sem again.

> Ideally it should
> have also called access_ok() on the user address range first.

Not necessary, we've already got a vma and the access to the vma checked.

> The point
> is that the caller of __copy_from_user_inatomic() must make sure that
> there cannot be any page fault while doing the actual copy.

When you copy from a user address, in general that's not guaranteed,
more of a best effort.

> But also it
> should be done in generic way, something like in access_ok(). The current
> proposal here seems very specific to arm64 case.

The commit log didn't explain the problem properly. On arm64 without
hardware Access Flag, copying from user will fail because the pte is
old and cannot be marked young. So we always end up with a zeroed page
after fork() + CoW for pfn mappings.
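
Annotated, the sequence that fails in cow_user_page() before the fix looks
roughly like this (the code is from mm/memory.c, the comments are
illustrative):

	if (unlikely(!src)) {			/* pfn mapping: no struct page to copy from */
		void *kaddr = kmap_atomic(dst);	/* also disables page faults */
		void __user *uaddr = (void __user *)(va & PAGE_MASK);

		/*
		 * Without a hardware-managed access flag, the read of uaddr
		 * takes an access flag fault; page faults are disabled, so
		 * the pte cannot be marked young here, the copy returns
		 * non-zero and ...
		 */
		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
			clear_page(kaddr);	/* ... the new page is filled with zeroes */
		kunmap_atomic(kaddr);
		flush_dcache_page(dst);
	}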
Jia He Sept. 5, 2019, 1:18 a.m. UTC | #8
Hi Catalin

> -----Original Message-----
> From: Catalin Marinas <catalin.marinas@arm.com>
> Sent: September 4, 2019 22:22
> To: Anshuman Khandual <Anshuman.Khandual@arm.com>
> Cc: Justin He (Arm Technology China) <Justin.He@arm.com>; Andrew
> Morton <akpm@linux-foundation.org>; Matthew Wilcox
> <willy@infradead.org>; Jérôme Glisse <jglisse@redhat.com>; Ralph
> Campbell <rcampbell@nvidia.com>; Jason Gunthorpe <jgg@ziepe.ca>;
> Peter Zijlstra <peterz@infradead.org>; Dave Airlie <airlied@redhat.com>;
> Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>; Thomas Hellstrom
> <thellstrom@vmware.com>; Souptick Joarder <jrdr.linux@gmail.com>;
> linux-mm <linux-mm@kvack.org>; Linux Kernel Mailing List <linux-
> kernel@vger.kernel.org>
> Subject: Re: [PATCH] mm: fix double page fault on arm64 if PTE_AF is
> cleared
>
> On Wed, 4 Sep 2019 at 04:20, Anshuman Khandual
> <anshuman.khandual@arm.com> wrote:
> > On 09/04/2019 06:28 AM, Jia He wrote:
> > > @@ -2152,20 +2153,30 @@ static inline void cow_user_page(struct
> page *dst, struct page *src, unsigned lo
> > >        */
> > >       if (unlikely(!src)) {
> > >               void *kaddr = kmap_atomic(dst);
> > > -             void __user *uaddr = (void __user *)(va & PAGE_MASK);
> > > +             void __user *uaddr = (void __user *)(vmf->address &
> PAGE_MASK);
> > > +             pte_t entry;
> > >
> > >               /*
> > >                * This really shouldn't fail, because the page is there
> > >                * in the page tables. But it might just be unreadable,
> > >                * in which case we just give up and fill the result with
> > > -              * zeroes.
> > > +              * zeroes. If PTE_AF is cleared on arm64, it might
> > > +              * cause double page fault here. so makes pte young here
> > >                */
> > > +             if (!pte_young(vmf->orig_pte)) {
> > > +                     entry = pte_mkyoung(vmf->orig_pte);
> > > +                     if (ptep_set_access_flags(vmf->vma, vmf->address,
> > > +                             vmf->pte, entry, vmf->flags & FAULT_FLAG_WRITE))
> > > +                             update_mmu_cache(vmf->vma, vmf->address,
> > > +                                             vmf->pte);
> > > +             }
> > > +
> > >               if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
> >
> > Should not page fault be disabled when doing this ?
>
> Page faults are already disabled by the kmap_atomic(). But that only
> means that you don't deadlock trying to take the mmap_sem again.
>
> > Ideally it should
> > have also called access_ok() on the user address range first.
>
> Not necessary, we've already got a vma and the access to the vma checked.
>
> > The point
> > is that the caller of __copy_from_user_inatomic() must make sure that
> > there cannot be any page fault while doing the actual copy.
>
> When you copy from a user address, in general that's not guaranteed,
> more of a best effort.
>
> > But also it
> > should be done in generic way, something like in access_ok(). The current
> > proposal here seems very specific to arm64 case.
>
> The commit log didn't explain the problem properly. On arm64 without
> hardware Access Flag, copying from user will fail because the pte is
> old and cannot be marked young. So we always end up with zeroed page
> after fork() + CoW for pfn mappings.
>

Ok I will update it, thanks

--
Cheers,
Justin (Jia He)


> --
> Catalin
Jia He Sept. 5, 2019, 1:21 a.m. UTC | #9
> -----Original Message-----
> From: Catalin Marinas <catalin.marinas@arm.com>
> Sent: September 4, 2019 21:49
> To: Justin He (Arm Technology China) <Justin.He@arm.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>; Matthew Wilcox
> <willy@infradead.org>; Jérôme Glisse <jglisse@redhat.com>; Ralph
> Campbell <rcampbell@nvidia.com>; Jason Gunthorpe <jgg@ziepe.ca>;
> Peter Zijlstra <peterz@infradead.org>; Dave Airlie <airlied@redhat.com>;
> Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>; Thomas Hellstrom
> <thellstrom@vmware.com>; Souptick Joarder <jrdr.linux@gmail.com>;
> linux-mm <linux-mm@kvack.org>; Linux Kernel Mailing List <linux-
> kernel@vger.kernel.org>
> Subject: Re: [PATCH] mm: fix double page fault on arm64 if PTE_AF is
> cleared
>
> On Wed, 4 Sep 2019 at 01:59, Jia He <justin.he@arm.com> wrote:
> > @@ -2152,20 +2153,30 @@ static inline void cow_user_page(struct page
> *dst, struct page *src, unsigned lo
> >          */
> >         if (unlikely(!src)) {
> >                 void *kaddr = kmap_atomic(dst);
> > -               void __user *uaddr = (void __user *)(va & PAGE_MASK);
> > +               void __user *uaddr = (void __user *)(vmf->address &
> PAGE_MASK);
> > +               pte_t entry;
> >
> >                 /*
> >                  * This really shouldn't fail, because the page is there
> >                  * in the page tables. But it might just be unreadable,
> >                  * in which case we just give up and fill the result with
> > -                * zeroes.
> > +                * zeroes. If PTE_AF is cleared on arm64, it might
> > +                * cause double page fault here. so makes pte young here
> >                  */
> > +               if (!pte_young(vmf->orig_pte)) {
> > +                       entry = pte_mkyoung(vmf->orig_pte);
> > +                       if (ptep_set_access_flags(vmf->vma, vmf->address,
> > +                               vmf->pte, entry, vmf->flags & FAULT_FLAG_WRITE))
>
> I think you need to pass dirty = 0 to ptep_set_access_flags() rather
> than the vmf->flags & FAULT_FLAG_WRITE. This is copying from the user
> address into a kernel mapping and the fault you want to prevent is a
> read access on uaddr via __copy_from_user_inatomic(). The pte will be
> made writable in the wp_page_copy() function.

Ok, thanks

--
Cheers,
Justin (Jia He)


>
> --
> Catalin

Patch

diff --git a/mm/memory.c b/mm/memory.c
index e2bb51b6242e..b1f9ace2e943 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2140,7 +2140,8 @@  static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
 	return same;
 }
 
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
+static inline void cow_user_page(struct page *dst, struct page *src,
+				struct vm_fault *vmf)
 {
 	debug_dma_assert_idle(src);
 
@@ -2152,20 +2153,30 @@  static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 	 */
 	if (unlikely(!src)) {
 		void *kaddr = kmap_atomic(dst);
-		void __user *uaddr = (void __user *)(va & PAGE_MASK);
+		void __user *uaddr = (void __user *)(vmf->address & PAGE_MASK);
+		pte_t entry;
 
 		/*
 		 * This really shouldn't fail, because the page is there
 		 * in the page tables. But it might just be unreadable,
 		 * in which case we just give up and fill the result with
-		 * zeroes.
+		 * zeroes. If PTE_AF is cleared on arm64, it might
+		 * cause double page fault here. so makes pte young here
 		 */
+		if (!pte_young(vmf->orig_pte)) {
+			entry = pte_mkyoung(vmf->orig_pte);
+			if (ptep_set_access_flags(vmf->vma, vmf->address,
+				vmf->pte, entry, vmf->flags & FAULT_FLAG_WRITE))
+				update_mmu_cache(vmf->vma, vmf->address,
+						vmf->pte);
+		}
+
 		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
 			clear_page(kaddr);
 		kunmap_atomic(kaddr);
 		flush_dcache_page(dst);
 	} else
-		copy_user_highpage(dst, src, va, vma);
+		copy_user_highpage(dst, src, vmf->address, vmf->vma);
 }
 
 static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
@@ -2318,7 +2329,7 @@  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 				vmf->address);
 		if (!new_page)
 			goto oom;
-		cow_user_page(new_page, old_page, vmf->address, vma);
+		cow_user_page(new_page, old_page, vmf);
 	}
 
 	if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))