diff mbox series

[v2] mm: Fix modifying of page protection by insert_pfn_pmd()

Message ID 20190402115125.18803-1-aneesh.kumar@linux.ibm.com (mailing list archive)
State New, archived
Headers show
Series [v2] mm: Fix modifying of page protection by insert_pfn_pmd() | expand

Commit Message

Aneesh Kumar K.V April 2, 2019, 11:51 a.m. UTC
With some architectures like ppc64, set_pmd_at() cannot cope with
a situation where there is already some (different) valid entry present.

Use pmdp_set_access_flags() instead to modify the pfn which is built to
deal with modifying existing PMD entries.

This is similar to
commit cae85cb8add3 ("mm/memory.c: fix modifying of page protection by insert_pfn()")

We also do similar update w.r.t insert_pfn_pud eventhough ppc64 don't support
pud pfn entries now.

Without this patch we also see the below message in kernel log
"BUG: non-zero pgtables_bytes on freeing mm:"

CC: stable@vger.kernel.org
Reported-by: Chandan Rajendra <chandan@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
Changes from v1:
* Fix the pgtable leak 

 mm/huge_memory.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

Comments

Jan Kara April 2, 2019, 3:24 p.m. UTC | #1
On Tue 02-04-19 17:21:25, Aneesh Kumar K.V wrote:
> With some architectures like ppc64, set_pmd_at() cannot cope with
> a situation where there is already some (different) valid entry present.
> 
> Use pmdp_set_access_flags() instead to modify the pfn which is built to
> deal with modifying existing PMD entries.
> 
> This is similar to
> commit cae85cb8add3 ("mm/memory.c: fix modifying of page protection by insert_pfn()")
> 
> We also do similar update w.r.t insert_pfn_pud eventhough ppc64 don't support
> pud pfn entries now.
> 
> Without this patch we also see the below message in kernel log
> "BUG: non-zero pgtables_bytes on freeing mm:"
> 
> CC: stable@vger.kernel.org
> Reported-by: Chandan Rajendra <chandan@linux.ibm.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

Looks good to me. You can add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
> Changes from v1:
> * Fix the pgtable leak 
> 
>  mm/huge_memory.c | 36 ++++++++++++++++++++++++++++++++++++
>  1 file changed, 36 insertions(+)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 404acdcd0455..165ea46bf149 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -755,6 +755,21 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
>  	spinlock_t *ptl;
>  
>  	ptl = pmd_lock(mm, pmd);
> +	if (!pmd_none(*pmd)) {
> +		if (write) {
> +			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
> +				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
> +				goto out_unlock;
> +			}
> +			entry = pmd_mkyoung(*pmd);
> +			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> +			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
> +				update_mmu_cache_pmd(vma, addr, pmd);
> +		}
> +
> +		goto out_unlock;
> +	}
> +
>  	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
>  	if (pfn_t_devmap(pfn))
>  		entry = pmd_mkdevmap(entry);
> @@ -766,11 +781,16 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
>  	if (pgtable) {
>  		pgtable_trans_huge_deposit(mm, pmd, pgtable);
>  		mm_inc_nr_ptes(mm);
> +		pgtable = NULL;
>  	}
>  
>  	set_pmd_at(mm, addr, pmd, entry);
>  	update_mmu_cache_pmd(vma, addr, pmd);
> +
> +out_unlock:
>  	spin_unlock(ptl);
> +	if (pgtable)
> +		pte_free(mm, pgtable);
>  }
>  
>  vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
> @@ -821,6 +841,20 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
>  	spinlock_t *ptl;
>  
>  	ptl = pud_lock(mm, pud);
> +	if (!pud_none(*pud)) {
> +		if (write) {
> +			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
> +				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
> +				goto out_unlock;
> +			}
> +			entry = pud_mkyoung(*pud);
> +			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
> +			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
> +				update_mmu_cache_pud(vma, addr, pud);
> +		}
> +		goto out_unlock;
> +	}
> +
>  	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
>  	if (pfn_t_devmap(pfn))
>  		entry = pud_mkdevmap(entry);
> @@ -830,6 +864,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
>  	}
>  	set_pud_at(mm, addr, pud, entry);
>  	update_mmu_cache_pud(vma, addr, pud);
> +
> +out_unlock:
>  	spin_unlock(ptl);
>  }
>  
> -- 
> 2.20.1
>
Sasha Levin April 3, 2019, 12:29 p.m. UTC | #2
Hi,

[This is an automated email]

This commit has been processed because it contains a -stable tag.
The stable tag indicates that it's relevant for the following trees: all

The bot has tested the following trees: v5.0.5, v4.19.32, v4.14.109, v4.9.166, v4.4.177, v3.18.137.

v5.0.5: Build OK!
v4.19.32: Build OK!
v4.14.109: Failed to apply! Possible dependencies:
    b4e98d9ac775 ("mm: account pud page tables")
    c4812909f5d5 ("mm: introduce wrappers to access mm->nr_ptes")

v4.9.166: Failed to apply! Possible dependencies:
    166f61b9435a ("mm: codgin-style fixes")
    505a60e22560 ("asm-generic: introduce 5level-fixup.h")
    5c6a84a3f455 ("mm/kasan: Switch to using __pa_symbol and lm_alias")
    82b0f8c39a38 ("mm: join struct fault_env and vm_fault")
    953c66c2b22a ("mm: THP page cache support for ppc64")
    b279ddc33824 ("mm: clarify mm_struct.mm_{users,count} documentation")
    b4e98d9ac775 ("mm: account pud page tables")
    c2febafc6773 ("mm: convert generic code to 5-level paging")
    c4812909f5d5 ("mm: introduce wrappers to access mm->nr_ptes")

v4.4.177: Failed to apply! Possible dependencies:
    01871e59af5c ("mm, dax: fix livelock, allow dax pmd mappings to become writeable")
    01c8f1c44b83 ("mm, dax, gpu: convert vm_insert_mixed to pfn_t")
    0e749e54244e ("dax: increase granularity of dax_clear_blocks() operations")
    1bdb2d4ee05f ("ARM: split off core mapping logic from create_mapping")
    34c0fd540e79 ("mm, dax, pmem: introduce pfn_t")
    3ed3a4f0ddff ("mm: cleanup *pte_alloc* interfaces")
    52db400fcd50 ("pmem, dax: clean up clear_pmem()")
    7bc3777ca19c ("sparc64: Trim page tables for 8M hugepages")
    b2e0d1625e19 ("dax: fix lifetime of in-kernel dax mappings with dax_map_atomic()")
    c4812909f5d5 ("mm: introduce wrappers to access mm->nr_ptes")
    c7936206b971 ("ARM: implement create_mapping_late() for EFI use")
    f25748e3c34e ("mm, dax: convert vmf_insert_pfn_pmd() to pfn_t")
    f579b2b10412 ("ARM: factor out allocation routine from __create_mapping()")

v3.18.137: Failed to apply! Possible dependencies:
    047fc8a1f9a6 ("libnvdimm, nfit, nd_blk: driver for BLK-mode access persistent memory")
    2a3746984c98 ("x86: Use new cache mode type in track_pfn_remap() and track_pfn_insert()")
    34c0fd540e79 ("mm, dax, pmem: introduce pfn_t")
    4c1eaa2344fb ("drivers/block/pmem: Fix 32-bit build warning in pmem_alloc()")
    5cad465d7fa6 ("mm: add vmf_insert_pfn_pmd()")
    61031952f4c8 ("arch, x86: pmem api for ensuring durability of persistent memory updates")
    62232e45f4a2 ("libnvdimm: control (ioctl) messages for nvdimm_bus and nvdimm devices")
    777783e0abae ("staging: android: binder: move to the "real" part of the kernel")
    957e3facd147 ("gcov: enable GCOV_PROFILE_ALL from ARCH Kconfigs")
    9e853f2313e5 ("drivers/block/pmem: Add a driver for persistent memory")
    9f53f9fa4ad1 ("libnvdimm, pmem: add libnvdimm support to the pmem driver")
    b94d5230d06e ("libnvdimm, nfit: initial libnvdimm infrastructure and NFIT support")
    cb389b9c0e00 ("dax: drop size parameter to ->direct_access()")
    dd22f551ac0a ("block: Change direct_access calling convention")
    e2e05394e4a3 ("pmem, dax: have direct_access use __pmem annotation")
    ec776ef6bbe1 ("x86/mm: Add support for the non-standard protected e820 type")
    f0dc089ce217 ("libnvdimm: enable iostat")
    f25748e3c34e ("mm, dax: convert vmf_insert_pfn_pmd() to pfn_t")


How should we proceed with this patch?

--
Thanks,
Sasha
Aneesh Kumar K.V April 9, 2019, 4:46 a.m. UTC | #3
Sasha Levin <sashal@kernel.org> writes:

> Hi,
>
> [This is an automated email]
>
> This commit has been processed because it contains a -stable tag.
> The stable tag indicates that it's relevant for the following trees: all
>
> The bot has tested the following trees: v5.0.5, v4.19.32, v4.14.109, v4.9.166, v4.4.177, v3.18.137.
>
> v5.0.5: Build OK!
> v4.19.32: Build OK!

Considering this only impact ppc64 for now I guess it is ok to apply
this to the above two kernel versions. The SCM support for ppc64 was
added via

git describe --contains b5beae5e224f1c72c4482b0ab36fc3d89481a6b2
v4.20-rc1~24^2~68

powerpc/pseries: Add driver for PAPR SCM regions

-aneesh
Dan Williams April 24, 2019, 5:13 p.m. UTC | #4
On Tue, Apr 2, 2019 at 4:51 AM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> With some architectures like ppc64, set_pmd_at() cannot cope with
> a situation where there is already some (different) valid entry present.
>
> Use pmdp_set_access_flags() instead to modify the pfn which is built to
> deal with modifying existing PMD entries.
>
> This is similar to
> commit cae85cb8add3 ("mm/memory.c: fix modifying of page protection by insert_pfn()")
>
> We also do similar update w.r.t insert_pfn_pud eventhough ppc64 don't support
> pud pfn entries now.
>
> Without this patch we also see the below message in kernel log
> "BUG: non-zero pgtables_bytes on freeing mm:"
>
> CC: stable@vger.kernel.org
> Reported-by: Chandan Rajendra <chandan@linux.ibm.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
> Changes from v1:
> * Fix the pgtable leak
>
>  mm/huge_memory.c | 36 ++++++++++++++++++++++++++++++++++++
>  1 file changed, 36 insertions(+)

This patch is triggering the following bug in v4.19.35.

 kernel BUG at arch/x86/mm/pgtable.c:515!
 invalid opcode: 0000 [#1] SMP NOPTI
 CPU: 51 PID: 43713 Comm: java Tainted: G           OE     4.19.35 #1
 RIP: 0010:pmdp_set_access_flags+0x48/0x50
 [..]
 Call Trace:
  vmf_insert_pfn_pmd+0x198/0x350
  dax_iomap_fault+0xe82/0x1190
  ext4_dax_huge_fault+0x103/0x1f0
  ? __switch_to_asm+0x40/0x70
  __handle_mm_fault+0x3f6/0x1370
  ? __switch_to_asm+0x34/0x70
  ? __switch_to_asm+0x40/0x70
  handle_mm_fault+0xda/0x200
  __do_page_fault+0x249/0x4f0
  do_page_fault+0x32/0x110
  ? page_fault+0x8/0x30
  page_fault+0x1e/0x30

I asked the reporter to try a kernel with commit c6f3c5ee40c1
"mm/huge_memory.c: fix modifying of page protection by
insert_pfn_pmd()" reverted and the failing test passed.

I think unaligned addresses have always been passed to
vmf_insert_pfn_pmd(), but nothing cared until this patch. I *think*
the only change needed is the following, thoughts?

diff --git a/fs/dax.c b/fs/dax.c
index ca0671d55aa6..82aee9a87efa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1560,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct
vm_fault *vmf, pfn_t *pfnp,
                }

                trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
-               result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+               result = vmf_insert_pfn_pmd(vma, pmd_addr, vmf->pmd, pfn,
                                            write);
                break;
        case IOMAP_UNWRITTEN:


I'll ask the reporter to try this fix as well.
Matthew Wilcox (Oracle) April 24, 2019, 5:38 p.m. UTC | #5
On Wed, Apr 24, 2019 at 10:13:15AM -0700, Dan Williams wrote:
> I think unaligned addresses have always been passed to
> vmf_insert_pfn_pmd(), but nothing cared until this patch. I *think*
> the only change needed is the following, thoughts?
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index ca0671d55aa6..82aee9a87efa 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1560,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct
> vm_fault *vmf, pfn_t *pfnp,
>                 }
> 
>                 trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
> -               result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
> +               result = vmf_insert_pfn_pmd(vma, pmd_addr, vmf->pmd, pfn,
>                                             write);

We also call vmf_insert_pfn_pmd() in dax_insert_pfn_mkwrite() -- does
that need to change too?
Dan Williams April 24, 2019, 6:13 p.m. UTC | #6
On Wed, Apr 24, 2019 at 10:38 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Wed, Apr 24, 2019 at 10:13:15AM -0700, Dan Williams wrote:
> > I think unaligned addresses have always been passed to
> > vmf_insert_pfn_pmd(), but nothing cared until this patch. I *think*
> > the only change needed is the following, thoughts?
> >
> > diff --git a/fs/dax.c b/fs/dax.c
> > index ca0671d55aa6..82aee9a87efa 100644
> > --- a/fs/dax.c
> > +++ b/fs/dax.c
> > @@ -1560,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct
> > vm_fault *vmf, pfn_t *pfnp,
> >                 }
> >
> >                 trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
> > -               result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
> > +               result = vmf_insert_pfn_pmd(vma, pmd_addr, vmf->pmd, pfn,
> >                                             write);
>
> We also call vmf_insert_pfn_pmd() in dax_insert_pfn_mkwrite() -- does
> that need to change too?

It wasn't clear to me that it was a problem. I think that one already
happens to be pmd-aligned.
Aneesh Kumar K.V April 25, 2019, 1:37 a.m. UTC | #7
On 4/24/19 11:43 PM, Dan Williams wrote:
> On Wed, Apr 24, 2019 at 10:38 AM Matthew Wilcox <willy@infradead.org> wrote:
>>
>> On Wed, Apr 24, 2019 at 10:13:15AM -0700, Dan Williams wrote:
>>> I think unaligned addresses have always been passed to
>>> vmf_insert_pfn_pmd(), but nothing cared until this patch. I *think*
>>> the only change needed is the following, thoughts?
>>>
>>> diff --git a/fs/dax.c b/fs/dax.c
>>> index ca0671d55aa6..82aee9a87efa 100644
>>> --- a/fs/dax.c
>>> +++ b/fs/dax.c
>>> @@ -1560,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct
>>> vm_fault *vmf, pfn_t *pfnp,
>>>                  }
>>>
>>>                  trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
>>> -               result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
>>> +               result = vmf_insert_pfn_pmd(vma, pmd_addr, vmf->pmd, pfn,
>>>                                              write);
>>
>> We also call vmf_insert_pfn_pmd() in dax_insert_pfn_mkwrite() -- does
>> that need to change too?
> 
> It wasn't clear to me that it was a problem. I think that one already
> happens to be pmd-aligned.
> 

How about vmf_insert_pfn_pud()?

-aneesh
Dan Williams April 25, 2019, 4:33 a.m. UTC | #8
On Wed, Apr 24, 2019 at 6:37 PM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> On 4/24/19 11:43 PM, Dan Williams wrote:
> > On Wed, Apr 24, 2019 at 10:38 AM Matthew Wilcox <willy@infradead.org> wrote:
> >>
> >> On Wed, Apr 24, 2019 at 10:13:15AM -0700, Dan Williams wrote:
> >>> I think unaligned addresses have always been passed to
> >>> vmf_insert_pfn_pmd(), but nothing cared until this patch. I *think*
> >>> the only change needed is the following, thoughts?
> >>>
> >>> diff --git a/fs/dax.c b/fs/dax.c
> >>> index ca0671d55aa6..82aee9a87efa 100644
> >>> --- a/fs/dax.c
> >>> +++ b/fs/dax.c
> >>> @@ -1560,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct
> >>> vm_fault *vmf, pfn_t *pfnp,
> >>>                  }
> >>>
> >>>                  trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
> >>> -               result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
> >>> +               result = vmf_insert_pfn_pmd(vma, pmd_addr, vmf->pmd, pfn,
> >>>                                              write);
> >>
> >> We also call vmf_insert_pfn_pmd() in dax_insert_pfn_mkwrite() -- does
> >> that need to change too?
> >
> > It wasn't clear to me that it was a problem. I think that one already
> > happens to be pmd-aligned.
> >
>
> How about vmf_insert_pfn_pud()?

That is currently not used by fsdax, only devdax, but it does seem
that it passes the unaligned fault address rather than the pud aligned
address. I'll add that to the proposed fix.
Jan Kara April 25, 2019, 7:31 a.m. UTC | #9
On Wed 24-04-19 11:13:48, Dan Williams wrote:
> On Wed, Apr 24, 2019 at 10:38 AM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Wed, Apr 24, 2019 at 10:13:15AM -0700, Dan Williams wrote:
> > > I think unaligned addresses have always been passed to
> > > vmf_insert_pfn_pmd(), but nothing cared until this patch. I *think*
> > > the only change needed is the following, thoughts?
> > >
> > > diff --git a/fs/dax.c b/fs/dax.c
> > > index ca0671d55aa6..82aee9a87efa 100644
> > > --- a/fs/dax.c
> > > +++ b/fs/dax.c
> > > @@ -1560,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct
> > > vm_fault *vmf, pfn_t *pfnp,
> > >                 }
> > >
> > >                 trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
> > > -               result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
> > > +               result = vmf_insert_pfn_pmd(vma, pmd_addr, vmf->pmd, pfn,
> > >                                             write);
> >
> > We also call vmf_insert_pfn_pmd() in dax_insert_pfn_mkwrite() -- does
> > that need to change too?
> 
> It wasn't clear to me that it was a problem. I think that one already
> happens to be pmd-aligned.

Why would it need to be? The address is taken from vmf->address and that's
set up in __handle_mm_fault() like .address = address & PAGE_MASK. So I
don't see anything forcing PMD alignment of the virtual address...

								Honza
Dan Williams April 26, 2019, 12:33 a.m. UTC | #10
On Thu, Apr 25, 2019 at 12:32 AM Jan Kara <jack@suse.cz> wrote:
>
> On Wed 24-04-19 11:13:48, Dan Williams wrote:
> > On Wed, Apr 24, 2019 at 10:38 AM Matthew Wilcox <willy@infradead.org> wrote:
> > >
> > > On Wed, Apr 24, 2019 at 10:13:15AM -0700, Dan Williams wrote:
> > > > I think unaligned addresses have always been passed to
> > > > vmf_insert_pfn_pmd(), but nothing cared until this patch. I *think*
> > > > the only change needed is the following, thoughts?
> > > >
> > > > diff --git a/fs/dax.c b/fs/dax.c
> > > > index ca0671d55aa6..82aee9a87efa 100644
> > > > --- a/fs/dax.c
> > > > +++ b/fs/dax.c
> > > > @@ -1560,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct
> > > > vm_fault *vmf, pfn_t *pfnp,
> > > >                 }
> > > >
> > > >                 trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
> > > > -               result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
> > > > +               result = vmf_insert_pfn_pmd(vma, pmd_addr, vmf->pmd, pfn,
> > > >                                             write);
> > >
> > > We also call vmf_insert_pfn_pmd() in dax_insert_pfn_mkwrite() -- does
> > > that need to change too?
> >
> > It wasn't clear to me that it was a problem. I think that one already
> > happens to be pmd-aligned.
>
> Why would it need to be? The address is taken from vmf->address and that's
> set up in __handle_mm_fault() like .address = address & PAGE_MASK. So I
> don't see anything forcing PMD alignment of the virtual address...

True. So now I'm wondering if the masking should be done internal to
the routine. Given it's prefixed vmf_ it seems to imply the api is
prepared to take raw 'struct vm_fault' parameters. I think I'll go
that route unless someone sees a reason to require the caller to
handle this responsibility.
Matthew Wilcox (Oracle) April 26, 2019, 12:51 a.m. UTC | #11
On Thu, Apr 25, 2019 at 05:33:04PM -0700, Dan Williams wrote:
> On Thu, Apr 25, 2019 at 12:32 AM Jan Kara <jack@suse.cz> wrote:
> > > > We also call vmf_insert_pfn_pmd() in dax_insert_pfn_mkwrite() -- does
> > > > that need to change too?
> > >
> > > It wasn't clear to me that it was a problem. I think that one already
> > > happens to be pmd-aligned.
> >
> > Why would it need to be? The address is taken from vmf->address and that's
> > set up in __handle_mm_fault() like .address = address & PAGE_MASK. So I
> > don't see anything forcing PMD alignment of the virtual address...
> 
> True. So now I'm wondering if the masking should be done internal to
> the routine. Given it's prefixed vmf_ it seems to imply the api is
> prepared to take raw 'struct vm_fault' parameters. I think I'll go
> that route unless someone sees a reason to require the caller to
> handle this responsibility.

The vmf_ prefix was originally used to indicate 'returns a vm_fault_t'
instead of 'returns an errno'.  That said, I like the interpretation
you're coming up with here, and it makes me wonder if we shouldn't
change vmf_insert_pfn_pmd() to take (vmf, pfn, write) as arguments
instead of separate vma, address & pmd arguments.
Jan Kara April 26, 2019, 8:36 a.m. UTC | #12
On Thu 25-04-19 17:33:04, Dan Williams wrote:
> On Thu, Apr 25, 2019 at 12:32 AM Jan Kara <jack@suse.cz> wrote:
> >
> > On Wed 24-04-19 11:13:48, Dan Williams wrote:
> > > On Wed, Apr 24, 2019 at 10:38 AM Matthew Wilcox <willy@infradead.org> wrote:
> > > >
> > > > On Wed, Apr 24, 2019 at 10:13:15AM -0700, Dan Williams wrote:
> > > > > I think unaligned addresses have always been passed to
> > > > > vmf_insert_pfn_pmd(), but nothing cared until this patch. I *think*
> > > > > the only change needed is the following, thoughts?
> > > > >
> > > > > diff --git a/fs/dax.c b/fs/dax.c
> > > > > index ca0671d55aa6..82aee9a87efa 100644
> > > > > --- a/fs/dax.c
> > > > > +++ b/fs/dax.c
> > > > > @@ -1560,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct
> > > > > vm_fault *vmf, pfn_t *pfnp,
> > > > >                 }
> > > > >
> > > > >                 trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
> > > > > -               result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
> > > > > +               result = vmf_insert_pfn_pmd(vma, pmd_addr, vmf->pmd, pfn,
> > > > >                                             write);
> > > >
> > > > We also call vmf_insert_pfn_pmd() in dax_insert_pfn_mkwrite() -- does
> > > > that need to change too?
> > >
> > > It wasn't clear to me that it was a problem. I think that one already
> > > happens to be pmd-aligned.
> >
> > Why would it need to be? The address is taken from vmf->address and that's
> > set up in __handle_mm_fault() like .address = address & PAGE_MASK. So I
> > don't see anything forcing PMD alignment of the virtual address...
> 
> True. So now I'm wondering if the masking should be done internal to
> the routine. Given it's prefixed vmf_ it seems to imply the api is
> prepared to take raw 'struct vm_fault' parameters. I think I'll go
> that route unless someone sees a reason to require the caller to
> handle this responsibility.

Yeah, that sounds good to me. Thanks for fixing this.

								Honza
diff mbox series

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 404acdcd0455..165ea46bf149 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -755,6 +755,21 @@  static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 	spinlock_t *ptl;
 
 	ptl = pmd_lock(mm, pmd);
+	if (!pmd_none(*pmd)) {
+		if (write) {
+			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
+				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
+				goto out_unlock;
+			}
+			entry = pmd_mkyoung(*pmd);
+			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
+				update_mmu_cache_pmd(vma, addr, pmd);
+		}
+
+		goto out_unlock;
+	}
+
 	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
 	if (pfn_t_devmap(pfn))
 		entry = pmd_mkdevmap(entry);
@@ -766,11 +781,16 @@  static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 	if (pgtable) {
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 		mm_inc_nr_ptes(mm);
+		pgtable = NULL;
 	}
 
 	set_pmd_at(mm, addr, pmd, entry);
 	update_mmu_cache_pmd(vma, addr, pmd);
+
+out_unlock:
 	spin_unlock(ptl);
+	if (pgtable)
+		pte_free(mm, pgtable);
 }
 
 vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -821,6 +841,20 @@  static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
 	spinlock_t *ptl;
 
 	ptl = pud_lock(mm, pud);
+	if (!pud_none(*pud)) {
+		if (write) {
+			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
+				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
+				goto out_unlock;
+			}
+			entry = pud_mkyoung(*pud);
+			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
+			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
+				update_mmu_cache_pud(vma, addr, pud);
+		}
+		goto out_unlock;
+	}
+
 	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
 	if (pfn_t_devmap(pfn))
 		entry = pud_mkdevmap(entry);
@@ -830,6 +864,8 @@  static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
 	}
 	set_pud_at(mm, addr, pud, entry);
 	update_mmu_cache_pud(vma, addr, pud);
+
+out_unlock:
 	spin_unlock(ptl);
 }