diff mbox

[v2,1/3] arm64: hugetlb: Fix huge_pte_offset to return poisoned page table entries

Message ID 20170517152336.6052-2-punit.agrawal@arm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Punit Agrawal May 17, 2017, 3:23 p.m. UTC
When memory failure is enabled, a poisoned hugepage pte is marked as a
swap entry. huge_pte_offset() does not return the poisoned page table
entries when it encounters PUD/PMD hugepages.

This behaviour of huge_pte_offset() leads to error such as below when
munmap is called on poisoned hugepages.

[  344.165544] mm/pgtable-generic.c:33: bad pmd 000000083af00074.

Fix huge_pte_offset() to return the poisoned pte which is then
appropriately handled by the generic layer code.

Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Acked-by: Steve Capper <steve.capper@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Woods <dwoods@mellanox.com>
---
 arch/arm64/include/asm/pgtable.h |  2 +-
 arch/arm64/mm/hugetlbpage.c      | 29 ++++++++++-------------------
 2 files changed, 11 insertions(+), 20 deletions(-)

Comments

Will Deacon June 7, 2017, 1:47 p.m. UTC | #1
On Wed, May 17, 2017 at 04:23:34PM +0100, Punit Agrawal wrote:
> When memory failure is enabled, a poisoned hugepage pte is marked as a
> swap entry. huge_pte_offset() does not return the poisoned page table
> entries when it encounters PUD/PMD hugepages.
> 
> This behaviour of huge_pte_offset() leads to error such as below when
> munmap is called on poisoned hugepages.
> 
> [  344.165544] mm/pgtable-generic.c:33: bad pmd 000000083af00074.
> 
> Fix huge_pte_offset() to return the poisoned pte which is then
> appropriately handled by the generic layer code.
> 
> Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
> Acked-by: Steve Capper <steve.capper@arm.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: David Woods <dwoods@mellanox.com>
> ---
>  arch/arm64/include/asm/pgtable.h |  2 +-
>  arch/arm64/mm/hugetlbpage.c      | 29 ++++++++++-------------------
>  2 files changed, 11 insertions(+), 20 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index c213fdbd056c..6eae342ced6b 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -441,7 +441,7 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd)
>  
>  #define pud_none(pud)		(!pud_val(pud))
>  #define pud_bad(pud)		(!(pud_val(pud) & PUD_TABLE_BIT))
> -#define pud_present(pud)	(pud_val(pud))
> +#define pud_present(pud)	pte_present(pud_pte(pud))
>  
>  static inline void set_pud(pud_t *pudp, pud_t pud)
>  {
> diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
> index 7514a000e361..69b8200b1cfd 100644
> --- a/arch/arm64/mm/hugetlbpage.c
> +++ b/arch/arm64/mm/hugetlbpage.c
> @@ -136,36 +136,27 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
>  {
>  	pgd_t *pgd;
>  	pud_t *pud;
> -	pmd_t *pmd = NULL;
> -	pte_t *pte = NULL;
> +	pmd_t *pmd;
>  
>  	pgd = pgd_offset(mm, addr);
>  	pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd);
>  	if (!pgd_present(*pgd))
>  		return NULL;
> +
>  	pud = pud_offset(pgd, addr);
> -	if (!pud_present(*pud))
> +	if (pud_none(*pud))
>  		return NULL;

Do you actually need this special case?

> -
> -	if (pud_huge(*pud))
> +	/* swap or huge page */
> +	if (!pud_present(*pud) || pud_huge(*pud))

... couldn't you just add a '|| pud_none(*pud)' in here?

>  		return (pte_t *)pud;
> +	/* table; check the next level */
> +
>  	pmd = pmd_offset(pud, addr);
> -	if (!pmd_present(*pmd))
> +	if (pmd_none(*pmd))
>  		return NULL;
> -
> -	if (pte_cont(pmd_pte(*pmd))) {
> -		pmd = pmd_offset(
> -			pud, (addr & CONT_PMD_MASK));
> -		return (pte_t *)pmd;
> -	}
> -	if (pmd_huge(*pmd))
> +	if (!pmd_present(*pmd) || pmd_huge(*pmd))

Similarly here.

Will
Catalin Marinas June 7, 2017, 2:30 p.m. UTC | #2
On Wed, Jun 07, 2017 at 02:47:32PM +0100, Will Deacon wrote:
> On Wed, May 17, 2017 at 04:23:34PM +0100, Punit Agrawal wrote:
> > --- a/arch/arm64/mm/hugetlbpage.c
> > +++ b/arch/arm64/mm/hugetlbpage.c
> > @@ -136,36 +136,27 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
> >  {
> >  	pgd_t *pgd;
> >  	pud_t *pud;
> > -	pmd_t *pmd = NULL;
> > -	pte_t *pte = NULL;
> > +	pmd_t *pmd;
> >  
> >  	pgd = pgd_offset(mm, addr);
> >  	pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd);
> >  	if (!pgd_present(*pgd))
> >  		return NULL;
> > +
> >  	pud = pud_offset(pgd, addr);
> > -	if (!pud_present(*pud))
> > +	if (pud_none(*pud))
> >  		return NULL;
> 
> Do you actually need this special case?
> 
> > -
> > -	if (pud_huge(*pud))
> > +	/* swap or huge page */
> > +	if (!pud_present(*pud) || pud_huge(*pud))
> 
> ... couldn't you just add a '|| pud_none(*pud)' in here?
> 
> >  		return (pte_t *)pud;

But then you no longer return NULL if *pud == 0.
Will Deacon June 7, 2017, 2:57 p.m. UTC | #3
On Wed, Jun 07, 2017 at 03:30:37PM +0100, Catalin Marinas wrote:
> On Wed, Jun 07, 2017 at 02:47:32PM +0100, Will Deacon wrote:
> > On Wed, May 17, 2017 at 04:23:34PM +0100, Punit Agrawal wrote:
> > > --- a/arch/arm64/mm/hugetlbpage.c
> > > +++ b/arch/arm64/mm/hugetlbpage.c
> > > @@ -136,36 +136,27 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
> > >  {
> > >  	pgd_t *pgd;
> > >  	pud_t *pud;
> > > -	pmd_t *pmd = NULL;
> > > -	pte_t *pte = NULL;
> > > +	pmd_t *pmd;
> > >  
> > >  	pgd = pgd_offset(mm, addr);
> > >  	pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd);
> > >  	if (!pgd_present(*pgd))
> > >  		return NULL;
> > > +
> > >  	pud = pud_offset(pgd, addr);
> > > -	if (!pud_present(*pud))
> > > +	if (pud_none(*pud))
> > >  		return NULL;
> > 
> > Do you actually need this special case?
> > 
> > > -
> > > -	if (pud_huge(*pud))
> > > +	/* swap or huge page */
> > > +	if (!pud_present(*pud) || pud_huge(*pud))
> > 
> > ... couldn't you just add a '|| pud_none(*pud)' in here?
> > 
> > >  		return (pte_t *)pud;
> 
> But then you no longer return NULL if *pud == 0.

Does that actually matter? The bits of hugetlb code I looked at will
deferenced the returned pud and handle the huge_pte_none case correctly.

Will
Catalin Marinas June 7, 2017, 2:58 p.m. UTC | #4
On Wed, May 17, 2017 at 04:23:34PM +0100, Punit Agrawal wrote:
> When memory failure is enabled, a poisoned hugepage pte is marked as a
> swap entry. huge_pte_offset() does not return the poisoned page table
> entries when it encounters PUD/PMD hugepages.
> 
> This behaviour of huge_pte_offset() leads to error such as below when
> munmap is called on poisoned hugepages.
> 
> [  344.165544] mm/pgtable-generic.c:33: bad pmd 000000083af00074.
> 
> Fix huge_pte_offset() to return the poisoned pte which is then
> appropriately handled by the generic layer code.
> 
> Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
> Acked-by: Steve Capper <steve.capper@arm.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: David Woods <dwoods@mellanox.com>

Since the patch matches my suggestions in v1:

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Punit Agrawal June 7, 2017, 3:32 p.m. UTC | #5
Will Deacon <will.deacon@arm.com> writes:

> On Wed, Jun 07, 2017 at 03:30:37PM +0100, Catalin Marinas wrote:
>> On Wed, Jun 07, 2017 at 02:47:32PM +0100, Will Deacon wrote:
>> > On Wed, May 17, 2017 at 04:23:34PM +0100, Punit Agrawal wrote:
>> > > --- a/arch/arm64/mm/hugetlbpage.c
>> > > +++ b/arch/arm64/mm/hugetlbpage.c
>> > > @@ -136,36 +136,27 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
>> > >  {
>> > >  	pgd_t *pgd;
>> > >  	pud_t *pud;
>> > > -	pmd_t *pmd = NULL;
>> > > -	pte_t *pte = NULL;
>> > > +	pmd_t *pmd;
>> > >  
>> > >  	pgd = pgd_offset(mm, addr);
>> > >  	pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd);
>> > >  	if (!pgd_present(*pgd))
>> > >  		return NULL;
>> > > +
>> > >  	pud = pud_offset(pgd, addr);
>> > > -	if (!pud_present(*pud))
>> > > +	if (pud_none(*pud))
>> > >  		return NULL;
>> > 
>> > Do you actually need this special case?
>> > 
>> > > -
>> > > -	if (pud_huge(*pud))
>> > > +	/* swap or huge page */
>> > > +	if (!pud_present(*pud) || pud_huge(*pud))
>> > 
>> > ... couldn't you just add a '|| pud_none(*pud)' in here?
>> > 

I think an earlier version took this approach but...

>> > >  		return (pte_t *)pud;
>> 
>> But then you no longer return NULL if *pud == 0.
>
> Does that actually matter? The bits of hugetlb code I looked at will
> deferenced the returned pud and handle the huge_pte_none case correctly.

For hugetlb fault handling (hugetlb_fault()), returning NULL vs pointer
to the pud/pmd results in different behaviour. If we return the pud when
pud_none(), then we lose the resulting hugepage size check we get from
huge_pte_alloc().

>
> Will
Will Deacon June 7, 2017, 3:41 p.m. UTC | #6
On Wed, Jun 07, 2017 at 04:32:28PM +0100, Punit Agrawal wrote:
> Will Deacon <will.deacon@arm.com> writes:
> 
> > On Wed, Jun 07, 2017 at 03:30:37PM +0100, Catalin Marinas wrote:
> >> On Wed, Jun 07, 2017 at 02:47:32PM +0100, Will Deacon wrote:
> >> > On Wed, May 17, 2017 at 04:23:34PM +0100, Punit Agrawal wrote:
> >> > > --- a/arch/arm64/mm/hugetlbpage.c
> >> > > +++ b/arch/arm64/mm/hugetlbpage.c
> >> > > @@ -136,36 +136,27 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
> >> > >  {
> >> > >  	pgd_t *pgd;
> >> > >  	pud_t *pud;
> >> > > -	pmd_t *pmd = NULL;
> >> > > -	pte_t *pte = NULL;
> >> > > +	pmd_t *pmd;
> >> > >  
> >> > >  	pgd = pgd_offset(mm, addr);
> >> > >  	pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd);
> >> > >  	if (!pgd_present(*pgd))
> >> > >  		return NULL;
> >> > > +
> >> > >  	pud = pud_offset(pgd, addr);
> >> > > -	if (!pud_present(*pud))
> >> > > +	if (pud_none(*pud))
> >> > >  		return NULL;
> >> > 
> >> > Do you actually need this special case?
> >> > 
> >> > > -
> >> > > -	if (pud_huge(*pud))
> >> > > +	/* swap or huge page */
> >> > > +	if (!pud_present(*pud) || pud_huge(*pud))
> >> > 
> >> > ... couldn't you just add a '|| pud_none(*pud)' in here?
> >> > 
> 
> I think an earlier version took this approach but...
> 
> >> > >  		return (pte_t *)pud;
> >> 
> >> But then you no longer return NULL if *pud == 0.
> >
> > Does that actually matter? The bits of hugetlb code I looked at will
> > deferenced the returned pud and handle the huge_pte_none case correctly.
> 
> For hugetlb fault handling (hugetlb_fault()), returning NULL vs pointer
> to the pud/pmd results in different behaviour. If we return the pud when
> pud_none(), then we lose the resulting hugepage size check we get from
> huge_pte_alloc().

Ok, so does that mean that many of the huge_pte_none checks in mm/hugetlb.c
that operate on a huge_ptep_get of non-NULL output from huge_pte_offset are
actually redundant?

Will
Catalin Marinas June 7, 2017, 4:54 p.m. UTC | #7
On Wed, Jun 07, 2017 at 04:32:28PM +0100, Punit Agrawal wrote:
> Will Deacon <will.deacon@arm.com> writes:
> > On Wed, Jun 07, 2017 at 03:30:37PM +0100, Catalin Marinas wrote:
> >> On Wed, Jun 07, 2017 at 02:47:32PM +0100, Will Deacon wrote:
> >> > On Wed, May 17, 2017 at 04:23:34PM +0100, Punit Agrawal wrote:
> >> > > --- a/arch/arm64/mm/hugetlbpage.c
> >> > > +++ b/arch/arm64/mm/hugetlbpage.c
> >> > > @@ -136,36 +136,27 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
> >> > >  {
> >> > >  	pgd_t *pgd;
> >> > >  	pud_t *pud;
> >> > > -	pmd_t *pmd = NULL;
> >> > > -	pte_t *pte = NULL;
> >> > > +	pmd_t *pmd;
> >> > >  
> >> > >  	pgd = pgd_offset(mm, addr);
> >> > >  	pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd);
> >> > >  	if (!pgd_present(*pgd))
> >> > >  		return NULL;
> >> > > +
> >> > >  	pud = pud_offset(pgd, addr);
> >> > > -	if (!pud_present(*pud))
> >> > > +	if (pud_none(*pud))
> >> > >  		return NULL;
> >> > 
> >> > Do you actually need this special case?
> >> > 
> >> > > -
> >> > > -	if (pud_huge(*pud))
> >> > > +	/* swap or huge page */
> >> > > +	if (!pud_present(*pud) || pud_huge(*pud))
> >> > 
> >> > ... couldn't you just add a '|| pud_none(*pud)' in here?
> >> > 
> 
> I think an earlier version took this approach but...
> 
> >> > >  		return (pte_t *)pud;
> >> 
> >> But then you no longer return NULL if *pud == 0.
> >
> > Does that actually matter? The bits of hugetlb code I looked at will
> > deferenced the returned pud and handle the huge_pte_none case correctly.
> 
> For hugetlb fault handling (hugetlb_fault()), returning NULL vs pointer
> to the pud/pmd results in different behaviour. If we return the pud when
> pud_none(), then we lose the resulting hugepage size check we get from
> huge_pte_alloc().

At a quick look, there are a few other places where not returning NULL
has some other effects (though I don't think any of them are fatal):

- copy_huge_tlb_page_range() - unnecessary allocation of a destination
  pud

- huge_pmd_share() - do we actually need the subsequent get_page()?

- page_vma_mapped_walk() - it even has a comment: "when pud is not
  present, pte will be NULL". Now, that's no longer true with swap
  entries but we'd never return NULL for a pud_none() case

Current code behaviour is to return NULL when !p*d_present(). We are
slightly relaxing this for swap entries while still returning NULL for
the p*d_none() case but I wouldn't go that far as to never return NULL
here.
Punit Agrawal June 8, 2017, 4:28 p.m. UTC | #8
Will Deacon <will.deacon@arm.com> writes:

> On Wed, Jun 07, 2017 at 04:32:28PM +0100, Punit Agrawal wrote:
>> Will Deacon <will.deacon@arm.com> writes:
>> 
>> > On Wed, Jun 07, 2017 at 03:30:37PM +0100, Catalin Marinas wrote:
>> >> On Wed, Jun 07, 2017 at 02:47:32PM +0100, Will Deacon wrote:
>> >> > On Wed, May 17, 2017 at 04:23:34PM +0100, Punit Agrawal wrote:
>> >> > > --- a/arch/arm64/mm/hugetlbpage.c
>> >> > > +++ b/arch/arm64/mm/hugetlbpage.c
>> >> > > @@ -136,36 +136,27 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
>> >> > >  {
>> >> > >  	pgd_t *pgd;
>> >> > >  	pud_t *pud;
>> >> > > -	pmd_t *pmd = NULL;
>> >> > > -	pte_t *pte = NULL;
>> >> > > +	pmd_t *pmd;
>> >> > >  
>> >> > >  	pgd = pgd_offset(mm, addr);
>> >> > >  	pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd);
>> >> > >  	if (!pgd_present(*pgd))
>> >> > >  		return NULL;
>> >> > > +
>> >> > >  	pud = pud_offset(pgd, addr);
>> >> > > -	if (!pud_present(*pud))
>> >> > > +	if (pud_none(*pud))
>> >> > >  		return NULL;
>> >> > 
>> >> > Do you actually need this special case?
>> >> > 
>> >> > > -
>> >> > > -	if (pud_huge(*pud))
>> >> > > +	/* swap or huge page */
>> >> > > +	if (!pud_present(*pud) || pud_huge(*pud))
>> >> > 
>> >> > ... couldn't you just add a '|| pud_none(*pud)' in here?
>> >> > 
>> 
>> I think an earlier version took this approach but...
>> 
>> >> > >  		return (pte_t *)pud;
>> >> 
>> >> But then you no longer return NULL if *pud == 0.
>> >
>> > Does that actually matter? The bits of hugetlb code I looked at will
>> > deferenced the returned pud and handle the huge_pte_none case correctly.
>> 
>> For hugetlb fault handling (hugetlb_fault()), returning NULL vs pointer
>> to the pud/pmd results in different behaviour. If we return the pud when
>> pud_none(), then we lose the resulting hugepage size check we get from
>> huge_pte_alloc().
>
> Ok, so does that mean that many of the huge_pte_none checks in mm/hugetlb.c
> that operate on a huge_ptep_get of non-NULL output from huge_pte_offset are
> actually redundant?

Summarising our offline discussion - the semantics of huge_pte_offset()
are unclear in terms of when a pointer to (p*d) is returned vs when to
return NULL.

As part of enabling contiguous hugepage support[0][1], I have a patch to
add a size argument to huge_pte_offset() that can then help disambiguate
when we have a valid pte* vs when we have an error (NULL).

I'll separately look at clarifying the semantics of the generic version
of huge_pte_offse() and potentially also look at unifying the
huge_pte_offset() implementations across the various architectures.

[0] https://lkml.org/lkml/2017/5/24/463
[1] https://www.spinics.net/lists/arm-kernel/msg583367.html

>
> Will
diff mbox

Patch

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index c213fdbd056c..6eae342ced6b 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -441,7 +441,7 @@  static inline phys_addr_t pmd_page_paddr(pmd_t pmd)
 
 #define pud_none(pud)		(!pud_val(pud))
 #define pud_bad(pud)		(!(pud_val(pud) & PUD_TABLE_BIT))
-#define pud_present(pud)	(pud_val(pud))
+#define pud_present(pud)	pte_present(pud_pte(pud))
 
 static inline void set_pud(pud_t *pudp, pud_t pud)
 {
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 7514a000e361..69b8200b1cfd 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -136,36 +136,27 @@  pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
-	pmd_t *pmd = NULL;
-	pte_t *pte = NULL;
+	pmd_t *pmd;
 
 	pgd = pgd_offset(mm, addr);
 	pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd);
 	if (!pgd_present(*pgd))
 		return NULL;
+
 	pud = pud_offset(pgd, addr);
-	if (!pud_present(*pud))
+	if (pud_none(*pud))
 		return NULL;
-
-	if (pud_huge(*pud))
+	/* swap or huge page */
+	if (!pud_present(*pud) || pud_huge(*pud))
 		return (pte_t *)pud;
+	/* table; check the next level */
+
 	pmd = pmd_offset(pud, addr);
-	if (!pmd_present(*pmd))
+	if (pmd_none(*pmd))
 		return NULL;
-
-	if (pte_cont(pmd_pte(*pmd))) {
-		pmd = pmd_offset(
-			pud, (addr & CONT_PMD_MASK));
-		return (pte_t *)pmd;
-	}
-	if (pmd_huge(*pmd))
+	if (!pmd_present(*pmd) || pmd_huge(*pmd))
 		return (pte_t *)pmd;
-	pte = pte_offset_kernel(pmd, addr);
-	if (pte_present(*pte) && pte_cont(*pte)) {
-		pte = pte_offset_kernel(
-			pmd, (addr & CONT_PTE_MASK));
-		return pte;
-	}
+
 	return NULL;
 }