| Message ID | 20230616110826.344417-14-aneesh.kumar@linux.ibm.com |
|---|---|
| State | New |
| Series | Add support for DAX vmemmap optimization for ppc64 |
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes: These are just some minor nits in case you are going to send another revision. > This is enabled only with radix translation and 1G hugepage size. This will > be used with devdax device memory with a namespace alignment of 1G. > > Anon transparent hugepage is not supported even though we do have helpers > checking pud_trans_huge(). We should never find that return true. The only > expected pte bit combination is _PAGE_PTE | _PAGE_DEVMAP. > > Some of the helpers are never expected to get called on hash translation > and hence is marked to call BUG() in such a case. > > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> > --- > arch/powerpc/include/asm/book3s/64/pgtable.h | 156 ++++++++++++++++-- > arch/powerpc/include/asm/book3s/64/radix.h | 37 +++++ > .../include/asm/book3s/64/tlbflush-radix.h | 2 + > arch/powerpc/include/asm/book3s/64/tlbflush.h | 8 + > arch/powerpc/mm/book3s64/pgtable.c | 78 +++++++++ > arch/powerpc/mm/book3s64/radix_pgtable.c | 28 ++++ > arch/powerpc/mm/book3s64/radix_tlb.c | 7 + > arch/powerpc/platforms/Kconfig.cputype | 1 + > include/trace/events/thp.h | 17 ++ > 9 files changed, 323 insertions(+), 11 deletions(-) > > diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h > index 4acc9690f599..9a05de007956 100644 > --- a/arch/powerpc/include/asm/book3s/64/pgtable.h > +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h > @@ -921,8 +921,29 @@ static inline pud_t pte_pud(pte_t pte) > { > return __pud_raw(pte_raw(pte)); > } > + > +static inline pte_t *pudp_ptep(pud_t *pud) > +{ > + return (pte_t *)pud; > +} > + > +#define pud_pfn(pud) pte_pfn(pud_pte(pud)) > +#define pud_dirty(pud) pte_dirty(pud_pte(pud)) > +#define pud_young(pud) pte_young(pud_pte(pud)) > +#define pud_mkold(pud) pte_pud(pte_mkold(pud_pte(pud))) > +#define pud_wrprotect(pud) pte_pud(pte_wrprotect(pud_pte(pud))) > +#define pud_mkdirty(pud) pte_pud(pte_mkdirty(pud_pte(pud))) > +#define pud_mkclean(pud) pte_pud(pte_mkclean(pud_pte(pud))) > +#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) > +#define pud_mkwrite(pud) pte_pud(pte_mkwrite(pud_pte(pud))) > #define pud_write(pud) pte_write(pud_pte(pud)) > > +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY > +#define pud_soft_dirty(pmd) pte_soft_dirty(pud_pte(pud)) > +#define pud_mksoft_dirty(pmd) pte_pud(pte_mksoft_dirty(pud_pte(pud))) > +#define pud_clear_soft_dirty(pmd) pte_pud(pte_clear_soft_dirty(pud_pte(pud))) > +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ > + > static inline int pud_bad(pud_t pud) > { > if (radix_enabled()) > @@ -1115,15 +1136,24 @@ static inline bool pmd_access_permitted(pmd_t pmd, bool write) > > #ifdef CONFIG_TRANSPARENT_HUGEPAGE > extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); > +extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot); > extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); > extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); > extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, > pmd_t *pmdp, pmd_t pmd); > +extern void set_pud_at(struct mm_struct *mm, unsigned long addr, > + pud_t *pudp, pud_t pud); > + > static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, > unsigned long addr, pmd_t *pmd) > { > } > > +static inline void update_mmu_cache_pud(struct vm_area_struct *vma, > + unsigned long addr, pud_t *pud) > +{ > +} > + > extern int hash__has_transparent_hugepage(void); > static inline int has_transparent_hugepage(void) > { > @@ -1133,6 +1163,14 @@ static inline int 
has_transparent_hugepage(void) > } > #define has_transparent_hugepage has_transparent_hugepage > > +static inline int has_transparent_pud_hugepage(void) > +{ > + if (radix_enabled()) > + return radix__has_transparent_pud_hugepage(); > + return 0; > +} > +#define has_transparent_pud_hugepage has_transparent_pud_hugepage > + > static inline unsigned long > pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, > unsigned long clr, unsigned long set) > @@ -1142,6 +1180,16 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, > return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set); > } > > +static inline unsigned long > +pud_hugepage_update(struct mm_struct *mm, unsigned long addr, pud_t *pudp, > + unsigned long clr, unsigned long set) > +{ > + if (radix_enabled()) > + return radix__pud_hugepage_update(mm, addr, pudp, clr, set); > + BUG(); > + return pud_val(*pudp); > +} > + > /* > * returns true for pmd migration entries, THP, devmap, hugetlb > * But compile time dependent on THP config > @@ -1151,6 +1199,11 @@ static inline int pmd_large(pmd_t pmd) > return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)); > } > > +static inline int pud_large(pud_t pud) > +{ > + return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); > +} > + > /* > * For radix we should always find H_PAGE_HASHPTE zero. Hence > * the below will work for radix too > @@ -1166,6 +1219,17 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, > return ((old & _PAGE_ACCESSED) != 0); > } > > +static inline int __pudp_test_and_clear_young(struct mm_struct *mm, > + unsigned long addr, pud_t *pudp) > +{ > + unsigned long old; > + > + if ((pud_raw(*pudp) & cpu_to_be64(_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) > + return 0; > + old = pud_hugepage_update(mm, addr, pudp, _PAGE_ACCESSED, 0); > + return ((old & _PAGE_ACCESSED) != 0); > +} > + > #define __HAVE_ARCH_PMDP_SET_WRPROTECT > static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, > pmd_t *pmdp) > @@ -1174,6 +1238,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, > pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); > } > > +#define __HAVE_ARCH_PUDP_SET_WRPROTECT > +static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long addr, > + pud_t *pudp) > +{ > + if (pud_write(*pudp)) > + pud_hugepage_update(mm, addr, pudp, _PAGE_WRITE, 0); > +} > + > /* > * Only returns true for a THP. False for pmd migration entry. 
> * We also need to return true when we come across a pte that > @@ -1195,6 +1267,17 @@ static inline int pmd_trans_huge(pmd_t pmd) > return hash__pmd_trans_huge(pmd); > } > > +static inline int pud_trans_huge(pud_t pud) > +{ > + if (!pud_present(pud)) > + return false; > + > + if (radix_enabled()) > + return radix__pud_trans_huge(pud); > + return 0; > +} > + > + > #define __HAVE_ARCH_PMD_SAME > static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) > { > @@ -1203,6 +1286,16 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) > return hash__pmd_same(pmd_a, pmd_b); > } > > +#define __HAVE_ARCH_PUD_SAME > +static inline int pud_same(pud_t pud_a, pud_t pud_b) > +{ > + if (radix_enabled()) > + return radix__pud_same(pud_a, pud_b); > + BUG(); > + return 0; > +} > + > + > static inline pmd_t __pmd_mkhuge(pmd_t pmd) > { > if (radix_enabled()) > @@ -1210,6 +1303,14 @@ static inline pmd_t __pmd_mkhuge(pmd_t pmd) > return hash__pmd_mkhuge(pmd); > } > > +static inline pud_t __pud_mkhuge(pud_t pud) > +{ > + if (radix_enabled()) > + return radix__pud_mkhuge(pud); > + BUG(); > + return pud; > +} > + > /* > * pfn_pmd return a pmd_t that can be used as pmd pte entry. > */ > @@ -1225,14 +1326,34 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd) > return pmd; > } > > +static inline pud_t pud_mkhuge(pud_t pud) > +{ > +#ifdef CONFIG_DEBUG_VM > + if (radix_enabled()) > + WARN_ON((pud_raw(pud) & cpu_to_be64(_PAGE_PTE)) == 0); > + else > + WARN_ON(1); > +#endif > + return pud; > +} > + > + > #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS > extern int pmdp_set_access_flags(struct vm_area_struct *vma, > unsigned long address, pmd_t *pmdp, > pmd_t entry, int dirty); > +#define __HAVE_ARCH_PUDP_SET_ACCESS_FLAGS > +extern int pudp_set_access_flags(struct vm_area_struct *vma, > + unsigned long address, pud_t *pudp, > + pud_t entry, int dirty); > > #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG > extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, > unsigned long address, pmd_t *pmdp); > +#define __HAVE_ARCH_PUDP_TEST_AND_CLEAR_YOUNG > +extern int pudp_test_and_clear_young(struct vm_area_struct *vma, > + unsigned long address, pud_t *pudp); > + > > #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR > static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, > @@ -1243,6 +1364,16 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, > return hash__pmdp_huge_get_and_clear(mm, addr, pmdp); > } > > +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR > +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, > + unsigned long addr, pud_t *pudp) > +{ > + if (radix_enabled()) > + return radix__pudp_huge_get_and_clear(mm, addr, pudp); > + BUG(); > + return *pudp; > +} > + > static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, > unsigned long address, pmd_t *pmdp) > { > @@ -1257,6 +1388,11 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, > unsigned long addr, > pmd_t *pmdp, int full); > > +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL > +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, > + unsigned long addr, > + pud_t *pudp, int full); > + > #define __HAVE_ARCH_PGTABLE_DEPOSIT > static inline void pgtable_trans_huge_deposit(struct mm_struct *mm, > pmd_t *pmdp, pgtable_t pgtable) > @@ -1305,6 +1441,14 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) > return hash__pmd_mkdevmap(pmd); > } > > +static inline pud_t pud_mkdevmap(pud_t pud) > +{ > + if (radix_enabled()) > + return radix__pud_mkdevmap(pud); > + BUG(); > + return pud; > +} > + > static inline int 
pmd_devmap(pmd_t pmd) > { > return pte_devmap(pmd_pte(pmd)); > @@ -1312,7 +1456,7 @@ static inline int pmd_devmap(pmd_t pmd) > > static inline int pud_devmap(pud_t pud) > { > - return 0; > + return pte_devmap(pud_pte(pud)); > } > > static inline int pgd_devmap(pgd_t pgd) > @@ -1321,16 +1465,6 @@ static inline int pgd_devmap(pgd_t pgd) > } > #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ > > -static inline int pud_pfn(pud_t pud) > -{ > - /* > - * Currently all calls to pud_pfn() are gated around a pud_devmap() > - * check so this should never be used. If it grows another user we > - * want to know about it. > - */ > - BUILD_BUG(); > - return 0; > -} > #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION > pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *); > void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, > diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h > index 686001eda936..8cdff5a05011 100644 > --- a/arch/powerpc/include/asm/book3s/64/radix.h > +++ b/arch/powerpc/include/asm/book3s/64/radix.h > @@ -250,6 +250,10 @@ static inline int radix__pud_bad(pud_t pud) > return !!(pud_val(pud) & RADIX_PUD_BAD_BITS); > } > > +static inline int radix__pud_same(pud_t pud_a, pud_t pud_b) > +{ > + return ((pud_raw(pud_a) ^ pud_raw(pud_b)) == 0); > +} > > static inline int radix__p4d_bad(p4d_t p4d) > { > @@ -268,9 +272,22 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) > return __pmd(pmd_val(pmd) | _PAGE_PTE); > } > > +static inline int radix__pud_trans_huge(pud_t pud) > +{ > + return (pud_val(pud) & (_PAGE_PTE | _PAGE_DEVMAP)) == _PAGE_PTE; > +} > + > +static inline pud_t radix__pud_mkhuge(pud_t pud) > +{ > + return __pud(pud_val(pud) | _PAGE_PTE); > +} > + > extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, > pmd_t *pmdp, unsigned long clr, > unsigned long set); > +extern unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr, > + pud_t *pudp, unsigned long clr, > + unsigned long set); > extern pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, > unsigned long address, pmd_t *pmdp); > extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, > @@ -278,6 +295,9 @@ extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, > extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); > extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, > unsigned long addr, pmd_t *pmdp); > +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm, > + unsigned long addr, pud_t *pudp); > + > static inline int radix__has_transparent_hugepage(void) > { > /* For radix 2M at PMD level means thp */ > @@ -285,6 +305,14 @@ static inline int radix__has_transparent_hugepage(void) > return 1; > return 0; > } > + > +static inline int radix__has_transparent_pud_hugepage(void) > +{ > + /* For radix 1G at PUD level means pud hugepage support */ > + if (mmu_psize_defs[MMU_PAGE_1G].shift == PUD_SHIFT) > + return 1; > + return 0; > +} > #endif > > static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd) > @@ -292,9 +320,18 @@ static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd) > return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP)); > } > > +static inline pud_t radix__pud_mkdevmap(pud_t pud) > +{ > + return __pud(pud_val(pud) | (_PAGE_PTE | _PAGE_DEVMAP)); > +} > + > +struct vmem_altmap; > +struct dev_pagemap; Minor nit. 
I guess this struct dev_pagemap is meant to be added in a later patch. > extern int __meminit radix__vmemmap_create_mapping(unsigned long start, > unsigned long page_size, > unsigned long phys); > +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, > + int node, struct vmem_altmap *altmap); > extern void radix__vmemmap_remove_mapping(unsigned long start, > unsigned long page_size); > > diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h > index 77797a2a82eb..a38542259fab 100644 > --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h > +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h > @@ -68,6 +68,8 @@ void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, > unsigned long end, int psize); > extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, > unsigned long start, unsigned long end); > +extern void radix__flush_pud_tlb_range(struct vm_area_struct *vma, > + unsigned long start, unsigned long end); > extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, > unsigned long end); > extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end); > diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h > index 0d0c1447ecf0..a01c20a8fbf7 100644 > --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h > +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h > @@ -50,6 +50,14 @@ static inline void flush_pmd_tlb_range(struct vm_area_struct *vma, > radix__flush_pmd_tlb_range(vma, start, end); > } > > +#define __HAVE_ARCH_FLUSH_PUD_TLB_RANGE > +static inline void flush_pud_tlb_range(struct vm_area_struct *vma, > + unsigned long start, unsigned long end) > +{ > + if (radix_enabled()) > + radix__flush_pud_tlb_range(vma, start, end); > +} > + > #define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE > static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, > unsigned long start, > diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c > index 85c84e89e3ea..9e5f01a1738c 100644 > --- a/arch/powerpc/mm/book3s64/pgtable.c > +++ b/arch/powerpc/mm/book3s64/pgtable.c > @@ -64,11 +64,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, > return changed; > } > > +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, > + pud_t *pudp, pud_t entry, int dirty) > +{ > + int changed; > +#ifdef CONFIG_DEBUG_VM > + WARN_ON(!pud_devmap(*pudp)); just a query - for pmdp_set_access_flags() we had WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); so why don't we require the same check here? > + assert_spin_locked(pud_lockptr(vma->vm_mm, pudp)); > +#endif > + changed = !pud_same(*(pudp), entry); > + if (changed) { > + /* > + * We can use MMU_PAGE_2M here, because only radix s/MMU_PAGE_2M/MMY_PAGE_1G > + * path look at the psize. > + */ > + __ptep_set_access_flags(vma, pudp_ptep(pudp), > + pud_pte(entry), address, MMU_PAGE_1G); > + } > + return changed; > +} > + > + > int pmdp_test_and_clear_young(struct vm_area_struct *vma, > unsigned long address, pmd_t *pmdp) > { > return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); > } > + > +int pudp_test_and_clear_young(struct vm_area_struct *vma, > + unsigned long address, pud_t *pudp) > +{ > + return __pudp_test_and_clear_young(vma->vm_mm, address, pudp); > +} > + > /* > * set a new huge pmd. 
We should not be called for updating > * an existing pmd entry. That should go via pmd_hugepage_update. > @@ -90,6 +118,23 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, > return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); > } > > +void set_pud_at(struct mm_struct *mm, unsigned long addr, > + pud_t *pudp, pud_t pud) > +{ > +#ifdef CONFIG_DEBUG_VM > + /* > + * Make sure hardware valid bit is not set. We don't do > + * tlb flush for this update. > + */ > + > + WARN_ON(pte_hw_valid(pud_pte(*pudp))); For set_pmd_at() we had WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); Could you please help me understand why is it different for set_pud_at() > + assert_spin_locked(pud_lockptr(mm, pudp)); > + WARN_ON(!(pud_large(pud))); > +#endif > + trace_hugepage_set_pud(addr, pud_val(pud)); > + return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud)); > +} > + > static void do_serialize(void *arg) > { > /* We've taken the IPI, so try to trim the mask while here */ > @@ -147,11 +192,35 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, > return pmd; > } > > +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, > + unsigned long addr, pud_t *pudp, int full) > +{ > + pud_t pud; > + > + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); > + VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) || > + !pud_present(*pudp)); > + pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp); > + /* > + * if it not a fullmm flush, then we can possibly end up converting > + * this PMD pte entry to a regular level 0 PTE by a parallel page fault. > + * Make sure we flush the tlb in this case. > + */ > + if (!full) > + flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE); > + return pud; > +} > + > static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) > { > return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); > } > > +static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot) > +{ > + return __pud(pud_val(pud) | pgprot_val(pgprot)); > +} > + > /* > * At some point we should be able to get rid of > * pmd_mkhuge() and mk_huge_pmd() when we update all the > @@ -166,6 +235,15 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) > return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot)); > } > > +pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot) > +{ > + unsigned long pudv; > + > + pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; > + > + return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot)); > +} > + > pmd_t mk_pmd(struct page *page, pgprot_t pgprot) > { > return pfn_pmd(page_to_pfn(page), pgprot); > diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c > index 76f6a1f3b9d8..d7e2dd3d4add 100644 > --- a/arch/powerpc/mm/book3s64/radix_pgtable.c > +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c > @@ -965,6 +965,23 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add > return old; > } > > +unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr, > + pud_t *pudp, unsigned long clr, > + unsigned long set) > +{ > + unsigned long old; > + > +#ifdef CONFIG_DEBUG_VM > + WARN_ON(!pud_devmap(*pudp)); > + assert_spin_locked(pud_lockptr(mm, pudp)); > +#endif > + > + old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1); > + trace_hugepage_update(addr, old, clr, set); Here, we are using the same trace event for both pmd and pud update. See the comment in the end. 
> + > + return old; > +} > + > pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, > pmd_t *pmdp) > > @@ -1041,6 +1058,17 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, > return old_pmd; > } > > +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm, > + unsigned long addr, pud_t *pudp) > +{ > + pud_t old_pud; > + unsigned long old; > + > + old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0); > + old_pud = __pud(old); > + return old_pud; > +} > + > #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ > > void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, > diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c > index ce804b7bf84e..a18f7d2c9f63 100644 > --- a/arch/powerpc/mm/book3s64/radix_tlb.c > +++ b/arch/powerpc/mm/book3s64/radix_tlb.c > @@ -1453,6 +1453,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, > } > EXPORT_SYMBOL(radix__flush_pmd_tlb_range); > > +void radix__flush_pud_tlb_range(struct vm_area_struct *vma, > + unsigned long start, unsigned long end) > +{ > + radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G); > +} > +EXPORT_SYMBOL(radix__flush_pud_tlb_range); > + > void radix__flush_tlb_all(void) > { > unsigned long rb,prs,r,rs; > diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype > index 45fd975ef521..340b86ef7284 100644 > --- a/arch/powerpc/platforms/Kconfig.cputype > +++ b/arch/powerpc/platforms/Kconfig.cputype > @@ -94,6 +94,7 @@ config PPC_BOOK3S_64 > select PPC_FPU > select PPC_HAVE_PMU_SUPPORT > select HAVE_ARCH_TRANSPARENT_HUGEPAGE > + select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD > select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION > select ARCH_ENABLE_SPLIT_PMD_PTLOCK > select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE > diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h > index 202b3e3e67ff..a919d943d106 100644 > --- a/include/trace/events/thp.h > +++ b/include/trace/events/thp.h > @@ -25,6 +25,23 @@ TRACE_EVENT(hugepage_set_pmd, > TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, __entry->pmd) > ); > > +TRACE_EVENT(hugepage_set_pud, Given we have trace_hugepage_set_pud and trace_hugepage_set_pmd, doing the exact same thing, we can as well have a DECLARE_EVENT_CLASS(hugepage_set) and then DEFINE_EVENT using this event class. > + > + TP_PROTO(unsigned long addr, unsigned long pud), > + TP_ARGS(addr, pud), > + TP_STRUCT__entry( > + __field(unsigned long, addr) > + __field(unsigned long, pud) > + ), > + > + TP_fast_assign( > + __entry->addr = addr; > + __entry->pud = pud; > + ), > + > + TP_printk("Set pud with 0x%lx with 0x%lx", __entry->addr, __entry->pud) > + ); > + > > TRACE_EVENT(hugepage_update, Same here. We can have a DECLARE_EVENT_CLASS(hugepage_update) and then DEFINE_EVENT for hugepage_update_pmd and hugepage_update_pud. -ritesh
On 6/28/23 6:53 AM, Ritesh Harjani (IBM) wrote: > "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes: .... >> >> static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd) >> @@ -292,9 +320,18 @@ static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd) >> return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP)); >> } >> >> +static inline pud_t radix__pud_mkdevmap(pud_t pud) >> +{ >> + return __pud(pud_val(pud) | (_PAGE_PTE | _PAGE_DEVMAP)); >> +} >> + >> +struct vmem_altmap; >> +struct dev_pagemap; > > Minor nit. > I guess this struct dev_pagemap is meant to be added in a later patch. Moved that change to a later patch. > >> extern int __meminit radix__vmemmap_create_mapping(unsigned long start, >> unsigned long page_size, >> unsigned long phys); >> +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, >> + int node, struct vmem_altmap *altmap); >> extern void radix__vmemmap_remove_mapping(unsigned long start, >> unsigned long page_size); >> >> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h >> index 77797a2a82eb..a38542259fab 100644 >> --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h >> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h >> @@ -68,6 +68,8 @@ void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, >> unsigned long end, int psize); >> extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, >> unsigned long start, unsigned long end); >> +extern void radix__flush_pud_tlb_range(struct vm_area_struct *vma, >> + unsigned long start, unsigned long end); >> extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, >> unsigned long end); >> extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end); >> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h >> index 0d0c1447ecf0..a01c20a8fbf7 100644 >> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h >> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h >> @@ -50,6 +50,14 @@ static inline void flush_pmd_tlb_range(struct vm_area_struct *vma, >> radix__flush_pmd_tlb_range(vma, start, end); >> } >> >> +#define __HAVE_ARCH_FLUSH_PUD_TLB_RANGE >> +static inline void flush_pud_tlb_range(struct vm_area_struct *vma, >> + unsigned long start, unsigned long end) >> +{ >> + if (radix_enabled()) >> + radix__flush_pud_tlb_range(vma, start, end); >> +} >> + >> #define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE >> static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, >> unsigned long start, >> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c >> index 85c84e89e3ea..9e5f01a1738c 100644 >> --- a/arch/powerpc/mm/book3s64/pgtable.c >> +++ b/arch/powerpc/mm/book3s64/pgtable.c >> @@ -64,11 +64,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, >> return changed; >> } >> >> +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, >> + pud_t *pudp, pud_t entry, int dirty) >> +{ >> + int changed; >> +#ifdef CONFIG_DEBUG_VM >> + WARN_ON(!pud_devmap(*pudp)); > > just a query - > for pmdp_set_access_flags() we had > WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); > > so why don't we require the same check here? > Because we don't support generic anon PUD level THP. and we want to catch a wrong usage. 
>> + assert_spin_locked(pud_lockptr(vma->vm_mm, pudp)); >> +#endif >> + changed = !pud_same(*(pudp), entry); >> + if (changed) { >> + /* >> + * We can use MMU_PAGE_2M here, because only radix > > s/MMU_PAGE_2M/MMY_PAGE_1G > updated >> + * path look at the psize. >> + */ >> + __ptep_set_access_flags(vma, pudp_ptep(pudp), >> + pud_pte(entry), address, MMU_PAGE_1G); >> + } >> + return changed; >> +} >> + >> + >> int pmdp_test_and_clear_young(struct vm_area_struct *vma, >> unsigned long address, pmd_t *pmdp) >> { >> return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); >> } >> + >> +int pudp_test_and_clear_young(struct vm_area_struct *vma, >> + unsigned long address, pud_t *pudp) >> +{ >> + return __pudp_test_and_clear_young(vma->vm_mm, address, pudp); >> +} >> + >> /* >> * set a new huge pmd. We should not be called for updating >> * an existing pmd entry. That should go via pmd_hugepage_update. >> @@ -90,6 +118,23 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, >> return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); >> } >> >> +void set_pud_at(struct mm_struct *mm, unsigned long addr, >> + pud_t *pudp, pud_t pud) >> +{ >> +#ifdef CONFIG_DEBUG_VM >> + /* >> + * Make sure hardware valid bit is not set. We don't do >> + * tlb flush for this update. >> + */ >> + >> + WARN_ON(pte_hw_valid(pud_pte(*pudp))); > > For set_pmd_at() we had > WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); > > Could you please help me understand why is it different for set_pud_at() > > There is no numa fault support for devdax pages and we want to catch wrong usage. >> + assert_spin_locked(pud_lockptr(mm, pudp)); >> + WARN_ON(!(pud_large(pud))); >> +#endif >> + trace_hugepage_set_pud(addr, pud_val(pud)); >> + return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud)); >> +} >> + >> static void do_serialize(void *arg) >> { >> /* We've taken the IPI, so try to trim the mask while here */ >> @@ -147,11 +192,35 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, >> return pmd; >> } >> >> +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, >> + unsigned long addr, pud_t *pudp, int full) >> +{ >> + pud_t pud; >> + >> + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); >> + VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) || >> + !pud_present(*pudp)); >> + pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp); >> + /* >> + * if it not a fullmm flush, then we can possibly end up converting >> + * this PMD pte entry to a regular level 0 PTE by a parallel page fault. >> + * Make sure we flush the tlb in this case. 
>> + */ >> + if (!full) >> + flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE); >> + return pud; >> +} >> + >> static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) >> { >> return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); >> } >> >> +static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot) >> +{ >> + return __pud(pud_val(pud) | pgprot_val(pgprot)); >> +} >> + >> /* >> * At some point we should be able to get rid of >> * pmd_mkhuge() and mk_huge_pmd() when we update all the >> @@ -166,6 +235,15 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) >> return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot)); >> } >> >> +pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot) >> +{ >> + unsigned long pudv; >> + >> + pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; >> + >> + return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot)); >> +} >> + >> pmd_t mk_pmd(struct page *page, pgprot_t pgprot) >> { >> return pfn_pmd(page_to_pfn(page), pgprot); >> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c >> index 76f6a1f3b9d8..d7e2dd3d4add 100644 >> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c >> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c >> @@ -965,6 +965,23 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add >> return old; >> } >> >> +unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr, >> + pud_t *pudp, unsigned long clr, >> + unsigned long set) >> +{ >> + unsigned long old; >> + >> +#ifdef CONFIG_DEBUG_VM >> + WARN_ON(!pud_devmap(*pudp)); >> + assert_spin_locked(pud_lockptr(mm, pudp)); >> +#endif >> + >> + old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1); >> + trace_hugepage_update(addr, old, clr, set); > > Here, we are using the same trace event for both pmd and pud update. > See the comment in the end. 
> updated >> + >> + return old; >> +} >> + >> pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, >> pmd_t *pmdp) >> >> @@ -1041,6 +1058,17 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, >> return old_pmd; >> } >> >> +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm, >> + unsigned long addr, pud_t *pudp) >> +{ >> + pud_t old_pud; >> + unsigned long old; >> + >> + old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0); >> + old_pud = __pud(old); >> + return old_pud; >> +} >> + >> #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ >> >> void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, >> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c >> index ce804b7bf84e..a18f7d2c9f63 100644 >> --- a/arch/powerpc/mm/book3s64/radix_tlb.c >> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c >> @@ -1453,6 +1453,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, >> } >> EXPORT_SYMBOL(radix__flush_pmd_tlb_range); >> >> +void radix__flush_pud_tlb_range(struct vm_area_struct *vma, >> + unsigned long start, unsigned long end) >> +{ >> + radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G); >> +} >> +EXPORT_SYMBOL(radix__flush_pud_tlb_range); >> + >> void radix__flush_tlb_all(void) >> { >> unsigned long rb,prs,r,rs; >> diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype >> index 45fd975ef521..340b86ef7284 100644 >> --- a/arch/powerpc/platforms/Kconfig.cputype >> +++ b/arch/powerpc/platforms/Kconfig.cputype >> @@ -94,6 +94,7 @@ config PPC_BOOK3S_64 >> select PPC_FPU >> select PPC_HAVE_PMU_SUPPORT >> select HAVE_ARCH_TRANSPARENT_HUGEPAGE >> + select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD >> select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION >> select ARCH_ENABLE_SPLIT_PMD_PTLOCK >> select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE >> diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h >> index 202b3e3e67ff..a919d943d106 100644 >> --- a/include/trace/events/thp.h >> +++ b/include/trace/events/thp.h >> @@ -25,6 +25,23 @@ TRACE_EVENT(hugepage_set_pmd, >> TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, __entry->pmd) >> ); >> >> +TRACE_EVENT(hugepage_set_pud, > > Given we have trace_hugepage_set_pud and trace_hugepage_set_pmd, doing > the exact same thing, we can as well have a > DECLARE_EVENT_CLASS(hugepage_set) and then DEFINE_EVENT using this event class. > >> + >> + TP_PROTO(unsigned long addr, unsigned long pud), >> + TP_ARGS(addr, pud), >> + TP_STRUCT__entry( >> + __field(unsigned long, addr) >> + __field(unsigned long, pud) >> + ), >> + >> + TP_fast_assign( >> + __entry->addr = addr; >> + __entry->pud = pud; >> + ), >> + >> + TP_printk("Set pud with 0x%lx with 0x%lx", __entry->addr, __entry->pud) >> + ); >> + >> >> TRACE_EVENT(hugepage_update, > > Same here. We can have a DECLARE_EVENT_CLASS(hugepage_update) and then > DEFINE_EVENT for hugepage_update_pmd and hugepage_update_pud. updated. I created a patch that first switch the existing tracepoint to DECLARE_EVENT_CLASS. -aneesh
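For reference, a minimal sketch of the suggested event-class refactor, using the standard tracepoint macros (the field and event names here are illustrative; the exact naming in the follow-up patch may differ). DECLARE_EVENT_CLASS factors out the shared TP_STRUCT__entry/TP_fast_assign/TP_printk boilerplate so each event becomes a small DEFINE_EVENT stub:

DECLARE_EVENT_CLASS(hugepage_set,

	TP_PROTO(unsigned long addr, unsigned long pte),
	TP_ARGS(addr, pte),
	TP_STRUCT__entry(
		__field(unsigned long, addr)
		__field(unsigned long, pte)
	),

	TP_fast_assign(
		__entry->addr = addr;
		__entry->pte = pte;
	),

	/* one format string shared by the pmd and pud events */
	TP_printk("Set page table entry with 0x%lx with 0x%lx",
		  __entry->addr, __entry->pte)
);

DEFINE_EVENT(hugepage_set, hugepage_set_pmd,
	TP_PROTO(unsigned long addr, unsigned long pmd),
	TP_ARGS(addr, pmd)
);

DEFINE_EVENT(hugepage_set, hugepage_set_pud,
	TP_PROTO(unsigned long addr, unsigned long pud),
	TP_ARGS(addr, pud)
);

The same transformation applies to hugepage_update, with (addr, pte, clr, set) arguments shared between hugepage_update_pmd and hugepage_update_pud.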
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 4acc9690f599..9a05de007956 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -921,8 +921,29 @@ static inline pud_t pte_pud(pte_t pte) { return __pud_raw(pte_raw(pte)); } + +static inline pte_t *pudp_ptep(pud_t *pud) +{ + return (pte_t *)pud; +} + +#define pud_pfn(pud) pte_pfn(pud_pte(pud)) +#define pud_dirty(pud) pte_dirty(pud_pte(pud)) +#define pud_young(pud) pte_young(pud_pte(pud)) +#define pud_mkold(pud) pte_pud(pte_mkold(pud_pte(pud))) +#define pud_wrprotect(pud) pte_pud(pte_wrprotect(pud_pte(pud))) +#define pud_mkdirty(pud) pte_pud(pte_mkdirty(pud_pte(pud))) +#define pud_mkclean(pud) pte_pud(pte_mkclean(pud_pte(pud))) +#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) +#define pud_mkwrite(pud) pte_pud(pte_mkwrite(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY +#define pud_soft_dirty(pmd) pte_soft_dirty(pud_pte(pud)) +#define pud_mksoft_dirty(pmd) pte_pud(pte_mksoft_dirty(pud_pte(pud))) +#define pud_clear_soft_dirty(pmd) pte_pud(pte_clear_soft_dirty(pud_pte(pud))) +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ + static inline int pud_bad(pud_t pud) { if (radix_enabled()) @@ -1115,15 +1136,24 @@ static inline bool pmd_access_permitted(pmd_t pmd, bool write) #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); +extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot); extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); +extern void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud); + static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { } +static inline void update_mmu_cache_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) +{ +} + extern int hash__has_transparent_hugepage(void); static inline int has_transparent_hugepage(void) { @@ -1133,6 +1163,14 @@ static inline int has_transparent_hugepage(void) } #define has_transparent_hugepage has_transparent_hugepage +static inline int has_transparent_pud_hugepage(void) +{ + if (radix_enabled()) + return radix__has_transparent_pud_hugepage(); + return 0; +} +#define has_transparent_pud_hugepage has_transparent_pud_hugepage + static inline unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long clr, unsigned long set) @@ -1142,6 +1180,16 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set); } +static inline unsigned long +pud_hugepage_update(struct mm_struct *mm, unsigned long addr, pud_t *pudp, + unsigned long clr, unsigned long set) +{ + if (radix_enabled()) + return radix__pud_hugepage_update(mm, addr, pudp, clr, set); + BUG(); + return pud_val(*pudp); +} + /* * returns true for pmd migration entries, THP, devmap, hugetlb * But compile time dependent on THP config @@ -1151,6 +1199,11 @@ static inline int pmd_large(pmd_t pmd) return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)); } +static inline int pud_large(pud_t pud) +{ + return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); +} + /* * For radix we should always find H_PAGE_HASHPTE zero. 
Hence * the below will work for radix too @@ -1166,6 +1219,17 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, return ((old & _PAGE_ACCESSED) != 0); } +static inline int __pudp_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + unsigned long old; + + if ((pud_raw(*pudp) & cpu_to_be64(_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) + return 0; + old = pud_hugepage_update(mm, addr, pudp, _PAGE_ACCESSED, 0); + return ((old & _PAGE_ACCESSED) != 0); +} + #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) @@ -1174,6 +1238,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); } +#define __HAVE_ARCH_PUDP_SET_WRPROTECT +static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long addr, + pud_t *pudp) +{ + if (pud_write(*pudp)) + pud_hugepage_update(mm, addr, pudp, _PAGE_WRITE, 0); +} + /* * Only returns true for a THP. False for pmd migration entry. * We also need to return true when we come across a pte that @@ -1195,6 +1267,17 @@ static inline int pmd_trans_huge(pmd_t pmd) return hash__pmd_trans_huge(pmd); } +static inline int pud_trans_huge(pud_t pud) +{ + if (!pud_present(pud)) + return false; + + if (radix_enabled()) + return radix__pud_trans_huge(pud); + return 0; +} + + #define __HAVE_ARCH_PMD_SAME static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { @@ -1203,6 +1286,16 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) return hash__pmd_same(pmd_a, pmd_b); } +#define __HAVE_ARCH_PUD_SAME +static inline int pud_same(pud_t pud_a, pud_t pud_b) +{ + if (radix_enabled()) + return radix__pud_same(pud_a, pud_b); + BUG(); + return 0; +} + + static inline pmd_t __pmd_mkhuge(pmd_t pmd) { if (radix_enabled()) @@ -1210,6 +1303,14 @@ static inline pmd_t __pmd_mkhuge(pmd_t pmd) return hash__pmd_mkhuge(pmd); } +static inline pud_t __pud_mkhuge(pud_t pud) +{ + if (radix_enabled()) + return radix__pud_mkhuge(pud); + BUG(); + return pud; +} + /* * pfn_pmd return a pmd_t that can be used as pmd pte entry. 
*/ @@ -1225,14 +1326,34 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd) return pmd; } +static inline pud_t pud_mkhuge(pud_t pud) +{ +#ifdef CONFIG_DEBUG_VM + if (radix_enabled()) + WARN_ON((pud_raw(pud) & cpu_to_be64(_PAGE_PTE)) == 0); + else + WARN_ON(1); +#endif + return pud; +} + + #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); +#define __HAVE_ARCH_PUDP_SET_ACCESS_FLAGS +extern int pudp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + pud_t entry, int dirty); #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); +#define __HAVE_ARCH_PUDP_TEST_AND_CLEAR_YOUNG +extern int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp); + #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, @@ -1243,6 +1364,16 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, return hash__pmdp_huge_get_and_clear(mm, addr, pmdp); } +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + if (radix_enabled()) + return radix__pudp_huge_get_and_clear(mm, addr, pudp); + BUG(); + return *pudp; +} + static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { @@ -1257,6 +1388,11 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, int full); +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, + pud_t *pudp, int full); + #define __HAVE_ARCH_PGTABLE_DEPOSIT static inline void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) @@ -1305,6 +1441,14 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) return hash__pmd_mkdevmap(pmd); } +static inline pud_t pud_mkdevmap(pud_t pud) +{ + if (radix_enabled()) + return radix__pud_mkdevmap(pud); + BUG(); + return pud; +} + static inline int pmd_devmap(pmd_t pmd) { return pte_devmap(pmd_pte(pmd)); @@ -1312,7 +1456,7 @@ static inline int pmd_devmap(pmd_t pmd) static inline int pud_devmap(pud_t pud) { - return 0; + return pte_devmap(pud_pte(pud)); } static inline int pgd_devmap(pgd_t pgd) @@ -1321,16 +1465,6 @@ static inline int pgd_devmap(pgd_t pgd) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline int pud_pfn(pud_t pud) -{ - /* - * Currently all calls to pud_pfn() are gated around a pud_devmap() - * check so this should never be used. If it grows another user we - * want to know about it. 
- */ - BUILD_BUG(); - return 0; -} #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *); void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 686001eda936..8cdff5a05011 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -250,6 +250,10 @@ static inline int radix__pud_bad(pud_t pud) return !!(pud_val(pud) & RADIX_PUD_BAD_BITS); } +static inline int radix__pud_same(pud_t pud_a, pud_t pud_b) +{ + return ((pud_raw(pud_a) ^ pud_raw(pud_b)) == 0); +} static inline int radix__p4d_bad(p4d_t p4d) { @@ -268,9 +272,22 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) return __pmd(pmd_val(pmd) | _PAGE_PTE); } +static inline int radix__pud_trans_huge(pud_t pud) +{ + return (pud_val(pud) & (_PAGE_PTE | _PAGE_DEVMAP)) == _PAGE_PTE; +} + +static inline pud_t radix__pud_mkhuge(pud_t pud) +{ + return __pud(pud_val(pud) | _PAGE_PTE); +} + extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long clr, unsigned long set); +extern unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, unsigned long clr, + unsigned long set); extern pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, @@ -278,6 +295,9 @@ extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp); + static inline int radix__has_transparent_hugepage(void) { /* For radix 2M at PMD level means thp */ @@ -285,6 +305,14 @@ static inline int radix__has_transparent_hugepage(void) return 1; return 0; } + +static inline int radix__has_transparent_pud_hugepage(void) +{ + /* For radix 1G at PUD level means pud hugepage support */ + if (mmu_psize_defs[MMU_PAGE_1G].shift == PUD_SHIFT) + return 1; + return 0; +} #endif static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd) @@ -292,9 +320,18 @@ static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd) return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP)); } +static inline pud_t radix__pud_mkdevmap(pud_t pud) +{ + return __pud(pud_val(pud) | (_PAGE_PTE | _PAGE_DEVMAP)); +} + +struct vmem_altmap; +struct dev_pagemap; extern int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys); +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap); extern void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size); diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h index 77797a2a82eb..a38542259fab 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -68,6 +68,8 @@ void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize); extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); +extern 
void radix__flush_pud_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end); diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 0d0c1447ecf0..a01c20a8fbf7 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -50,6 +50,14 @@ static inline void flush_pmd_tlb_range(struct vm_area_struct *vma, radix__flush_pmd_tlb_range(vma, start, end); } +#define __HAVE_ARCH_FLUSH_PUD_TLB_RANGE +static inline void flush_pud_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (radix_enabled()) + radix__flush_pud_tlb_range(vma, start, end); +} + #define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start, diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 85c84e89e3ea..9e5f01a1738c 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -64,11 +64,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, return changed; } +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp, pud_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pud_devmap(*pudp)); + assert_spin_locked(pud_lockptr(vma->vm_mm, pudp)); +#endif + changed = !pud_same(*(pudp), entry); + if (changed) { + /* + * We can use MMU_PAGE_2M here, because only radix + * path look at the psize. + */ + __ptep_set_access_flags(vma, pudp_ptep(pudp), + pud_pte(entry), address, MMU_PAGE_1G); + } + return changed; +} + + int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); } + +int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp) +{ + return __pudp_test_and_clear_young(vma->vm_mm, address, pudp); +} + /* * set a new huge pmd. We should not be called for updating * an existing pmd entry. That should go via pmd_hugepage_update. @@ -90,6 +118,23 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); } +void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ +#ifdef CONFIG_DEBUG_VM + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. 
+ */ + + WARN_ON(pte_hw_valid(pud_pte(*pudp))); + assert_spin_locked(pud_lockptr(mm, pudp)); + WARN_ON(!(pud_large(pud))); +#endif + trace_hugepage_set_pud(addr, pud_val(pud)); + return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud)); +} + static void do_serialize(void *arg) { /* We've taken the IPI, so try to trim the mask while here */ @@ -147,11 +192,35 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, return pmd; } +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp, int full) +{ + pud_t pud; + + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); + VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) || + !pud_present(*pudp)); + pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp); + /* + * if it not a fullmm flush, then we can possibly end up converting + * this PMD pte entry to a regular level 0 PTE by a parallel page fault. + * Make sure we flush the tlb in this case. + */ + if (!full) + flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE); + return pud; +} + static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) { return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); } +static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot) +{ + return __pud(pud_val(pud) | pgprot_val(pgprot)); +} + /* * At some point we should be able to get rid of * pmd_mkhuge() and mk_huge_pmd() when we update all the @@ -166,6 +235,15 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot)); } +pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot) +{ + unsigned long pudv; + + pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; + + return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot)); +} + pmd_t mk_pmd(struct page *page, pgprot_t pgprot) { return pfn_pmd(page_to_pfn(page), pgprot); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 76f6a1f3b9d8..d7e2dd3d4add 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -965,6 +965,23 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add return old; } +unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, unsigned long clr, + unsigned long set) +{ + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pud_devmap(*pudp)); + assert_spin_locked(pud_lockptr(mm, pudp)); +#endif + + old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1); + trace_hugepage_update(addr, old, clr, set); + + return old; +} + pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) @@ -1041,6 +1058,17 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, return old_pmd; } +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + pud_t old_pud; + unsigned long old; + + old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0); + old_pud = __pud(old); + return old_pud; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index ce804b7bf84e..a18f7d2c9f63 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -1453,6 +1453,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, } EXPORT_SYMBOL(radix__flush_pmd_tlb_range); +void radix__flush_pud_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + 
radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G); +} +EXPORT_SYMBOL(radix__flush_pud_tlb_range); + void radix__flush_tlb_all(void) { unsigned long rb,prs,r,rs; diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 45fd975ef521..340b86ef7284 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -94,6 +94,7 @@ config PPC_BOOK3S_64 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_SPLIT_PMD_PTLOCK select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h index 202b3e3e67ff..a919d943d106 100644 --- a/include/trace/events/thp.h +++ b/include/trace/events/thp.h @@ -25,6 +25,23 @@ TRACE_EVENT(hugepage_set_pmd, TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, __entry->pmd) ); +TRACE_EVENT(hugepage_set_pud, + + TP_PROTO(unsigned long addr, unsigned long pud), + TP_ARGS(addr, pud), + TP_STRUCT__entry( + __field(unsigned long, addr) + __field(unsigned long, pud) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->pud = pud; + ), + + TP_printk("Set pud with 0x%lx with 0x%lx", __entry->addr, __entry->pud) + ); + TRACE_EVENT(hugepage_update,
This is enabled only with radix translation and 1G hugepage size. This will be used with devdax device memory with a namespace alignment of 1G.

Anon transparent hugepages are not supported even though we do have helpers checking pud_trans_huge(); those should never return true. The only expected pte bit combination is _PAGE_PTE | _PAGE_DEVMAP.

Some of the helpers are never expected to be called on hash translation and hence are marked to call BUG() in such a case.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 156 ++++++++++++++++--
 arch/powerpc/include/asm/book3s/64/radix.h    |  37 +++++
 .../include/asm/book3s/64/tlbflush-radix.h    |   2 +
 arch/powerpc/include/asm/book3s/64/tlbflush.h |   8 +
 arch/powerpc/mm/book3s64/pgtable.c            |  78 +++++++++
 arch/powerpc/mm/book3s64/radix_pgtable.c      |  28 ++++
 arch/powerpc/mm/book3s64/radix_tlb.c          |   7 +
 arch/powerpc/platforms/Kconfig.cputype        |   1 +
 include/trace/events/thp.h                    |  17 ++
 9 files changed, 323 insertions(+), 11 deletions(-)
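For context, the helpers added here are consumed by the generic mm PUD fault path rather than by arch code directly. Below is a simplified, non-authoritative sketch modelled on mm/huge_memory.c's insert_pfn_pud() from this era, showing where pud_mkhuge(), pud_mkdevmap() and set_pud_at() come into play; page-table locking, maybe_pud_mkwrite() and error handling are omitted, and the function name is illustrative, not from this patch:

/*
 * Simplified sketch modelled on mm/huge_memory.c:insert_pfn_pud().
 * Locking and error paths are omitted; name is illustrative.
 */
static void sketch_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
				  pud_t *pudp, pfn_t pfn, pgprot_t prot, bool write)
{
	struct mm_struct *mm = vma->vm_mm;
	pud_t entry;

	/* On ppc64 radix, pfn_pud() already sets _PAGE_PTE via __pud_mkhuge() */
	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pud_mkdevmap(entry);	/* adds _PAGE_DEVMAP */
	if (write)
		entry = pud_mkyoung(pud_mkdirty(entry));
	/* The arch hook added by this patch; expects the HW valid bit clear */
	set_pud_at(mm, addr, pudp, entry);
	update_mmu_cache_pud(vma, addr, pudp);
}

This is why the only expected bit combination on a devdax PUD entry is _PAGE_PTE | _PAGE_DEVMAP: there is no anonymous-THP path that would install a PUD entry without the devmap bit.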