Message ID: 20240412073740.294272-2-21cnbao@gmail.com
State: New
Series: mm: add per-order mTHP alloc and swpout counters
Hi Barry, 2 remaining comments - otherwise looks good. (same comments I just made in the v4 conversation). On 12/04/2024 08:37, Barry Song wrote: > From: Barry Song <v-songbaohua@oppo.com> > > Profiling a system blindly with mTHP has become challenging due to the > lack of visibility into its operations. Presenting the success rate of > mTHP allocations appears to be pressing need. > > Recently, I've been experiencing significant difficulty debugging > performance improvements and regressions without these figures. It's > crucial for us to understand the true effectiveness of mTHP in real-world > scenarios, especially in systems with fragmented memory. > > This patch establishes the framework for per-order mTHP > counters. It begins by introducing the anon_fault_alloc and > anon_fault_fallback counters. Additionally, to maintain consistency > with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks > anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP. > Incorporating additional counters should now be straightforward as well. > > Signed-off-by: Barry Song <v-songbaohua@oppo.com> > Cc: Chris Li <chrisl@kernel.org> > Cc: David Hildenbrand <david@redhat.com> > Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com> > Cc: Kairui Song <kasong@tencent.com> > Cc: Matthew Wilcox (Oracle) <willy@infradead.org> > Cc: Peter Xu <peterx@redhat.com> > Cc: Ryan Roberts <ryan.roberts@arm.com> > Cc: Suren Baghdasaryan <surenb@google.com> > Cc: Yosry Ahmed <yosryahmed@google.com> > Cc: Yu Zhao <yuzhao@google.com> > --- > include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++ > mm/huge_memory.c | 61 +++++++++++++++++++++++++++++++++++++++++ > mm/memory.c | 3 ++ > mm/page_alloc.c | 4 +++ > 4 files changed, 119 insertions(+) > > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > index e896ca4760f6..c5beb54b97cb 100644 > --- a/include/linux/huge_mm.h > +++ b/include/linux/huge_mm.h > @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, > enforce_sysfs, orders); > } > > +enum mthp_stat_item { > + MTHP_STAT_ANON_FAULT_ALLOC, > + MTHP_STAT_ANON_FAULT_FALLBACK, > + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, > + __MTHP_STAT_COUNT > +}; > + > +struct mthp_stat { > + unsigned long stats[0][__MTHP_STAT_COUNT]; > +}; > + > +extern struct mthp_stat __percpu *mthp_stats; > + > +static inline void count_mthp_stat(int order, enum mthp_stat_item item) > +{ > + if (order <= 0 || order > PMD_ORDER || !mthp_stats) > + return; > + > + this_cpu_inc(mthp_stats->stats[order][item]); > +} > + > +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta) > +{ > + if (order <= 0 || order > PMD_ORDER || !mthp_stats) > + return; > + > + this_cpu_add(mthp_stats->stats[order][item], delta); > +} > + > +/* > + * Fold the foreign cpu mthp stats into our own. > + * > + * This is adding to the stats on one processor > + * but keeps the global counts constant. > + */ > +static inline void mthp_stats_fold_cpu(int cpu) > +{ > + struct mthp_stat *fold_stat; > + int i, j; > + > + if (!mthp_stats) > + return; > + fold_stat = per_cpu_ptr(mthp_stats, cpu); > + for (i = 1; i <= PMD_ORDER; i++) { > + for (j = 0; j < __MTHP_STAT_COUNT; j++) { > + count_mthp_stats(i, j, fold_stat->stats[i][j]); > + fold_stat->stats[i][j] = 0; > + } > + } > +} This is a pretty horrible hack; I'm pretty sure just summing for all *possible* cpus should work. 
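A minimal sketch of the "sum over all possible CPUs" approach suggested above, reusing the stats layout from this patch; it is not the final code, but it shows why the hotplug fold becomes unnecessary, since a per-cpu area is preserved when its CPU goes offline:

static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
{
	unsigned long sum = 0;
	int cpu;

	/*
	 * Walking the possible mask needs no cpus_read_lock() and still
	 * sees counts accumulated by CPUs that have since gone offline.
	 */
	for_each_possible_cpu(cpu) {
		struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);

		sum += this->stats[order][item];
	}

	return sum;
}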
> + > #define transparent_hugepage_use_zero_page() \ > (transparent_hugepage_flags & \ > (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index dc30139590e6..21c4ac74b484 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = { > .sysfs_ops = &kobj_sysfs_ops, > }; > > +struct mthp_stat __percpu *mthp_stats; > + > +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) > +{ > + unsigned long sum = 0; > + int cpu; > + > + cpus_read_lock(); > + for_each_online_cpu(cpu) { > + struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu); > + > + sum += this->stats[order][item]; > + } > + cpus_read_unlock(); > + > + return sum; > +} > + > +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \ > +static ssize_t _name##_show(struct kobject *kobj, \ > + struct kobj_attribute *attr, char *buf) \ > +{ \ > + int order = to_thpsize(kobj)->order; \ > + \ > + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ > +} \ > +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) > + > +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); > +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); > +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > + > +static struct attribute *stats_attrs[] = { > + &anon_fault_alloc_attr.attr, > + &anon_fault_fallback_attr.attr, > + &anon_fault_fallback_charge_attr.attr, > + NULL, > +}; > + > +static struct attribute_group stats_attr_group = { > + .name = "stats", > + .attrs = stats_attrs, > +}; > + > static struct thpsize *thpsize_create(int order, struct kobject *parent) > { > unsigned long size = (PAGE_SIZE << order) / SZ_1K; > @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent) > return ERR_PTR(ret); > } > > + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); > + if (ret) { > + kobject_put(&thpsize->kobj); > + return ERR_PTR(ret); > + } > + > thpsize->order = order; > return thpsize; > } > @@ -691,6 +741,11 @@ static int __init hugepage_init(void) > */ > MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); > > + mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]), > + sizeof(unsigned long)); Personally I think it would be cleaner to allocate statically using ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER. 
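A sketch of the static-allocation variant being suggested, assuming ilog2(MAX_PTRS_PER_PTE) is a valid compile-time upper bound for PMD_ORDER (the point debated later in this thread). Sizing the array statically drops the runtime __alloc_percpu() and the !mthp_stats checks; the DEFINE_PER_CPU would live in mm/huge_memory.c with a matching DECLARE_PER_CPU in the header:

/* Per-cpu counters for orders 0..ilog2(MAX_PTRS_PER_PTE); order 0 unused. */
struct mthp_stat {
	unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
};

DEFINE_PER_CPU(struct mthp_stat, mthp_stats);

static inline void count_mthp_stat(int order, enum mthp_stat_item item)
{
	if (order <= 0 || order > PMD_ORDER)
		return;

	this_cpu_inc(mthp_stats.stats[order][item]);
}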
> + if (!mthp_stats) > + return -ENOMEM; > + > err = hugepage_init_sysfs(&hugepage_kobj); > if (err) > goto err_sysfs; > @@ -725,6 +780,8 @@ static int __init hugepage_init(void) > err_slab: > hugepage_exit_sysfs(hugepage_kobj); > err_sysfs: > + free_percpu(mthp_stats); > + mthp_stats = NULL; > return err; > } > subsys_initcall(hugepage_init); > @@ -880,6 +937,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, > folio_put(folio); > count_vm_event(THP_FAULT_FALLBACK); > count_vm_event(THP_FAULT_FALLBACK_CHARGE); > + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); > + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > return VM_FAULT_FALLBACK; > } > folio_throttle_swaprate(folio, gfp); > @@ -929,6 +988,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, > mm_inc_nr_ptes(vma->vm_mm); > spin_unlock(vmf->ptl); > count_vm_event(THP_FAULT_ALLOC); > + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); > count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); > } > > @@ -1050,6 +1110,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) > folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); > if (unlikely(!folio)) { > count_vm_event(THP_FAULT_FALLBACK); > + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); > return VM_FAULT_FALLBACK; > } > return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); > diff --git a/mm/memory.c b/mm/memory.c > index 649a547fe8e3..06048af7cf9a 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -4368,6 +4368,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) > folio = vma_alloc_folio(gfp, order, vma, addr, true); > if (folio) { > if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > folio_put(folio); > goto next; > } > @@ -4376,6 +4377,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) > return folio; > } > next: > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); > order = next_order(&orders, order); > } > > @@ -4485,6 +4487,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) > > folio_ref_add(folio, nr_pages - 1); > add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); > + count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); > folio_add_new_anon_rmap(folio, vma, addr); > folio_add_lru_vma(folio, vma); > setpte: > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index b51becf03d1e..3135b5ca2457 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -5840,6 +5840,10 @@ static int page_alloc_cpu_dead(unsigned int cpu) > */ > vm_events_fold_cpu(cpu); > > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + mthp_stats_fold_cpu(cpu); > +#endif > + > /* > * Zero the differential counters of the dead processor > * so that the vm statistics are consistent.
On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote: > > Hi Barry, > > 2 remaining comments - otherwise looks good. (same comments I just made in the > v4 conversation). > > On 12/04/2024 08:37, Barry Song wrote: > > From: Barry Song <v-songbaohua@oppo.com> > > > > Profiling a system blindly with mTHP has become challenging due to the > > lack of visibility into its operations. Presenting the success rate of > > mTHP allocations appears to be pressing need. > > > > Recently, I've been experiencing significant difficulty debugging > > performance improvements and regressions without these figures. It's > > crucial for us to understand the true effectiveness of mTHP in real-world > > scenarios, especially in systems with fragmented memory. > > > > This patch establishes the framework for per-order mTHP > > counters. It begins by introducing the anon_fault_alloc and > > anon_fault_fallback counters. Additionally, to maintain consistency > > with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks > > anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP. > > Incorporating additional counters should now be straightforward as well. > > > > Signed-off-by: Barry Song <v-songbaohua@oppo.com> > > Cc: Chris Li <chrisl@kernel.org> > > Cc: David Hildenbrand <david@redhat.com> > > Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com> > > Cc: Kairui Song <kasong@tencent.com> > > Cc: Matthew Wilcox (Oracle) <willy@infradead.org> > > Cc: Peter Xu <peterx@redhat.com> > > Cc: Ryan Roberts <ryan.roberts@arm.com> > > Cc: Suren Baghdasaryan <surenb@google.com> > > Cc: Yosry Ahmed <yosryahmed@google.com> > > Cc: Yu Zhao <yuzhao@google.com> > > --- > > include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++ > > mm/huge_memory.c | 61 +++++++++++++++++++++++++++++++++++++++++ > > mm/memory.c | 3 ++ > > mm/page_alloc.c | 4 +++ > > 4 files changed, 119 insertions(+) > > > > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > > index e896ca4760f6..c5beb54b97cb 100644 > > --- a/include/linux/huge_mm.h > > +++ b/include/linux/huge_mm.h > > @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, > > enforce_sysfs, orders); > > } > > > > +enum mthp_stat_item { > > + MTHP_STAT_ANON_FAULT_ALLOC, > > + MTHP_STAT_ANON_FAULT_FALLBACK, > > + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, > > + __MTHP_STAT_COUNT > > +}; > > + > > +struct mthp_stat { > > + unsigned long stats[0][__MTHP_STAT_COUNT]; > > +}; > > + > > +extern struct mthp_stat __percpu *mthp_stats; > > + > > +static inline void count_mthp_stat(int order, enum mthp_stat_item item) > > +{ > > + if (order <= 0 || order > PMD_ORDER || !mthp_stats) > > + return; > > + > > + this_cpu_inc(mthp_stats->stats[order][item]); > > +} > > + > > +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta) > > +{ > > + if (order <= 0 || order > PMD_ORDER || !mthp_stats) > > + return; > > + > > + this_cpu_add(mthp_stats->stats[order][item], delta); > > +} > > + > > +/* > > + * Fold the foreign cpu mthp stats into our own. > > + * > > + * This is adding to the stats on one processor > > + * but keeps the global counts constant. 
> > + */ > > +static inline void mthp_stats_fold_cpu(int cpu) > > +{ > > + struct mthp_stat *fold_stat; > > + int i, j; > > + > > + if (!mthp_stats) > > + return; > > + fold_stat = per_cpu_ptr(mthp_stats, cpu); > > + for (i = 1; i <= PMD_ORDER; i++) { > > + for (j = 0; j < __MTHP_STAT_COUNT; j++) { > > + count_mthp_stats(i, j, fold_stat->stats[i][j]); > > + fold_stat->stats[i][j] = 0; > > + } > > + } > > +} > > This is a pretty horrible hack; I'm pretty sure just summing for all *possible* > cpus should work. > > > + > > #define transparent_hugepage_use_zero_page() \ > > (transparent_hugepage_flags & \ > > (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > > index dc30139590e6..21c4ac74b484 100644 > > --- a/mm/huge_memory.c > > +++ b/mm/huge_memory.c > > @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = { > > .sysfs_ops = &kobj_sysfs_ops, > > }; > > > > +struct mthp_stat __percpu *mthp_stats; > > + > > +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) > > +{ > > + unsigned long sum = 0; > > + int cpu; > > + > > + cpus_read_lock(); > > + for_each_online_cpu(cpu) { > > + struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu); > > + > > + sum += this->stats[order][item]; > > + } > > + cpus_read_unlock(); > > + > > + return sum; > > +} > > + > > +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \ > > +static ssize_t _name##_show(struct kobject *kobj, \ > > + struct kobj_attribute *attr, char *buf) \ > > +{ \ > > + int order = to_thpsize(kobj)->order; \ > > + \ > > + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ > > +} \ > > +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) > > + > > +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); > > +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); > > +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > > + > > +static struct attribute *stats_attrs[] = { > > + &anon_fault_alloc_attr.attr, > > + &anon_fault_fallback_attr.attr, > > + &anon_fault_fallback_charge_attr.attr, > > + NULL, > > +}; > > + > > +static struct attribute_group stats_attr_group = { > > + .name = "stats", > > + .attrs = stats_attrs, > > +}; > > + > > static struct thpsize *thpsize_create(int order, struct kobject *parent) > > { > > unsigned long size = (PAGE_SIZE << order) / SZ_1K; > > @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent) > > return ERR_PTR(ret); > > } > > > > + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); > > + if (ret) { > > + kobject_put(&thpsize->kobj); > > + return ERR_PTR(ret); > > + } > > + > > thpsize->order = order; > > return thpsize; > > } > > @@ -691,6 +741,11 @@ static int __init hugepage_init(void) > > */ > > MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); > > > > + mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]), > > + sizeof(unsigned long)); > > Personally I think it would be cleaner to allocate statically using > ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER. Hi Ryan, I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64, #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) #define MAX_PTRS_PER_PTE PTRS_PER_PTE #define PTRS_PER_PTE (1 << (PAGE_SHIFT - 3)) while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number? Am I missing something? 
> > > + if (!mthp_stats) > > + return -ENOMEM; > > + > > err = hugepage_init_sysfs(&hugepage_kobj); > > if (err) > > goto err_sysfs; > > @@ -725,6 +780,8 @@ static int __init hugepage_init(void) > > err_slab: > > hugepage_exit_sysfs(hugepage_kobj); > > err_sysfs: > > + free_percpu(mthp_stats); > > + mthp_stats = NULL; > > return err; > > } > > subsys_initcall(hugepage_init); > > @@ -880,6 +937,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, > > folio_put(folio); > > count_vm_event(THP_FAULT_FALLBACK); > > count_vm_event(THP_FAULT_FALLBACK_CHARGE); > > + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); > > + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > > return VM_FAULT_FALLBACK; > > } > > folio_throttle_swaprate(folio, gfp); > > @@ -929,6 +988,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, > > mm_inc_nr_ptes(vma->vm_mm); > > spin_unlock(vmf->ptl); > > count_vm_event(THP_FAULT_ALLOC); > > + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); > > count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); > > } > > > > @@ -1050,6 +1110,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) > > folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); > > if (unlikely(!folio)) { > > count_vm_event(THP_FAULT_FALLBACK); > > + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); > > return VM_FAULT_FALLBACK; > > } > > return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); > > diff --git a/mm/memory.c b/mm/memory.c > > index 649a547fe8e3..06048af7cf9a 100644 > > --- a/mm/memory.c > > +++ b/mm/memory.c > > @@ -4368,6 +4368,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) > > folio = vma_alloc_folio(gfp, order, vma, addr, true); > > if (folio) { > > if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { > > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > > folio_put(folio); > > goto next; > > } > > @@ -4376,6 +4377,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) > > return folio; > > } > > next: > > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); > > order = next_order(&orders, order); > > } > > > > @@ -4485,6 +4487,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) > > > > folio_ref_add(folio, nr_pages - 1); > > add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); > > + count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); > > folio_add_new_anon_rmap(folio, vma, addr); > > folio_add_lru_vma(folio, vma); > > setpte: > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > > index b51becf03d1e..3135b5ca2457 100644 > > --- a/mm/page_alloc.c > > +++ b/mm/page_alloc.c > > @@ -5840,6 +5840,10 @@ static int page_alloc_cpu_dead(unsigned int cpu) > > */ > > vm_events_fold_cpu(cpu); > > > > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > > + mthp_stats_fold_cpu(cpu); > > +#endif > > + > > /* > > * Zero the differential counters of the dead processor > > * so that the vm statistics are consistent. >
On 12/04/2024 10:43, Barry Song wrote: > On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote: >> >> Hi Barry, >> >> 2 remaining comments - otherwise looks good. (same comments I just made in the >> v4 conversation). >> >> On 12/04/2024 08:37, Barry Song wrote: >>> From: Barry Song <v-songbaohua@oppo.com> >>> >>> Profiling a system blindly with mTHP has become challenging due to the >>> lack of visibility into its operations. Presenting the success rate of >>> mTHP allocations appears to be pressing need. >>> >>> Recently, I've been experiencing significant difficulty debugging >>> performance improvements and regressions without these figures. It's >>> crucial for us to understand the true effectiveness of mTHP in real-world >>> scenarios, especially in systems with fragmented memory. >>> >>> This patch establishes the framework for per-order mTHP >>> counters. It begins by introducing the anon_fault_alloc and >>> anon_fault_fallback counters. Additionally, to maintain consistency >>> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks >>> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP. >>> Incorporating additional counters should now be straightforward as well. >>> >>> Signed-off-by: Barry Song <v-songbaohua@oppo.com> >>> Cc: Chris Li <chrisl@kernel.org> >>> Cc: David Hildenbrand <david@redhat.com> >>> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com> >>> Cc: Kairui Song <kasong@tencent.com> >>> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> >>> Cc: Peter Xu <peterx@redhat.com> >>> Cc: Ryan Roberts <ryan.roberts@arm.com> >>> Cc: Suren Baghdasaryan <surenb@google.com> >>> Cc: Yosry Ahmed <yosryahmed@google.com> >>> Cc: Yu Zhao <yuzhao@google.com> >>> --- >>> include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++ >>> mm/huge_memory.c | 61 +++++++++++++++++++++++++++++++++++++++++ >>> mm/memory.c | 3 ++ >>> mm/page_alloc.c | 4 +++ >>> 4 files changed, 119 insertions(+) >>> >>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h >>> index e896ca4760f6..c5beb54b97cb 100644 >>> --- a/include/linux/huge_mm.h >>> +++ b/include/linux/huge_mm.h >>> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, >>> enforce_sysfs, orders); >>> } >>> >>> +enum mthp_stat_item { >>> + MTHP_STAT_ANON_FAULT_ALLOC, >>> + MTHP_STAT_ANON_FAULT_FALLBACK, >>> + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, >>> + __MTHP_STAT_COUNT >>> +}; >>> + >>> +struct mthp_stat { >>> + unsigned long stats[0][__MTHP_STAT_COUNT]; >>> +}; >>> + >>> +extern struct mthp_stat __percpu *mthp_stats; >>> + >>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item) >>> +{ >>> + if (order <= 0 || order > PMD_ORDER || !mthp_stats) >>> + return; >>> + >>> + this_cpu_inc(mthp_stats->stats[order][item]); >>> +} >>> + >>> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta) >>> +{ >>> + if (order <= 0 || order > PMD_ORDER || !mthp_stats) >>> + return; >>> + >>> + this_cpu_add(mthp_stats->stats[order][item], delta); >>> +} >>> + >>> +/* >>> + * Fold the foreign cpu mthp stats into our own. >>> + * >>> + * This is adding to the stats on one processor >>> + * but keeps the global counts constant. 
>>> + */ >>> +static inline void mthp_stats_fold_cpu(int cpu) >>> +{ >>> + struct mthp_stat *fold_stat; >>> + int i, j; >>> + >>> + if (!mthp_stats) >>> + return; >>> + fold_stat = per_cpu_ptr(mthp_stats, cpu); >>> + for (i = 1; i <= PMD_ORDER; i++) { >>> + for (j = 0; j < __MTHP_STAT_COUNT; j++) { >>> + count_mthp_stats(i, j, fold_stat->stats[i][j]); >>> + fold_stat->stats[i][j] = 0; >>> + } >>> + } >>> +} >> >> This is a pretty horrible hack; I'm pretty sure just summing for all *possible* >> cpus should work. >> >>> + >>> #define transparent_hugepage_use_zero_page() \ >>> (transparent_hugepage_flags & \ >>> (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) >>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >>> index dc30139590e6..21c4ac74b484 100644 >>> --- a/mm/huge_memory.c >>> +++ b/mm/huge_memory.c >>> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = { >>> .sysfs_ops = &kobj_sysfs_ops, >>> }; >>> >>> +struct mthp_stat __percpu *mthp_stats; >>> + >>> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) >>> +{ >>> + unsigned long sum = 0; >>> + int cpu; >>> + >>> + cpus_read_lock(); >>> + for_each_online_cpu(cpu) { >>> + struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu); >>> + >>> + sum += this->stats[order][item]; >>> + } >>> + cpus_read_unlock(); >>> + >>> + return sum; >>> +} >>> + >>> +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \ >>> +static ssize_t _name##_show(struct kobject *kobj, \ >>> + struct kobj_attribute *attr, char *buf) \ >>> +{ \ >>> + int order = to_thpsize(kobj)->order; \ >>> + \ >>> + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ >>> +} \ >>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) >>> + >>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); >>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); >>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >>> + >>> +static struct attribute *stats_attrs[] = { >>> + &anon_fault_alloc_attr.attr, >>> + &anon_fault_fallback_attr.attr, >>> + &anon_fault_fallback_charge_attr.attr, >>> + NULL, >>> +}; >>> + >>> +static struct attribute_group stats_attr_group = { >>> + .name = "stats", >>> + .attrs = stats_attrs, >>> +}; >>> + >>> static struct thpsize *thpsize_create(int order, struct kobject *parent) >>> { >>> unsigned long size = (PAGE_SIZE << order) / SZ_1K; >>> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent) >>> return ERR_PTR(ret); >>> } >>> >>> + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); >>> + if (ret) { >>> + kobject_put(&thpsize->kobj); >>> + return ERR_PTR(ret); >>> + } >>> + >>> thpsize->order = order; >>> return thpsize; >>> } >>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void) >>> */ >>> MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); >>> >>> + mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]), >>> + sizeof(unsigned long)); >> >> Personally I think it would be cleaner to allocate statically using >> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER. > > Hi Ryan, > > I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64, > > #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) > > #define MAX_PTRS_PER_PTE PTRS_PER_PTE > > #define PTRS_PER_PTE (1 << (PAGE_SHIFT - 3)) > > while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number? > > > Am I missing something? PTRS_PER_PTE is the number of PTE entries in a PTE table. 
On arm64 its as follows: PAGE_SIZE PAGE_SHIFT PTRS_PER_PTE 4K 12 512 16K 14 2048 64K 16 8192 So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE) MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have, (and its equal to PTRS_PER_PTE except for powerpc). Pretty sure the math is correct? > >> >>> + if (!mthp_stats) >>> + return -ENOMEM; >>> + >>> err = hugepage_init_sysfs(&hugepage_kobj); >>> if (err) >>> goto err_sysfs; >>> @@ -725,6 +780,8 @@ static int __init hugepage_init(void) >>> err_slab: >>> hugepage_exit_sysfs(hugepage_kobj); >>> err_sysfs: >>> + free_percpu(mthp_stats); >>> + mthp_stats = NULL; >>> return err; >>> } >>> subsys_initcall(hugepage_init); >>> @@ -880,6 +937,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, >>> folio_put(folio); >>> count_vm_event(THP_FAULT_FALLBACK); >>> count_vm_event(THP_FAULT_FALLBACK_CHARGE); >>> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); >>> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >>> return VM_FAULT_FALLBACK; >>> } >>> folio_throttle_swaprate(folio, gfp); >>> @@ -929,6 +988,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, >>> mm_inc_nr_ptes(vma->vm_mm); >>> spin_unlock(vmf->ptl); >>> count_vm_event(THP_FAULT_ALLOC); >>> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); >>> count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); >>> } >>> >>> @@ -1050,6 +1110,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) >>> folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); >>> if (unlikely(!folio)) { >>> count_vm_event(THP_FAULT_FALLBACK); >>> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); >>> return VM_FAULT_FALLBACK; >>> } >>> return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); >>> diff --git a/mm/memory.c b/mm/memory.c >>> index 649a547fe8e3..06048af7cf9a 100644 >>> --- a/mm/memory.c >>> +++ b/mm/memory.c >>> @@ -4368,6 +4368,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) >>> folio = vma_alloc_folio(gfp, order, vma, addr, true); >>> if (folio) { >>> if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { >>> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >>> folio_put(folio); >>> goto next; >>> } >>> @@ -4376,6 +4377,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) >>> return folio; >>> } >>> next: >>> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >>> order = next_order(&orders, order); >>> } >>> >>> @@ -4485,6 +4487,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) >>> >>> folio_ref_add(folio, nr_pages - 1); >>> add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); >>> + count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); >>> folio_add_new_anon_rmap(folio, vma, addr); >>> folio_add_lru_vma(folio, vma); >>> setpte: >>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c >>> index b51becf03d1e..3135b5ca2457 100644 >>> --- a/mm/page_alloc.c >>> +++ b/mm/page_alloc.c >>> @@ -5840,6 +5840,10 @@ static int page_alloc_cpu_dead(unsigned int cpu) >>> */ >>> vm_events_fold_cpu(cpu); >>> >>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE >>> + mthp_stats_fold_cpu(cpu); >>> +#endif >>> + >>> /* >>> * Zero the differential counters of the dead processor >>> * so that the vm statistics are consistent. >>
On Fri, Apr 12, 2024 at 9:56 PM Ryan Roberts <ryan.roberts@arm.com> wrote: > > On 12/04/2024 10:43, Barry Song wrote: > > On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote: > >> > >> Hi Barry, > >> > >> 2 remaining comments - otherwise looks good. (same comments I just made in the > >> v4 conversation). > >> > >> On 12/04/2024 08:37, Barry Song wrote: > >>> From: Barry Song <v-songbaohua@oppo.com> > >>> > >>> Profiling a system blindly with mTHP has become challenging due to the > >>> lack of visibility into its operations. Presenting the success rate of > >>> mTHP allocations appears to be pressing need. > >>> > >>> Recently, I've been experiencing significant difficulty debugging > >>> performance improvements and regressions without these figures. It's > >>> crucial for us to understand the true effectiveness of mTHP in real-world > >>> scenarios, especially in systems with fragmented memory. > >>> > >>> This patch establishes the framework for per-order mTHP > >>> counters. It begins by introducing the anon_fault_alloc and > >>> anon_fault_fallback counters. Additionally, to maintain consistency > >>> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks > >>> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP. > >>> Incorporating additional counters should now be straightforward as well. > >>> > >>> Signed-off-by: Barry Song <v-songbaohua@oppo.com> > >>> Cc: Chris Li <chrisl@kernel.org> > >>> Cc: David Hildenbrand <david@redhat.com> > >>> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com> > >>> Cc: Kairui Song <kasong@tencent.com> > >>> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> > >>> Cc: Peter Xu <peterx@redhat.com> > >>> Cc: Ryan Roberts <ryan.roberts@arm.com> > >>> Cc: Suren Baghdasaryan <surenb@google.com> > >>> Cc: Yosry Ahmed <yosryahmed@google.com> > >>> Cc: Yu Zhao <yuzhao@google.com> > >>> --- > >>> include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++ > >>> mm/huge_memory.c | 61 +++++++++++++++++++++++++++++++++++++++++ > >>> mm/memory.c | 3 ++ > >>> mm/page_alloc.c | 4 +++ > >>> 4 files changed, 119 insertions(+) > >>> > >>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > >>> index e896ca4760f6..c5beb54b97cb 100644 > >>> --- a/include/linux/huge_mm.h > >>> +++ b/include/linux/huge_mm.h > >>> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, > >>> enforce_sysfs, orders); > >>> } > >>> > >>> +enum mthp_stat_item { > >>> + MTHP_STAT_ANON_FAULT_ALLOC, > >>> + MTHP_STAT_ANON_FAULT_FALLBACK, > >>> + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, > >>> + __MTHP_STAT_COUNT > >>> +}; > >>> + > >>> +struct mthp_stat { > >>> + unsigned long stats[0][__MTHP_STAT_COUNT]; > >>> +}; > >>> + > >>> +extern struct mthp_stat __percpu *mthp_stats; > >>> + > >>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item) > >>> +{ > >>> + if (order <= 0 || order > PMD_ORDER || !mthp_stats) > >>> + return; > >>> + > >>> + this_cpu_inc(mthp_stats->stats[order][item]); > >>> +} > >>> + > >>> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta) > >>> +{ > >>> + if (order <= 0 || order > PMD_ORDER || !mthp_stats) > >>> + return; > >>> + > >>> + this_cpu_add(mthp_stats->stats[order][item], delta); > >>> +} > >>> + > >>> +/* > >>> + * Fold the foreign cpu mthp stats into our own. > >>> + * > >>> + * This is adding to the stats on one processor > >>> + * but keeps the global counts constant. 
> >>> + */ > >>> +static inline void mthp_stats_fold_cpu(int cpu) > >>> +{ > >>> + struct mthp_stat *fold_stat; > >>> + int i, j; > >>> + > >>> + if (!mthp_stats) > >>> + return; > >>> + fold_stat = per_cpu_ptr(mthp_stats, cpu); > >>> + for (i = 1; i <= PMD_ORDER; i++) { > >>> + for (j = 0; j < __MTHP_STAT_COUNT; j++) { > >>> + count_mthp_stats(i, j, fold_stat->stats[i][j]); > >>> + fold_stat->stats[i][j] = 0; > >>> + } > >>> + } > >>> +} > >> > >> This is a pretty horrible hack; I'm pretty sure just summing for all *possible* > >> cpus should work. > >> > >>> + > >>> #define transparent_hugepage_use_zero_page() \ > >>> (transparent_hugepage_flags & \ > >>> (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) > >>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c > >>> index dc30139590e6..21c4ac74b484 100644 > >>> --- a/mm/huge_memory.c > >>> +++ b/mm/huge_memory.c > >>> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = { > >>> .sysfs_ops = &kobj_sysfs_ops, > >>> }; > >>> > >>> +struct mthp_stat __percpu *mthp_stats; > >>> + > >>> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) > >>> +{ > >>> + unsigned long sum = 0; > >>> + int cpu; > >>> + > >>> + cpus_read_lock(); > >>> + for_each_online_cpu(cpu) { > >>> + struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu); > >>> + > >>> + sum += this->stats[order][item]; > >>> + } > >>> + cpus_read_unlock(); > >>> + > >>> + return sum; > >>> +} > >>> + > >>> +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \ > >>> +static ssize_t _name##_show(struct kobject *kobj, \ > >>> + struct kobj_attribute *attr, char *buf) \ > >>> +{ \ > >>> + int order = to_thpsize(kobj)->order; \ > >>> + \ > >>> + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ > >>> +} \ > >>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) > >>> + > >>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); > >>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); > >>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > >>> + > >>> +static struct attribute *stats_attrs[] = { > >>> + &anon_fault_alloc_attr.attr, > >>> + &anon_fault_fallback_attr.attr, > >>> + &anon_fault_fallback_charge_attr.attr, > >>> + NULL, > >>> +}; > >>> + > >>> +static struct attribute_group stats_attr_group = { > >>> + .name = "stats", > >>> + .attrs = stats_attrs, > >>> +}; > >>> + > >>> static struct thpsize *thpsize_create(int order, struct kobject *parent) > >>> { > >>> unsigned long size = (PAGE_SIZE << order) / SZ_1K; > >>> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent) > >>> return ERR_PTR(ret); > >>> } > >>> > >>> + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); > >>> + if (ret) { > >>> + kobject_put(&thpsize->kobj); > >>> + return ERR_PTR(ret); > >>> + } > >>> + > >>> thpsize->order = order; > >>> return thpsize; > >>> } > >>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void) > >>> */ > >>> MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); > >>> > >>> + mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]), > >>> + sizeof(unsigned long)); > >> > >> Personally I think it would be cleaner to allocate statically using > >> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER. > > > > Hi Ryan, > > > > I don't understand why MAX_PTRS_PER_PTE is the correct size. 
For ARM64, > > > > #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) > > > > #define MAX_PTRS_PER_PTE PTRS_PER_PTE > > > > #define PTRS_PER_PTE (1 << (PAGE_SHIFT - 3)) > > > > while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number? > > > > > > Am I missing something? > > PTRS_PER_PTE is the number of PTE entries in a PTE table. On arm64 its as follows: > > PAGE_SIZE PAGE_SHIFT PTRS_PER_PTE > 4K 12 512 > 16K 14 2048 > 64K 16 8192 > > So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE > > PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE) > > MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have, > (and its equal to PTRS_PER_PTE except for powerpc). > > Pretty sure the math is correct? I am not convinced the math is correct :-) while page size is 64KiB, the page table is as below, PMD_ORDER = L2 index bits = [41:29] = 13 != ilog2(8192) +--------+--------+--------+--------+--------+--------+--------+--------+ |63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| +--------+--------+--------+--------+--------+--------+--------+--------+ | | | | | | | | | v | | | | [15:0] in-page offset | | | +----------> [28:16] L3 index | | +--------------------------> [41:29] L2 index | +-------------------------------> [47:42] L1 index (48-bit) | [51:42] L1 index (52-bit) +-------------------------------------------------> [63] TTBR0/1 while page size is 4KiB, the page table is as below, +--------+--------+--------+--------+--------+--------+--------+--------+ |63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| +--------+--------+--------+--------+--------+--------+--------+--------+ | | | | | | | | | | | v | | | | | [11:0] in-page offset | | | | +-> [20:12] L3 index | | | +-----------> [29:21] L2 index | | +---------------------> [38:30] L1 index | +-------------------------------> [47:39] L0 index +-------------------------------------------------> [63] TTBR0/1 PMD_ORDER = L2 index bits = [29:21] = 9 = ilog2(512). You are only correct while page size = 4KiB.
On 12/04/2024 11:17, Barry Song wrote: > On Fri, Apr 12, 2024 at 9:56 PM Ryan Roberts <ryan.roberts@arm.com> wrote: >> >> On 12/04/2024 10:43, Barry Song wrote: >>> On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote: >>>> >>>> Hi Barry, >>>> >>>> 2 remaining comments - otherwise looks good. (same comments I just made in the >>>> v4 conversation). >>>> >>>> On 12/04/2024 08:37, Barry Song wrote: >>>>> From: Barry Song <v-songbaohua@oppo.com> >>>>> >>>>> Profiling a system blindly with mTHP has become challenging due to the >>>>> lack of visibility into its operations. Presenting the success rate of >>>>> mTHP allocations appears to be pressing need. >>>>> >>>>> Recently, I've been experiencing significant difficulty debugging >>>>> performance improvements and regressions without these figures. It's >>>>> crucial for us to understand the true effectiveness of mTHP in real-world >>>>> scenarios, especially in systems with fragmented memory. >>>>> >>>>> This patch establishes the framework for per-order mTHP >>>>> counters. It begins by introducing the anon_fault_alloc and >>>>> anon_fault_fallback counters. Additionally, to maintain consistency >>>>> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks >>>>> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP. >>>>> Incorporating additional counters should now be straightforward as well. >>>>> >>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com> >>>>> Cc: Chris Li <chrisl@kernel.org> >>>>> Cc: David Hildenbrand <david@redhat.com> >>>>> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com> >>>>> Cc: Kairui Song <kasong@tencent.com> >>>>> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> >>>>> Cc: Peter Xu <peterx@redhat.com> >>>>> Cc: Ryan Roberts <ryan.roberts@arm.com> >>>>> Cc: Suren Baghdasaryan <surenb@google.com> >>>>> Cc: Yosry Ahmed <yosryahmed@google.com> >>>>> Cc: Yu Zhao <yuzhao@google.com> >>>>> --- >>>>> include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++ >>>>> mm/huge_memory.c | 61 +++++++++++++++++++++++++++++++++++++++++ >>>>> mm/memory.c | 3 ++ >>>>> mm/page_alloc.c | 4 +++ >>>>> 4 files changed, 119 insertions(+) >>>>> >>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h >>>>> index e896ca4760f6..c5beb54b97cb 100644 >>>>> --- a/include/linux/huge_mm.h >>>>> +++ b/include/linux/huge_mm.h >>>>> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, >>>>> enforce_sysfs, orders); >>>>> } >>>>> >>>>> +enum mthp_stat_item { >>>>> + MTHP_STAT_ANON_FAULT_ALLOC, >>>>> + MTHP_STAT_ANON_FAULT_FALLBACK, >>>>> + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, >>>>> + __MTHP_STAT_COUNT >>>>> +}; >>>>> + >>>>> +struct mthp_stat { >>>>> + unsigned long stats[0][__MTHP_STAT_COUNT]; >>>>> +}; >>>>> + >>>>> +extern struct mthp_stat __percpu *mthp_stats; >>>>> + >>>>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item) >>>>> +{ >>>>> + if (order <= 0 || order > PMD_ORDER || !mthp_stats) >>>>> + return; >>>>> + >>>>> + this_cpu_inc(mthp_stats->stats[order][item]); >>>>> +} >>>>> + >>>>> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta) >>>>> +{ >>>>> + if (order <= 0 || order > PMD_ORDER || !mthp_stats) >>>>> + return; >>>>> + >>>>> + this_cpu_add(mthp_stats->stats[order][item], delta); >>>>> +} >>>>> + >>>>> +/* >>>>> + * Fold the foreign cpu mthp stats into our own. >>>>> + * >>>>> + * This is adding to the stats on one processor >>>>> + * but keeps the global counts constant. 
>>>>> + */ >>>>> +static inline void mthp_stats_fold_cpu(int cpu) >>>>> +{ >>>>> + struct mthp_stat *fold_stat; >>>>> + int i, j; >>>>> + >>>>> + if (!mthp_stats) >>>>> + return; >>>>> + fold_stat = per_cpu_ptr(mthp_stats, cpu); >>>>> + for (i = 1; i <= PMD_ORDER; i++) { >>>>> + for (j = 0; j < __MTHP_STAT_COUNT; j++) { >>>>> + count_mthp_stats(i, j, fold_stat->stats[i][j]); >>>>> + fold_stat->stats[i][j] = 0; >>>>> + } >>>>> + } >>>>> +} >>>> >>>> This is a pretty horrible hack; I'm pretty sure just summing for all *possible* >>>> cpus should work. >>>> >>>>> + >>>>> #define transparent_hugepage_use_zero_page() \ >>>>> (transparent_hugepage_flags & \ >>>>> (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) >>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >>>>> index dc30139590e6..21c4ac74b484 100644 >>>>> --- a/mm/huge_memory.c >>>>> +++ b/mm/huge_memory.c >>>>> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = { >>>>> .sysfs_ops = &kobj_sysfs_ops, >>>>> }; >>>>> >>>>> +struct mthp_stat __percpu *mthp_stats; >>>>> + >>>>> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) >>>>> +{ >>>>> + unsigned long sum = 0; >>>>> + int cpu; >>>>> + >>>>> + cpus_read_lock(); >>>>> + for_each_online_cpu(cpu) { >>>>> + struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu); >>>>> + >>>>> + sum += this->stats[order][item]; >>>>> + } >>>>> + cpus_read_unlock(); >>>>> + >>>>> + return sum; >>>>> +} >>>>> + >>>>> +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \ >>>>> +static ssize_t _name##_show(struct kobject *kobj, \ >>>>> + struct kobj_attribute *attr, char *buf) \ >>>>> +{ \ >>>>> + int order = to_thpsize(kobj)->order; \ >>>>> + \ >>>>> + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ >>>>> +} \ >>>>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) >>>>> + >>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); >>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); >>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >>>>> + >>>>> +static struct attribute *stats_attrs[] = { >>>>> + &anon_fault_alloc_attr.attr, >>>>> + &anon_fault_fallback_attr.attr, >>>>> + &anon_fault_fallback_charge_attr.attr, >>>>> + NULL, >>>>> +}; >>>>> + >>>>> +static struct attribute_group stats_attr_group = { >>>>> + .name = "stats", >>>>> + .attrs = stats_attrs, >>>>> +}; >>>>> + >>>>> static struct thpsize *thpsize_create(int order, struct kobject *parent) >>>>> { >>>>> unsigned long size = (PAGE_SIZE << order) / SZ_1K; >>>>> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent) >>>>> return ERR_PTR(ret); >>>>> } >>>>> >>>>> + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); >>>>> + if (ret) { >>>>> + kobject_put(&thpsize->kobj); >>>>> + return ERR_PTR(ret); >>>>> + } >>>>> + >>>>> thpsize->order = order; >>>>> return thpsize; >>>>> } >>>>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void) >>>>> */ >>>>> MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); >>>>> >>>>> + mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]), >>>>> + sizeof(unsigned long)); >>>> >>>> Personally I think it would be cleaner to allocate statically using >>>> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER. >>> >>> Hi Ryan, >>> >>> I don't understand why MAX_PTRS_PER_PTE is the correct size. 
For ARM64, >>> >>> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) >>> >>> #define MAX_PTRS_PER_PTE PTRS_PER_PTE >>> >>> #define PTRS_PER_PTE (1 << (PAGE_SHIFT - 3)) >>> >>> while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number? >>> >>> >>> Am I missing something? >> >> PTRS_PER_PTE is the number of PTE entries in a PTE table. On arm64 its as follows: >> >> PAGE_SIZE PAGE_SHIFT PTRS_PER_PTE >> 4K 12 512 >> 16K 14 2048 >> 64K 16 8192 >> >> So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE >> >> PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE) >> >> MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have, >> (and its equal to PTRS_PER_PTE except for powerpc). >> >> Pretty sure the math is correct? > > I am not convinced the math is correct :-) > > while page size is 64KiB, the page table is as below, > PMD_ORDER = L2 index bits = [41:29] = 13 != ilog2(8192) 1 << 13 = 8192 Right? So: ilog2(8192) = 13 What's wrong with that? I even checked in Python to make sure I'm not going mad: >>> import math >>> math.log2(8192) 13.0 > > > +--------+--------+--------+--------+--------+--------+--------+--------+ > |63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| > +--------+--------+--------+--------+--------+--------+--------+--------+ > | | | | | > | | | | v > | | | | [15:0] in-page offset > | | | +----------> [28:16] L3 index > | | +--------------------------> [41:29] L2 index > | +-------------------------------> [47:42] L1 index (48-bit) > | [51:42] L1 index (52-bit) > +-------------------------------------------------> [63] TTBR0/1 > > while page size is 4KiB, the page table is as below, > > +--------+--------+--------+--------+--------+--------+--------+--------+ > |63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| > +--------+--------+--------+--------+--------+--------+--------+--------+ > | | | | | | > | | | | | v > | | | | | [11:0] in-page offset > | | | | +-> [20:12] L3 index > | | | +-----------> [29:21] L2 index > | | +---------------------> [38:30] L1 index > | +-------------------------------> [47:39] L0 index > +-------------------------------------------------> [63] TTBR0/1 > > PMD_ORDER = L2 index bits = [29:21] = 9 = ilog2(512). > > You are only correct while page size = 4KiB. > > > >
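For reference, the arithmetic the two are converging on, worked for the three arm64 granule sizes using the definitions quoted above (PTRS_PER_PTE = 1 << (PAGE_SHIFT - 3), PMD_ORDER = PMD_SHIFT - PAGE_SHIFT):

granule  PAGE_SHIFT  PTRS_PER_PTE  PMD_SHIFT  PMD_ORDER  ilog2(PTRS_PER_PTE)
4K       12          512           21         9          9
16K      14          2048          25         11         11
64K      16          8192          29         13         13

So PMD_ORDER == ilog2(PTRS_PER_PTE) for every granule, which is why sizing the array by ilog2(MAX_PTRS_PER_PTE) is sufficient: it is an upper bound, and exact everywhere except powerpc, where MAX_PTRS_PER_PTE can exceed PTRS_PER_PTE as Ryan notes above.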
On Fri, Apr 12, 2024 at 10:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote: > > On 12/04/2024 11:17, Barry Song wrote: > > On Fri, Apr 12, 2024 at 9:56 PM Ryan Roberts <ryan.roberts@arm.com> wrote: > >> > >> On 12/04/2024 10:43, Barry Song wrote: > >>> On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote: > >>>> > >>>> Hi Barry, > >>>> > >>>> 2 remaining comments - otherwise looks good. (same comments I just made in the > >>>> v4 conversation). > >>>> > >>>> On 12/04/2024 08:37, Barry Song wrote: > >>>>> From: Barry Song <v-songbaohua@oppo.com> > >>>>> > >>>>> Profiling a system blindly with mTHP has become challenging due to the > >>>>> lack of visibility into its operations. Presenting the success rate of > >>>>> mTHP allocations appears to be pressing need. > >>>>> > >>>>> Recently, I've been experiencing significant difficulty debugging > >>>>> performance improvements and regressions without these figures. It's > >>>>> crucial for us to understand the true effectiveness of mTHP in real-world > >>>>> scenarios, especially in systems with fragmented memory. > >>>>> > >>>>> This patch establishes the framework for per-order mTHP > >>>>> counters. It begins by introducing the anon_fault_alloc and > >>>>> anon_fault_fallback counters. Additionally, to maintain consistency > >>>>> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks > >>>>> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP. > >>>>> Incorporating additional counters should now be straightforward as well. > >>>>> > >>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com> > >>>>> Cc: Chris Li <chrisl@kernel.org> > >>>>> Cc: David Hildenbrand <david@redhat.com> > >>>>> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com> > >>>>> Cc: Kairui Song <kasong@tencent.com> > >>>>> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> > >>>>> Cc: Peter Xu <peterx@redhat.com> > >>>>> Cc: Ryan Roberts <ryan.roberts@arm.com> > >>>>> Cc: Suren Baghdasaryan <surenb@google.com> > >>>>> Cc: Yosry Ahmed <yosryahmed@google.com> > >>>>> Cc: Yu Zhao <yuzhao@google.com> > >>>>> --- > >>>>> include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++ > >>>>> mm/huge_memory.c | 61 +++++++++++++++++++++++++++++++++++++++++ > >>>>> mm/memory.c | 3 ++ > >>>>> mm/page_alloc.c | 4 +++ > >>>>> 4 files changed, 119 insertions(+) > >>>>> > >>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > >>>>> index e896ca4760f6..c5beb54b97cb 100644 > >>>>> --- a/include/linux/huge_mm.h > >>>>> +++ b/include/linux/huge_mm.h > >>>>> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, > >>>>> enforce_sysfs, orders); > >>>>> } > >>>>> > >>>>> +enum mthp_stat_item { > >>>>> + MTHP_STAT_ANON_FAULT_ALLOC, > >>>>> + MTHP_STAT_ANON_FAULT_FALLBACK, > >>>>> + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, > >>>>> + __MTHP_STAT_COUNT > >>>>> +}; > >>>>> + > >>>>> +struct mthp_stat { > >>>>> + unsigned long stats[0][__MTHP_STAT_COUNT]; > >>>>> +}; > >>>>> + > >>>>> +extern struct mthp_stat __percpu *mthp_stats; > >>>>> + > >>>>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item) > >>>>> +{ > >>>>> + if (order <= 0 || order > PMD_ORDER || !mthp_stats) > >>>>> + return; > >>>>> + > >>>>> + this_cpu_inc(mthp_stats->stats[order][item]); > >>>>> +} > >>>>> + > >>>>> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta) > >>>>> +{ > >>>>> + if (order <= 0 || order > PMD_ORDER || !mthp_stats) > >>>>> + return; > >>>>> + > >>>>> + 
this_cpu_add(mthp_stats->stats[order][item], delta); > >>>>> +} > >>>>> + > >>>>> +/* > >>>>> + * Fold the foreign cpu mthp stats into our own. > >>>>> + * > >>>>> + * This is adding to the stats on one processor > >>>>> + * but keeps the global counts constant. > >>>>> + */ > >>>>> +static inline void mthp_stats_fold_cpu(int cpu) > >>>>> +{ > >>>>> + struct mthp_stat *fold_stat; > >>>>> + int i, j; > >>>>> + > >>>>> + if (!mthp_stats) > >>>>> + return; > >>>>> + fold_stat = per_cpu_ptr(mthp_stats, cpu); > >>>>> + for (i = 1; i <= PMD_ORDER; i++) { > >>>>> + for (j = 0; j < __MTHP_STAT_COUNT; j++) { > >>>>> + count_mthp_stats(i, j, fold_stat->stats[i][j]); > >>>>> + fold_stat->stats[i][j] = 0; > >>>>> + } > >>>>> + } > >>>>> +} > >>>> > >>>> This is a pretty horrible hack; I'm pretty sure just summing for all *possible* > >>>> cpus should work. > >>>> > >>>>> + > >>>>> #define transparent_hugepage_use_zero_page() \ > >>>>> (transparent_hugepage_flags & \ > >>>>> (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) > >>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c > >>>>> index dc30139590e6..21c4ac74b484 100644 > >>>>> --- a/mm/huge_memory.c > >>>>> +++ b/mm/huge_memory.c > >>>>> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = { > >>>>> .sysfs_ops = &kobj_sysfs_ops, > >>>>> }; > >>>>> > >>>>> +struct mthp_stat __percpu *mthp_stats; > >>>>> + > >>>>> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) > >>>>> +{ > >>>>> + unsigned long sum = 0; > >>>>> + int cpu; > >>>>> + > >>>>> + cpus_read_lock(); > >>>>> + for_each_online_cpu(cpu) { > >>>>> + struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu); > >>>>> + > >>>>> + sum += this->stats[order][item]; > >>>>> + } > >>>>> + cpus_read_unlock(); > >>>>> + > >>>>> + return sum; > >>>>> +} > >>>>> + > >>>>> +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \ > >>>>> +static ssize_t _name##_show(struct kobject *kobj, \ > >>>>> + struct kobj_attribute *attr, char *buf) \ > >>>>> +{ \ > >>>>> + int order = to_thpsize(kobj)->order; \ > >>>>> + \ > >>>>> + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ > >>>>> +} \ > >>>>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) > >>>>> + > >>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); > >>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); > >>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > >>>>> + > >>>>> +static struct attribute *stats_attrs[] = { > >>>>> + &anon_fault_alloc_attr.attr, > >>>>> + &anon_fault_fallback_attr.attr, > >>>>> + &anon_fault_fallback_charge_attr.attr, > >>>>> + NULL, > >>>>> +}; > >>>>> + > >>>>> +static struct attribute_group stats_attr_group = { > >>>>> + .name = "stats", > >>>>> + .attrs = stats_attrs, > >>>>> +}; > >>>>> + > >>>>> static struct thpsize *thpsize_create(int order, struct kobject *parent) > >>>>> { > >>>>> unsigned long size = (PAGE_SIZE << order) / SZ_1K; > >>>>> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent) > >>>>> return ERR_PTR(ret); > >>>>> } > >>>>> > >>>>> + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); > >>>>> + if (ret) { > >>>>> + kobject_put(&thpsize->kobj); > >>>>> + return ERR_PTR(ret); > >>>>> + } > >>>>> + > >>>>> thpsize->order = order; > >>>>> return thpsize; > >>>>> } > >>>>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void) > >>>>> */ > >>>>> MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); > >>>>> > >>>>> + 
mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]), > >>>>> + sizeof(unsigned long)); > >>>> > >>>> Personally I think it would be cleaner to allocate statically using > >>>> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER. > >>> > >>> Hi Ryan, > >>> > >>> I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64, > >>> > >>> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) > >>> > >>> #define MAX_PTRS_PER_PTE PTRS_PER_PTE > >>> > >>> #define PTRS_PER_PTE (1 << (PAGE_SHIFT - 3)) > >>> > >>> while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number? > >>> > >>> > >>> Am I missing something? > >> > >> PTRS_PER_PTE is the number of PTE entries in a PTE table. On arm64 its as follows: > >> > >> PAGE_SIZE PAGE_SHIFT PTRS_PER_PTE > >> 4K 12 512 > >> 16K 14 2048 > >> 64K 16 8192 > >> > >> So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE > >> > >> PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE) > >> > >> MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have, > >> (and its equal to PTRS_PER_PTE except for powerpc). > >> > >> Pretty sure the math is correct? > > > > I am not convinced the math is correct :-) > > > > while page size is 64KiB, the page table is as below, > > PMD_ORDER = L2 index bits = [41:29] = 13 != ilog2(8192) > > 1 << 13 = 8192 > > Right? So: > > ilog2(8192) = 13 > > What's wrong with that? > > I even checked in Python to make sure I'm not going mad: > > >>> import math > >>> math.log2(8192) > 13.0 You're correct. My mind fixated on the '16' in the line '64K 16 8192'. I mistakenly thought ilog2(8192) equals 16. Apologies for the confusion. > > > > > > > +--------+--------+--------+--------+--------+--------+--------+--------+ > > |63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| > > +--------+--------+--------+--------+--------+--------+--------+--------+ > > | | | | | > > | | | | v > > | | | | [15:0] in-page offset > > | | | +----------> [28:16] L3 index > > | | +--------------------------> [41:29] L2 index > > | +-------------------------------> [47:42] L1 index (48-bit) > > | [51:42] L1 index (52-bit) > > +-------------------------------------------------> [63] TTBR0/1 > > > > while page size is 4KiB, the page table is as below, > > > > +--------+--------+--------+--------+--------+--------+--------+--------+ > > |63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| > > +--------+--------+--------+--------+--------+--------+--------+--------+ > > | | | | | | > > | | | | | v > > | | | | | [11:0] in-page offset > > | | | | +-> [20:12] L3 index > > | | | +-----------> [29:21] L2 index > > | | +---------------------> [38:30] L1 index > > | +-------------------------------> [47:39] L0 index > > +-------------------------------------------------> [63] TTBR0/1 > > > > PMD_ORDER = L2 index bits = [29:21] = 9 = ilog2(512). > > > > You are only correct while page size = 4KiB. > > > > > > > > >
On 12/04/2024 11:29, Barry Song wrote: > On Fri, Apr 12, 2024 at 10:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote: >> >> On 12/04/2024 11:17, Barry Song wrote: >>> On Fri, Apr 12, 2024 at 9:56 PM Ryan Roberts <ryan.roberts@arm.com> wrote: >>>> >>>> On 12/04/2024 10:43, Barry Song wrote: >>>>> On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote: >>>>>> >>>>>> Hi Barry, >>>>>> >>>>>> 2 remaining comments - otherwise looks good. (same comments I just made in the >>>>>> v4 conversation). >>>>>> >>>>>> On 12/04/2024 08:37, Barry Song wrote: >>>>>>> From: Barry Song <v-songbaohua@oppo.com> >>>>>>> >>>>>>> Profiling a system blindly with mTHP has become challenging due to the >>>>>>> lack of visibility into its operations. Presenting the success rate of >>>>>>> mTHP allocations appears to be pressing need. >>>>>>> >>>>>>> Recently, I've been experiencing significant difficulty debugging >>>>>>> performance improvements and regressions without these figures. It's >>>>>>> crucial for us to understand the true effectiveness of mTHP in real-world >>>>>>> scenarios, especially in systems with fragmented memory. >>>>>>> >>>>>>> This patch establishes the framework for per-order mTHP >>>>>>> counters. It begins by introducing the anon_fault_alloc and >>>>>>> anon_fault_fallback counters. Additionally, to maintain consistency >>>>>>> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks >>>>>>> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP. >>>>>>> Incorporating additional counters should now be straightforward as well. >>>>>>> >>>>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com> >>>>>>> Cc: Chris Li <chrisl@kernel.org> >>>>>>> Cc: David Hildenbrand <david@redhat.com> >>>>>>> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com> >>>>>>> Cc: Kairui Song <kasong@tencent.com> >>>>>>> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> >>>>>>> Cc: Peter Xu <peterx@redhat.com> >>>>>>> Cc: Ryan Roberts <ryan.roberts@arm.com> >>>>>>> Cc: Suren Baghdasaryan <surenb@google.com> >>>>>>> Cc: Yosry Ahmed <yosryahmed@google.com> >>>>>>> Cc: Yu Zhao <yuzhao@google.com> >>>>>>> --- >>>>>>> include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++ >>>>>>> mm/huge_memory.c | 61 +++++++++++++++++++++++++++++++++++++++++ >>>>>>> mm/memory.c | 3 ++ >>>>>>> mm/page_alloc.c | 4 +++ >>>>>>> 4 files changed, 119 insertions(+) >>>>>>> >>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h >>>>>>> index e896ca4760f6..c5beb54b97cb 100644 >>>>>>> --- a/include/linux/huge_mm.h >>>>>>> +++ b/include/linux/huge_mm.h >>>>>>> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, >>>>>>> enforce_sysfs, orders); >>>>>>> } >>>>>>> >>>>>>> +enum mthp_stat_item { >>>>>>> + MTHP_STAT_ANON_FAULT_ALLOC, >>>>>>> + MTHP_STAT_ANON_FAULT_FALLBACK, >>>>>>> + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, >>>>>>> + __MTHP_STAT_COUNT >>>>>>> +}; >>>>>>> + >>>>>>> +struct mthp_stat { >>>>>>> + unsigned long stats[0][__MTHP_STAT_COUNT]; >>>>>>> +}; >>>>>>> + >>>>>>> +extern struct mthp_stat __percpu *mthp_stats; >>>>>>> + >>>>>>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item) >>>>>>> +{ >>>>>>> + if (order <= 0 || order > PMD_ORDER || !mthp_stats) >>>>>>> + return; >>>>>>> + >>>>>>> + this_cpu_inc(mthp_stats->stats[order][item]); >>>>>>> +} >>>>>>> + >>>>>>> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta) >>>>>>> +{ >>>>>>> + if (order <= 0 || order > PMD_ORDER || !mthp_stats) 
>>>>>>> + return; >>>>>>> + >>>>>>> + this_cpu_add(mthp_stats->stats[order][item], delta); >>>>>>> +} >>>>>>> + >>>>>>> +/* >>>>>>> + * Fold the foreign cpu mthp stats into our own. >>>>>>> + * >>>>>>> + * This is adding to the stats on one processor >>>>>>> + * but keeps the global counts constant. >>>>>>> + */ >>>>>>> +static inline void mthp_stats_fold_cpu(int cpu) >>>>>>> +{ >>>>>>> + struct mthp_stat *fold_stat; >>>>>>> + int i, j; >>>>>>> + >>>>>>> + if (!mthp_stats) >>>>>>> + return; >>>>>>> + fold_stat = per_cpu_ptr(mthp_stats, cpu); >>>>>>> + for (i = 1; i <= PMD_ORDER; i++) { >>>>>>> + for (j = 0; j < __MTHP_STAT_COUNT; j++) { >>>>>>> + count_mthp_stats(i, j, fold_stat->stats[i][j]); >>>>>>> + fold_stat->stats[i][j] = 0; >>>>>>> + } >>>>>>> + } >>>>>>> +} >>>>>> >>>>>> This is a pretty horrible hack; I'm pretty sure just summing for all *possible* >>>>>> cpus should work. >>>>>> >>>>>>> + >>>>>>> #define transparent_hugepage_use_zero_page() \ >>>>>>> (transparent_hugepage_flags & \ >>>>>>> (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) >>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >>>>>>> index dc30139590e6..21c4ac74b484 100644 >>>>>>> --- a/mm/huge_memory.c >>>>>>> +++ b/mm/huge_memory.c >>>>>>> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = { >>>>>>> .sysfs_ops = &kobj_sysfs_ops, >>>>>>> }; >>>>>>> >>>>>>> +struct mthp_stat __percpu *mthp_stats; >>>>>>> + >>>>>>> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) >>>>>>> +{ >>>>>>> + unsigned long sum = 0; >>>>>>> + int cpu; >>>>>>> + >>>>>>> + cpus_read_lock(); >>>>>>> + for_each_online_cpu(cpu) { >>>>>>> + struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu); >>>>>>> + >>>>>>> + sum += this->stats[order][item]; >>>>>>> + } >>>>>>> + cpus_read_unlock(); >>>>>>> + >>>>>>> + return sum; >>>>>>> +} >>>>>>> + >>>>>>> +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \ >>>>>>> +static ssize_t _name##_show(struct kobject *kobj, \ >>>>>>> + struct kobj_attribute *attr, char *buf) \ >>>>>>> +{ \ >>>>>>> + int order = to_thpsize(kobj)->order; \ >>>>>>> + \ >>>>>>> + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ >>>>>>> +} \ >>>>>>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) >>>>>>> + >>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); >>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); >>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >>>>>>> + >>>>>>> +static struct attribute *stats_attrs[] = { >>>>>>> + &anon_fault_alloc_attr.attr, >>>>>>> + &anon_fault_fallback_attr.attr, >>>>>>> + &anon_fault_fallback_charge_attr.attr, >>>>>>> + NULL, >>>>>>> +}; >>>>>>> + >>>>>>> +static struct attribute_group stats_attr_group = { >>>>>>> + .name = "stats", >>>>>>> + .attrs = stats_attrs, >>>>>>> +}; >>>>>>> + >>>>>>> static struct thpsize *thpsize_create(int order, struct kobject *parent) >>>>>>> { >>>>>>> unsigned long size = (PAGE_SIZE << order) / SZ_1K; >>>>>>> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent) >>>>>>> return ERR_PTR(ret); >>>>>>> } >>>>>>> >>>>>>> + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); >>>>>>> + if (ret) { >>>>>>> + kobject_put(&thpsize->kobj); >>>>>>> + return ERR_PTR(ret); >>>>>>> + } >>>>>>> + >>>>>>> thpsize->order = order; >>>>>>> return thpsize; >>>>>>> } >>>>>>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void) >>>>>>> */ >>>>>>> 
MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); >>>>>>> >>>>>>> + mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]), >>>>>>> + sizeof(unsigned long)); >>>>>> >>>>>> Personally I think it would be cleaner to allocate statically using >>>>>> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER. >>>>> >>>>> Hi Ryan, >>>>> >>>>> I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64, >>>>> >>>>> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) >>>>> >>>>> #define MAX_PTRS_PER_PTE PTRS_PER_PTE >>>>> >>>>> #define PTRS_PER_PTE (1 << (PAGE_SHIFT - 3)) >>>>> >>>>> while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number? >>>>> >>>>> >>>>> Am I missing something? >>>> >>>> PTRS_PER_PTE is the number of PTE entries in a PTE table. On arm64 its as follows: >>>> >>>> PAGE_SIZE PAGE_SHIFT PTRS_PER_PTE >>>> 4K 12 512 >>>> 16K 14 2048 >>>> 64K 16 8192 >>>> >>>> So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE >>>> >>>> PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE) >>>> >>>> MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have, >>>> (and its equal to PTRS_PER_PTE except for powerpc). >>>> >>>> Pretty sure the math is correct? >>> >>> I am not convinced the math is correct :-) >>> >>> while page size is 64KiB, the page table is as below, >>> PMD_ORDER = L2 index bits = [41:29] = 13 != ilog2(8192) >> >> 1 << 13 = 8192 >> >> Right? So: >> >> ilog2(8192) = 13 >> >> What's wrong with that? >> >> I even checked in Python to make sure I'm not going mad: >> >>>>> import math >>>>> math.log2(8192) >> 13.0 > > You're correct. My mind fixated on the '16' in the line '64K 16 8192'. > I mistakenly thought ilog2(8192) equals 16. Apologies for the confusion. No worries! We got there in the end :) Of course my suggestion relies on being able to get a compile-time constant from ilog2(MAX_PTRS_PER_PTE). I think that should work, right? > >> >>> >>> >>> +--------+--------+--------+--------+--------+--------+--------+--------+ >>> |63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| >>> +--------+--------+--------+--------+--------+--------+--------+--------+ >>> | | | | | >>> | | | | v >>> | | | | [15:0] in-page offset >>> | | | +----------> [28:16] L3 index >>> | | +--------------------------> [41:29] L2 index >>> | +-------------------------------> [47:42] L1 index (48-bit) >>> | [51:42] L1 index (52-bit) >>> +-------------------------------------------------> [63] TTBR0/1 >>> >>> while page size is 4KiB, the page table is as below, >>> >>> +--------+--------+--------+--------+--------+--------+--------+--------+ >>> |63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| >>> +--------+--------+--------+--------+--------+--------+--------+--------+ >>> | | | | | | >>> | | | | | v >>> | | | | | [11:0] in-page offset >>> | | | | +-> [20:12] L3 index >>> | | | +-----------> [29:21] L2 index >>> | | +---------------------> [38:30] L1 index >>> | +-------------------------------> [47:39] L0 index >>> +-------------------------------------------------> [63] TTBR0/1 >>> >>> PMD_ORDER = L2 index bits = [29:21] = 9 = ilog2(512). >>> >>> You are only correct while page size = 4KiB. >>> >>> >>> >>> >>
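On the compile-time-constant question raised above: the kernel's ilog2() in <linux/log2.h> resolves to const_ilog2() (a large ternary chain) when its argument is a compile-time constant, which is what allows it to size a static array. A cut-down userspace sketch of the same idea, assuming the arm64 64K-page value of MAX_PTRS_PER_PTE quoted in the thread; the CONST_ILOG2 macro here is only an illustration, not the kernel's:

#include <stdio.h>

#define MAX_PTRS_PER_PTE 8192UL	/* assumed: arm64 64K-page value from the thread */

/* Constant-expression log2 for the power-of-two range we care about. */
#define CONST_ILOG2(n)				\
	((n) >= 8192 ? 13 : (n) >= 4096 ? 12 :	\
	 (n) >= 2048 ? 11 : (n) >= 1024 ? 10 :	\
	 (n) >=  512 ?  9 : 0)

enum mthp_stat_item {
	MTHP_STAT_ANON_FAULT_ALLOC,
	MTHP_STAT_ANON_FAULT_FALLBACK,
	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
	__MTHP_STAT_COUNT
};

/* The dimension is a constant expression, so no runtime allocation is needed. */
static unsigned long stats[CONST_ILOG2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];

int main(void)
{
	printf("orders tracked: 0..%zu, items per order: %d\n",
	       sizeof(stats) / sizeof(stats[0]) - 1, __MTHP_STAT_COUNT);
	return 0;
}

The array ends up [14][3], covering orders 0..13, which is what the stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT] sizing in the rework below provides.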
On Fri, Apr 12, 2024 at 10:38 PM Ryan Roberts <ryan.roberts@arm.com> wrote: > > On 12/04/2024 11:29, Barry Song wrote: > > On Fri, Apr 12, 2024 at 10:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote: > >> > >> On 12/04/2024 11:17, Barry Song wrote: > >>> On Fri, Apr 12, 2024 at 9:56 PM Ryan Roberts <ryan.roberts@arm.com> wrote: > >>>> > >>>> On 12/04/2024 10:43, Barry Song wrote: > >>>>> On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote: > >>>>>> > >>>>>> Hi Barry, > >>>>>> > >>>>>> 2 remaining comments - otherwise looks good. (same comments I just made in the > >>>>>> v4 conversation). > >>>>>> > >>>>>> On 12/04/2024 08:37, Barry Song wrote: > >>>>>>> From: Barry Song <v-songbaohua@oppo.com> > >>>>>>> > >>>>>>> Profiling a system blindly with mTHP has become challenging due to the > >>>>>>> lack of visibility into its operations. Presenting the success rate of > >>>>>>> mTHP allocations appears to be pressing need. > >>>>>>> > >>>>>>> Recently, I've been experiencing significant difficulty debugging > >>>>>>> performance improvements and regressions without these figures. It's > >>>>>>> crucial for us to understand the true effectiveness of mTHP in real-world > >>>>>>> scenarios, especially in systems with fragmented memory. > >>>>>>> > >>>>>>> This patch establishes the framework for per-order mTHP > >>>>>>> counters. It begins by introducing the anon_fault_alloc and > >>>>>>> anon_fault_fallback counters. Additionally, to maintain consistency > >>>>>>> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks > >>>>>>> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP. > >>>>>>> Incorporating additional counters should now be straightforward as well. > >>>>>>> > >>>>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com> > >>>>>>> Cc: Chris Li <chrisl@kernel.org> > >>>>>>> Cc: David Hildenbrand <david@redhat.com> > >>>>>>> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com> > >>>>>>> Cc: Kairui Song <kasong@tencent.com> > >>>>>>> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> > >>>>>>> Cc: Peter Xu <peterx@redhat.com> > >>>>>>> Cc: Ryan Roberts <ryan.roberts@arm.com> > >>>>>>> Cc: Suren Baghdasaryan <surenb@google.com> > >>>>>>> Cc: Yosry Ahmed <yosryahmed@google.com> > >>>>>>> Cc: Yu Zhao <yuzhao@google.com> > >>>>>>> --- > >>>>>>> include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++ > >>>>>>> mm/huge_memory.c | 61 +++++++++++++++++++++++++++++++++++++++++ > >>>>>>> mm/memory.c | 3 ++ > >>>>>>> mm/page_alloc.c | 4 +++ > >>>>>>> 4 files changed, 119 insertions(+) > >>>>>>> > >>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > >>>>>>> index e896ca4760f6..c5beb54b97cb 100644 > >>>>>>> --- a/include/linux/huge_mm.h > >>>>>>> +++ b/include/linux/huge_mm.h > >>>>>>> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, > >>>>>>> enforce_sysfs, orders); > >>>>>>> } > >>>>>>> > >>>>>>> +enum mthp_stat_item { > >>>>>>> + MTHP_STAT_ANON_FAULT_ALLOC, > >>>>>>> + MTHP_STAT_ANON_FAULT_FALLBACK, > >>>>>>> + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, > >>>>>>> + __MTHP_STAT_COUNT > >>>>>>> +}; > >>>>>>> + > >>>>>>> +struct mthp_stat { > >>>>>>> + unsigned long stats[0][__MTHP_STAT_COUNT]; > >>>>>>> +}; > >>>>>>> + > >>>>>>> +extern struct mthp_stat __percpu *mthp_stats; > >>>>>>> + > >>>>>>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item) > >>>>>>> +{ > >>>>>>> + if (order <= 0 || order > PMD_ORDER || !mthp_stats) > >>>>>>> + return; > >>>>>>> + > >>>>>>> + 
this_cpu_inc(mthp_stats->stats[order][item]); > >>>>>>> +} > >>>>>>> + > >>>>>>> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta) > >>>>>>> +{ > >>>>>>> + if (order <= 0 || order > PMD_ORDER || !mthp_stats) > >>>>>>> + return; > >>>>>>> + > >>>>>>> + this_cpu_add(mthp_stats->stats[order][item], delta); > >>>>>>> +} > >>>>>>> + > >>>>>>> +/* > >>>>>>> + * Fold the foreign cpu mthp stats into our own. > >>>>>>> + * > >>>>>>> + * This is adding to the stats on one processor > >>>>>>> + * but keeps the global counts constant. > >>>>>>> + */ > >>>>>>> +static inline void mthp_stats_fold_cpu(int cpu) > >>>>>>> +{ > >>>>>>> + struct mthp_stat *fold_stat; > >>>>>>> + int i, j; > >>>>>>> + > >>>>>>> + if (!mthp_stats) > >>>>>>> + return; > >>>>>>> + fold_stat = per_cpu_ptr(mthp_stats, cpu); > >>>>>>> + for (i = 1; i <= PMD_ORDER; i++) { > >>>>>>> + for (j = 0; j < __MTHP_STAT_COUNT; j++) { > >>>>>>> + count_mthp_stats(i, j, fold_stat->stats[i][j]); > >>>>>>> + fold_stat->stats[i][j] = 0; > >>>>>>> + } > >>>>>>> + } > >>>>>>> +} > >>>>>> > >>>>>> This is a pretty horrible hack; I'm pretty sure just summing for all *possible* > >>>>>> cpus should work. > >>>>>> > >>>>>>> + > >>>>>>> #define transparent_hugepage_use_zero_page() \ > >>>>>>> (transparent_hugepage_flags & \ > >>>>>>> (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) > >>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c > >>>>>>> index dc30139590e6..21c4ac74b484 100644 > >>>>>>> --- a/mm/huge_memory.c > >>>>>>> +++ b/mm/huge_memory.c > >>>>>>> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = { > >>>>>>> .sysfs_ops = &kobj_sysfs_ops, > >>>>>>> }; > >>>>>>> > >>>>>>> +struct mthp_stat __percpu *mthp_stats; > >>>>>>> + > >>>>>>> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) > >>>>>>> +{ > >>>>>>> + unsigned long sum = 0; > >>>>>>> + int cpu; > >>>>>>> + > >>>>>>> + cpus_read_lock(); > >>>>>>> + for_each_online_cpu(cpu) { > >>>>>>> + struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu); > >>>>>>> + > >>>>>>> + sum += this->stats[order][item]; > >>>>>>> + } > >>>>>>> + cpus_read_unlock(); > >>>>>>> + > >>>>>>> + return sum; > >>>>>>> +} > >>>>>>> + > >>>>>>> +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \ > >>>>>>> +static ssize_t _name##_show(struct kobject *kobj, \ > >>>>>>> + struct kobj_attribute *attr, char *buf) \ > >>>>>>> +{ \ > >>>>>>> + int order = to_thpsize(kobj)->order; \ > >>>>>>> + \ > >>>>>>> + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ > >>>>>>> +} \ > >>>>>>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) > >>>>>>> + > >>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); > >>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); > >>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > >>>>>>> + > >>>>>>> +static struct attribute *stats_attrs[] = { > >>>>>>> + &anon_fault_alloc_attr.attr, > >>>>>>> + &anon_fault_fallback_attr.attr, > >>>>>>> + &anon_fault_fallback_charge_attr.attr, > >>>>>>> + NULL, > >>>>>>> +}; > >>>>>>> + > >>>>>>> +static struct attribute_group stats_attr_group = { > >>>>>>> + .name = "stats", > >>>>>>> + .attrs = stats_attrs, > >>>>>>> +}; > >>>>>>> + > >>>>>>> static struct thpsize *thpsize_create(int order, struct kobject *parent) > >>>>>>> { > >>>>>>> unsigned long size = (PAGE_SIZE << order) / SZ_1K; > >>>>>>> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct 
kobject *parent) > >>>>>>> return ERR_PTR(ret); > >>>>>>> } > >>>>>>> > >>>>>>> + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); > >>>>>>> + if (ret) { > >>>>>>> + kobject_put(&thpsize->kobj); > >>>>>>> + return ERR_PTR(ret); > >>>>>>> + } > >>>>>>> + > >>>>>>> thpsize->order = order; > >>>>>>> return thpsize; > >>>>>>> } > >>>>>>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void) > >>>>>>> */ > >>>>>>> MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); > >>>>>>> > >>>>>>> + mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]), > >>>>>>> + sizeof(unsigned long)); > >>>>>> > >>>>>> Personally I think it would be cleaner to allocate statically using > >>>>>> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER. > >>>>> > >>>>> Hi Ryan, > >>>>> > >>>>> I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64, > >>>>> > >>>>> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) > >>>>> > >>>>> #define MAX_PTRS_PER_PTE PTRS_PER_PTE > >>>>> > >>>>> #define PTRS_PER_PTE (1 << (PAGE_SHIFT - 3)) > >>>>> > >>>>> while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number? > >>>>> > >>>>> > >>>>> Am I missing something? > >>>> > >>>> PTRS_PER_PTE is the number of PTE entries in a PTE table. On arm64 its as follows: > >>>> > >>>> PAGE_SIZE PAGE_SHIFT PTRS_PER_PTE > >>>> 4K 12 512 > >>>> 16K 14 2048 > >>>> 64K 16 8192 > >>>> > >>>> So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE > >>>> > >>>> PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE) > >>>> > >>>> MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have, > >>>> (and its equal to PTRS_PER_PTE except for powerpc). > >>>> > >>>> Pretty sure the math is correct? > >>> > >>> I am not convinced the math is correct :-) > >>> > >>> while page size is 64KiB, the page table is as below, > >>> PMD_ORDER = L2 index bits = [41:29] = 13 != ilog2(8192) > >> > >> 1 << 13 = 8192 > >> > >> Right? So: > >> > >> ilog2(8192) = 13 > >> > >> What's wrong with that? > >> > >> I even checked in Python to make sure I'm not going mad: > >> > >>>>> import math > >>>>> math.log2(8192) > >> 13.0 > > > > You're correct. My mind fixated on the '16' in the line '64K 16 8192'. > > I mistakenly thought ilog2(8192) equals 16. Apologies for the confusion. > > No worries! We got there in the end :) > > Of course my suggestion relies on being able to get a compile-time constant from > ilog2(MAX_PTRS_PER_PTE). I think that should work, right? I guess so, ilog2 can detect compile-time const, otherwise, it will find the last (most-significant) bit set. I've implemented the following change, and the build all passed. Currently conducting testing. 
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index c5beb54b97cb..d4fdb2641070 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -272,47 +272,17 @@ enum mthp_stat_item { }; struct mthp_stat { - unsigned long stats[0][__MTHP_STAT_COUNT]; + unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; }; -extern struct mthp_stat __percpu *mthp_stats; +DECLARE_PER_CPU(struct mthp_stat, mthp_stats); static inline void count_mthp_stat(int order, enum mthp_stat_item item) { - if (order <= 0 || order > PMD_ORDER || !mthp_stats) + if (order <= 0 || order > PMD_ORDER) return; - this_cpu_inc(mthp_stats->stats[order][item]); -} - -static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta) -{ - if (order <= 0 || order > PMD_ORDER || !mthp_stats) - return; - - this_cpu_add(mthp_stats->stats[order][item], delta); -} - -/* - * Fold the foreign cpu mthp stats into our own. - * - * This is adding to the stats on one processor - * but keeps the global counts constant. - */ -static inline void mthp_stats_fold_cpu(int cpu) -{ - struct mthp_stat *fold_stat; - int i, j; - - if (!mthp_stats) - return; - fold_stat = per_cpu_ptr(mthp_stats, cpu); - for (i = 1; i <= PMD_ORDER; i++) { - for (j = 0; j < __MTHP_STAT_COUNT; j++) { - count_mthp_stats(i, j, fold_stat->stats[i][j]); - fold_stat->stats[i][j] = 0; - } - } + this_cpu_inc(mthp_stats.stats[order][item]); } #define transparent_hugepage_use_zero_page() \ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 21c4ac74b484..e88961ffc398 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -526,20 +526,18 @@ static const struct kobj_type thpsize_ktype = { .sysfs_ops = &kobj_sysfs_ops, }; -struct mthp_stat __percpu *mthp_stats; +DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}}; static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) { unsigned long sum = 0; int cpu; - cpus_read_lock(); - for_each_online_cpu(cpu) { - struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu); + for_each_possible_cpu(cpu) { + struct mthp_stat *this = &per_cpu(mthp_stats, cpu); sum += this->stats[order][item]; } - cpus_read_unlock(); return sum; } @@ -741,11 +739,6 @@ static int __init hugepage_init(void) */ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); - mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]), - sizeof(unsigned long)); - if (!mthp_stats) - return -ENOMEM; - err = hugepage_init_sysfs(&hugepage_kobj); if (err) goto err_sysfs; @@ -780,8 +773,6 @@ static int __init hugepage_init(void) err_slab: hugepage_exit_sysfs(hugepage_kobj); err_sysfs: - free_percpu(mthp_stats); - mthp_stats = NULL; return err; } subsys_initcall(hugepage_init); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3135b5ca2457..b51becf03d1e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5840,10 +5840,6 @@ static int page_alloc_cpu_dead(unsigned int cpu) */ vm_events_fold_cpu(cpu); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - mthp_stats_fold_cpu(cpu); -#endif - /* * Zero the differential counters of the dead processor * so that the vm statistics are consistent. Thanks Barry
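The reason dropping mthp_stats_fold_cpu() is safe is that per-cpu storage for a CPU is not discarded when that CPU goes offline, so a reader that walks every possible CPU still sees whatever the offlined CPU counted. A toy userspace model of that property (plain arrays stand in for per-cpu data; all names here are made up for illustration):

#include <stdio.h>
#include <stdbool.h>

#define NR_POSSIBLE_CPUS 4
#define NR_ITEMS 3

/* Stand-ins for the per-cpu mthp stats: one row per possible CPU. */
static unsigned long fake_stats[NR_POSSIBLE_CPUS][NR_ITEMS];
static bool cpu_online[NR_POSSIBLE_CPUS] = { true, true, true, true };

static void count_event(int cpu, int item)
{
	fake_stats[cpu][item]++;	/* models this_cpu_inc() running on that CPU */
}

/* Models sum_mthp_stat() walking every *possible* CPU, online or not. */
static unsigned long sum_stat(int item)
{
	unsigned long sum = 0;

	for (int cpu = 0; cpu < NR_POSSIBLE_CPUS; cpu++)
		sum += fake_stats[cpu][item];
	return sum;
}

int main(void)
{
	count_event(0, 0);
	count_event(3, 0);
	count_event(3, 0);

	/* "Offline" CPU 3: its slot keeps its counts, no folding required. */
	cpu_online[3] = false;

	printf("item 0 total: %lu (expected 3)\n", sum_stat(0));
	return 0;
}

Marking CPU 3 offline changes nothing about the sum, which is exactly why the hotplug callback removed from page_alloc.c above is no longer needed.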
On 12/04/2024 11:53, Barry Song wrote: > On Fri, Apr 12, 2024 at 10:38 PM Ryan Roberts <ryan.roberts@arm.com> wrote: >> >> On 12/04/2024 11:29, Barry Song wrote: >>> On Fri, Apr 12, 2024 at 10:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote: >>>> >>>> On 12/04/2024 11:17, Barry Song wrote: >>>>> On Fri, Apr 12, 2024 at 9:56 PM Ryan Roberts <ryan.roberts@arm.com> wrote: >>>>>> >>>>>> On 12/04/2024 10:43, Barry Song wrote: >>>>>>> On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote: >>>>>>>> >>>>>>>> Hi Barry, >>>>>>>> >>>>>>>> 2 remaining comments - otherwise looks good. (same comments I just made in the >>>>>>>> v4 conversation). >>>>>>>> >>>>>>>> On 12/04/2024 08:37, Barry Song wrote: >>>>>>>>> From: Barry Song <v-songbaohua@oppo.com> >>>>>>>>> >>>>>>>>> Profiling a system blindly with mTHP has become challenging due to the >>>>>>>>> lack of visibility into its operations. Presenting the success rate of >>>>>>>>> mTHP allocations appears to be pressing need. >>>>>>>>> >>>>>>>>> Recently, I've been experiencing significant difficulty debugging >>>>>>>>> performance improvements and regressions without these figures. It's >>>>>>>>> crucial for us to understand the true effectiveness of mTHP in real-world >>>>>>>>> scenarios, especially in systems with fragmented memory. >>>>>>>>> >>>>>>>>> This patch establishes the framework for per-order mTHP >>>>>>>>> counters. It begins by introducing the anon_fault_alloc and >>>>>>>>> anon_fault_fallback counters. Additionally, to maintain consistency >>>>>>>>> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks >>>>>>>>> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP. >>>>>>>>> Incorporating additional counters should now be straightforward as well. >>>>>>>>> >>>>>>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com> >>>>>>>>> Cc: Chris Li <chrisl@kernel.org> >>>>>>>>> Cc: David Hildenbrand <david@redhat.com> >>>>>>>>> Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com> >>>>>>>>> Cc: Kairui Song <kasong@tencent.com> >>>>>>>>> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> >>>>>>>>> Cc: Peter Xu <peterx@redhat.com> >>>>>>>>> Cc: Ryan Roberts <ryan.roberts@arm.com> >>>>>>>>> Cc: Suren Baghdasaryan <surenb@google.com> >>>>>>>>> Cc: Yosry Ahmed <yosryahmed@google.com> >>>>>>>>> Cc: Yu Zhao <yuzhao@google.com> >>>>>>>>> --- >>>>>>>>> include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++ >>>>>>>>> mm/huge_memory.c | 61 +++++++++++++++++++++++++++++++++++++++++ >>>>>>>>> mm/memory.c | 3 ++ >>>>>>>>> mm/page_alloc.c | 4 +++ >>>>>>>>> 4 files changed, 119 insertions(+) >>>>>>>>> >>>>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h >>>>>>>>> index e896ca4760f6..c5beb54b97cb 100644 >>>>>>>>> --- a/include/linux/huge_mm.h >>>>>>>>> +++ b/include/linux/huge_mm.h >>>>>>>>> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, >>>>>>>>> enforce_sysfs, orders); >>>>>>>>> } >>>>>>>>> >>>>>>>>> +enum mthp_stat_item { >>>>>>>>> + MTHP_STAT_ANON_FAULT_ALLOC, >>>>>>>>> + MTHP_STAT_ANON_FAULT_FALLBACK, >>>>>>>>> + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, >>>>>>>>> + __MTHP_STAT_COUNT >>>>>>>>> +}; >>>>>>>>> + >>>>>>>>> +struct mthp_stat { >>>>>>>>> + unsigned long stats[0][__MTHP_STAT_COUNT]; >>>>>>>>> +}; >>>>>>>>> + >>>>>>>>> +extern struct mthp_stat __percpu *mthp_stats; >>>>>>>>> + >>>>>>>>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item) >>>>>>>>> +{ >>>>>>>>> + if (order <= 0 || order > PMD_ORDER || !mthp_stats) 
>>>>>>>>> + return; >>>>>>>>> + >>>>>>>>> + this_cpu_inc(mthp_stats->stats[order][item]); >>>>>>>>> +} >>>>>>>>> + >>>>>>>>> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta) >>>>>>>>> +{ >>>>>>>>> + if (order <= 0 || order > PMD_ORDER || !mthp_stats) >>>>>>>>> + return; >>>>>>>>> + >>>>>>>>> + this_cpu_add(mthp_stats->stats[order][item], delta); >>>>>>>>> +} >>>>>>>>> + >>>>>>>>> +/* >>>>>>>>> + * Fold the foreign cpu mthp stats into our own. >>>>>>>>> + * >>>>>>>>> + * This is adding to the stats on one processor >>>>>>>>> + * but keeps the global counts constant. >>>>>>>>> + */ >>>>>>>>> +static inline void mthp_stats_fold_cpu(int cpu) >>>>>>>>> +{ >>>>>>>>> + struct mthp_stat *fold_stat; >>>>>>>>> + int i, j; >>>>>>>>> + >>>>>>>>> + if (!mthp_stats) >>>>>>>>> + return; >>>>>>>>> + fold_stat = per_cpu_ptr(mthp_stats, cpu); >>>>>>>>> + for (i = 1; i <= PMD_ORDER; i++) { >>>>>>>>> + for (j = 0; j < __MTHP_STAT_COUNT; j++) { >>>>>>>>> + count_mthp_stats(i, j, fold_stat->stats[i][j]); >>>>>>>>> + fold_stat->stats[i][j] = 0; >>>>>>>>> + } >>>>>>>>> + } >>>>>>>>> +} >>>>>>>> >>>>>>>> This is a pretty horrible hack; I'm pretty sure just summing for all *possible* >>>>>>>> cpus should work. >>>>>>>> >>>>>>>>> + >>>>>>>>> #define transparent_hugepage_use_zero_page() \ >>>>>>>>> (transparent_hugepage_flags & \ >>>>>>>>> (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) >>>>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >>>>>>>>> index dc30139590e6..21c4ac74b484 100644 >>>>>>>>> --- a/mm/huge_memory.c >>>>>>>>> +++ b/mm/huge_memory.c >>>>>>>>> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = { >>>>>>>>> .sysfs_ops = &kobj_sysfs_ops, >>>>>>>>> }; >>>>>>>>> >>>>>>>>> +struct mthp_stat __percpu *mthp_stats; >>>>>>>>> + >>>>>>>>> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) >>>>>>>>> +{ >>>>>>>>> + unsigned long sum = 0; >>>>>>>>> + int cpu; >>>>>>>>> + >>>>>>>>> + cpus_read_lock(); >>>>>>>>> + for_each_online_cpu(cpu) { >>>>>>>>> + struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu); >>>>>>>>> + >>>>>>>>> + sum += this->stats[order][item]; >>>>>>>>> + } >>>>>>>>> + cpus_read_unlock(); >>>>>>>>> + >>>>>>>>> + return sum; >>>>>>>>> +} >>>>>>>>> + >>>>>>>>> +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \ >>>>>>>>> +static ssize_t _name##_show(struct kobject *kobj, \ >>>>>>>>> + struct kobj_attribute *attr, char *buf) \ >>>>>>>>> +{ \ >>>>>>>>> + int order = to_thpsize(kobj)->order; \ >>>>>>>>> + \ >>>>>>>>> + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ >>>>>>>>> +} \ >>>>>>>>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) >>>>>>>>> + >>>>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); >>>>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); >>>>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >>>>>>>>> + >>>>>>>>> +static struct attribute *stats_attrs[] = { >>>>>>>>> + &anon_fault_alloc_attr.attr, >>>>>>>>> + &anon_fault_fallback_attr.attr, >>>>>>>>> + &anon_fault_fallback_charge_attr.attr, >>>>>>>>> + NULL, >>>>>>>>> +}; >>>>>>>>> + >>>>>>>>> +static struct attribute_group stats_attr_group = { >>>>>>>>> + .name = "stats", >>>>>>>>> + .attrs = stats_attrs, >>>>>>>>> +}; >>>>>>>>> + >>>>>>>>> static struct thpsize *thpsize_create(int order, struct kobject *parent) >>>>>>>>> { >>>>>>>>> unsigned long size = (PAGE_SIZE << order) / SZ_1K; >>>>>>>>> @@ -549,6 +593,12 @@ static struct 
thpsize *thpsize_create(int order, struct kobject *parent) >>>>>>>>> return ERR_PTR(ret); >>>>>>>>> } >>>>>>>>> >>>>>>>>> + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); >>>>>>>>> + if (ret) { >>>>>>>>> + kobject_put(&thpsize->kobj); >>>>>>>>> + return ERR_PTR(ret); >>>>>>>>> + } >>>>>>>>> + >>>>>>>>> thpsize->order = order; >>>>>>>>> return thpsize; >>>>>>>>> } >>>>>>>>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void) >>>>>>>>> */ >>>>>>>>> MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); >>>>>>>>> >>>>>>>>> + mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]), >>>>>>>>> + sizeof(unsigned long)); >>>>>>>> >>>>>>>> Personally I think it would be cleaner to allocate statically using >>>>>>>> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER. >>>>>>> >>>>>>> Hi Ryan, >>>>>>> >>>>>>> I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64, >>>>>>> >>>>>>> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) >>>>>>> >>>>>>> #define MAX_PTRS_PER_PTE PTRS_PER_PTE >>>>>>> >>>>>>> #define PTRS_PER_PTE (1 << (PAGE_SHIFT - 3)) >>>>>>> >>>>>>> while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number? >>>>>>> >>>>>>> >>>>>>> Am I missing something? >>>>>> >>>>>> PTRS_PER_PTE is the number of PTE entries in a PTE table. On arm64 its as follows: >>>>>> >>>>>> PAGE_SIZE PAGE_SHIFT PTRS_PER_PTE >>>>>> 4K 12 512 >>>>>> 16K 14 2048 >>>>>> 64K 16 8192 >>>>>> >>>>>> So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE >>>>>> >>>>>> PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE) >>>>>> >>>>>> MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have, >>>>>> (and its equal to PTRS_PER_PTE except for powerpc). >>>>>> >>>>>> Pretty sure the math is correct? >>>>> >>>>> I am not convinced the math is correct :-) >>>>> >>>>> while page size is 64KiB, the page table is as below, >>>>> PMD_ORDER = L2 index bits = [41:29] = 13 != ilog2(8192) >>>> >>>> 1 << 13 = 8192 >>>> >>>> Right? So: >>>> >>>> ilog2(8192) = 13 >>>> >>>> What's wrong with that? >>>> >>>> I even checked in Python to make sure I'm not going mad: >>>> >>>>>>> import math >>>>>>> math.log2(8192) >>>> 13.0 >>> >>> You're correct. My mind fixated on the '16' in the line '64K 16 8192'. >>> I mistakenly thought ilog2(8192) equals 16. Apologies for the confusion. >> >> No worries! We got there in the end :) >> >> Of course my suggestion relies on being able to get a compile-time constant from >> ilog2(MAX_PTRS_PER_PTE). I think that should work, right? > > I guess so, ilog2 can detect compile-time const, otherwise, it will find the > last (most-significant) bit set. > > I've implemented the following change, and the build all passed. > Currently conducting testing. LGTM - much cleaner! 
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > index c5beb54b97cb..d4fdb2641070 100644 > --- a/include/linux/huge_mm.h > +++ b/include/linux/huge_mm.h > @@ -272,47 +272,17 @@ enum mthp_stat_item { > }; > > struct mthp_stat { > - unsigned long stats[0][__MTHP_STAT_COUNT]; > + unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; > }; > > -extern struct mthp_stat __percpu *mthp_stats; > +DECLARE_PER_CPU(struct mthp_stat, mthp_stats); > > static inline void count_mthp_stat(int order, enum mthp_stat_item item) > { > - if (order <= 0 || order > PMD_ORDER || !mthp_stats) > + if (order <= 0 || order > PMD_ORDER) > return; > > - this_cpu_inc(mthp_stats->stats[order][item]); > -} > - > -static inline void count_mthp_stats(int order, enum mthp_stat_item > item, long delta) > -{ > - if (order <= 0 || order > PMD_ORDER || !mthp_stats) > - return; > - > - this_cpu_add(mthp_stats->stats[order][item], delta); > -} > - > -/* > - * Fold the foreign cpu mthp stats into our own. > - * > - * This is adding to the stats on one processor > - * but keeps the global counts constant. > - */ > -static inline void mthp_stats_fold_cpu(int cpu) > -{ > - struct mthp_stat *fold_stat; > - int i, j; > - > - if (!mthp_stats) > - return; > - fold_stat = per_cpu_ptr(mthp_stats, cpu); > - for (i = 1; i <= PMD_ORDER; i++) { > - for (j = 0; j < __MTHP_STAT_COUNT; j++) { > - count_mthp_stats(i, j, fold_stat->stats[i][j]); > - fold_stat->stats[i][j] = 0; > - } > - } > + this_cpu_inc(mthp_stats.stats[order][item]); > } > > #define transparent_hugepage_use_zero_page() \ > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 21c4ac74b484..e88961ffc398 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -526,20 +526,18 @@ static const struct kobj_type thpsize_ktype = { > .sysfs_ops = &kobj_sysfs_ops, > }; > > -struct mthp_stat __percpu *mthp_stats; > +DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}}; > > static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) > { > unsigned long sum = 0; > int cpu; > > - cpus_read_lock(); > - for_each_online_cpu(cpu) { > - struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu); > + for_each_possible_cpu(cpu) { > + struct mthp_stat *this = &per_cpu(mthp_stats, cpu); > > sum += this->stats[order][item]; > } > - cpus_read_unlock(); > > return sum; > } > @@ -741,11 +739,6 @@ static int __init hugepage_init(void) > */ > MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); > > - mthp_stats = __alloc_percpu((PMD_ORDER + 1) * > sizeof(mthp_stats->stats[0]), > - sizeof(unsigned long)); > - if (!mthp_stats) > - return -ENOMEM; > - > err = hugepage_init_sysfs(&hugepage_kobj); > if (err) > goto err_sysfs; > @@ -780,8 +773,6 @@ static int __init hugepage_init(void) > err_slab: > hugepage_exit_sysfs(hugepage_kobj); > err_sysfs: > - free_percpu(mthp_stats); > - mthp_stats = NULL; > return err; > } > subsys_initcall(hugepage_init); > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 3135b5ca2457..b51becf03d1e 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -5840,10 +5840,6 @@ static int page_alloc_cpu_dead(unsigned int cpu) > */ > vm_events_fold_cpu(cpu); > > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE > - mthp_stats_fold_cpu(cpu); > -#endif > - > /* > * Zero the differential counters of the dead processor > * so that the vm statistics are consistent. > > > Thanks > Barry
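For reference, a rough userspace reader for the interface the agreed-on version exposes. The hugepages-<size>kB directories already exist upstream; the stats/ group and the three counter files are what this series adds. The 64kB size below is only an example and depends on the running kernel's page size and which orders are enabled:

#include <stdio.h>

int main(void)
{
	static const char *names[] = {
		"anon_fault_alloc",
		"anon_fault_fallback",
		"anon_fault_fallback_charge",
	};
	char path[256];
	unsigned long val;

	for (unsigned int i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		/* Example path; substitute whichever size directory exists. */
		snprintf(path, sizeof(path),
			 "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/%s",
			 names[i]);
		FILE *f = fopen(path, "r");

		if (!f) {
			perror(path);
			continue;
		}
		if (fscanf(f, "%lu", &val) == 1)
			printf("%s: %lu\n", names[i], val);
		fclose(f);
	}
	return 0;
}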
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e896ca4760f6..c5beb54b97cb 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, enforce_sysfs, orders); } +enum mthp_stat_item { + MTHP_STAT_ANON_FAULT_ALLOC, + MTHP_STAT_ANON_FAULT_FALLBACK, + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, + __MTHP_STAT_COUNT +}; + +struct mthp_stat { + unsigned long stats[0][__MTHP_STAT_COUNT]; +}; + +extern struct mthp_stat __percpu *mthp_stats; + +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ + if (order <= 0 || order > PMD_ORDER || !mthp_stats) + return; + + this_cpu_inc(mthp_stats->stats[order][item]); +} + +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta) +{ + if (order <= 0 || order > PMD_ORDER || !mthp_stats) + return; + + this_cpu_add(mthp_stats->stats[order][item], delta); +} + +/* + * Fold the foreign cpu mthp stats into our own. + * + * This is adding to the stats on one processor + * but keeps the global counts constant. + */ +static inline void mthp_stats_fold_cpu(int cpu) +{ + struct mthp_stat *fold_stat; + int i, j; + + if (!mthp_stats) + return; + fold_stat = per_cpu_ptr(mthp_stats, cpu); + for (i = 1; i <= PMD_ORDER; i++) { + for (j = 0; j < __MTHP_STAT_COUNT; j++) { + count_mthp_stats(i, j, fold_stat->stats[i][j]); + fold_stat->stats[i][j] = 0; + } + } +} + #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index dc30139590e6..21c4ac74b484 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = { .sysfs_ops = &kobj_sysfs_ops, }; +struct mthp_stat __percpu *mthp_stats; + +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) +{ + unsigned long sum = 0; + int cpu; + + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu); + + sum += this->stats[order][item]; + } + cpus_read_unlock(); + + return sum; +} + +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \ +static ssize_t _name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + int order = to_thpsize(kobj)->order; \ + \ + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ +} \ +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); + +static struct attribute *stats_attrs[] = { + &anon_fault_alloc_attr.attr, + &anon_fault_fallback_attr.attr, + &anon_fault_fallback_charge_attr.attr, + NULL, +}; + +static struct attribute_group stats_attr_group = { + .name = "stats", + .attrs = stats_attrs, +}; + static struct thpsize *thpsize_create(int order, struct kobject *parent) { unsigned long size = (PAGE_SIZE << order) / SZ_1K; @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent) return ERR_PTR(ret); } + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); + if (ret) { + kobject_put(&thpsize->kobj); + return ERR_PTR(ret); + } + thpsize->order = order; return thpsize; } @@ -691,6 +741,11 @@ static int __init hugepage_init(void) */ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); + mthp_stats = 
__alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]), + sizeof(unsigned long)); + if (!mthp_stats) + return -ENOMEM; + err = hugepage_init_sysfs(&hugepage_kobj); if (err) goto err_sysfs; @@ -725,6 +780,8 @@ static int __init hugepage_init(void) err_slab: hugepage_exit_sysfs(hugepage_kobj); err_sysfs: + free_percpu(mthp_stats); + mthp_stats = NULL; return err; } subsys_initcall(hugepage_init); @@ -880,6 +937,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); return VM_FAULT_FALLBACK; } folio_throttle_swaprate(folio, gfp); @@ -929,6 +988,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } @@ -1050,6 +1110,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); if (unlikely(!folio)) { count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); diff --git a/mm/memory.c b/mm/memory.c index 649a547fe8e3..06048af7cf9a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4368,6 +4368,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) folio = vma_alloc_folio(gfp, order, vma, addr, true); if (folio) { if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); folio_put(folio); goto next; } @@ -4376,6 +4377,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) return folio; } next: + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); order = next_order(&orders, order); } @@ -4485,6 +4487,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) folio_ref_add(folio, nr_pages - 1); add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); + count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); folio_add_new_anon_rmap(folio, vma, addr); folio_add_lru_vma(folio, vma); setpte: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b51becf03d1e..3135b5ca2457 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5840,6 +5840,10 @@ static int page_alloc_cpu_dead(unsigned int cpu) */ vm_events_fold_cpu(cpu); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + mthp_stats_fold_cpu(cpu); +#endif + /* * Zero the differential counters of the dead processor * so that the vm statistics are consistent.
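A quick way to make the new counters move, assuming at least one mTHP size is enabled under /sys/kernel/mm/transparent_hugepage/: fault in some MADV_HUGEPAGE anonymous memory and diff anon_fault_alloc / anon_fault_fallback before and after the run. A minimal sketch:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define LEN (64UL << 20)	/* 64 MiB of anonymous memory */

int main(void)
{
	char *buf = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Hint that (m)THP is wanted for this range. */
	if (madvise(buf, LEN, MADV_HUGEPAGE))
		perror("madvise");

	/* Touch every page so do_anonymous_page() runs and the counters move. */
	memset(buf, 0x5a, LEN);

	puts("now compare hugepages-*kB/stats/anon_fault_* with the values "
	     "sampled before this run");
	munmap(buf, LEN);
	return 0;
}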