@@ -800,7 +800,12 @@ enum zone_type {
};
-#define __MAX_NR_ZONES __MAX_NR_ZONE_TYPES
+#ifdef CONFIG_64BIT
+#define __MAX_NR_SPLIT_ZONES 4
+#else
+#define __MAX_NR_SPLIT_ZONES 0
+#endif
+#define __MAX_NR_ZONES (__MAX_NR_ZONE_TYPES + __MAX_NR_SPLIT_ZONES)
#ifndef __GENERATING_BOUNDS_H
@@ -1106,6 +1111,12 @@ static inline bool zone_intersects(struct zone *zone,
return true;
}
+#ifdef CONFIG_64BIT
+#define MAX_NR_ZONES_PER_TYPE 4
+#else
+#define MAX_NR_ZONES_PER_TYPE 1
+#endif
+
struct zone_type_struct {
int start_zone_idx;
int last_zone_idx;
@@ -32,6 +32,8 @@
#define ZONES_SHIFT 2
#elif MAX_NR_ZONES <= 8
#define ZONES_SHIFT 3
+#elif MAX_NR_ZONES <= 16
+#define ZONES_SHIFT 4
#else
#error ZONES_SHIFT "Too many zones configured"
#endif
@@ -7836,24 +7836,106 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
}
#endif
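+/* Fill in the size fields of one (possibly split) zone covering PFNs [start_pfn, end_pfn) */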
+static void __init setup_zone_size(struct pglist_data *pgdat, struct zone *zone,
+ enum zone_type zt, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long spanned, absent;
+ unsigned long zstart_pfn, zend_pfn;
+
+ spanned = zone_spanned_pages_in_node(pgdat->node_id, zt,
+ start_pfn,
+ end_pfn,
+ &zstart_pfn,
+ &zend_pfn);
+ absent = zone_absent_pages_in_node(pgdat->node_id, zt,
+ start_pfn,
+ end_pfn);
+ zone->zone_start_pfn = zstart_pfn;
+ zone->spanned_pages = spanned;
+ zone->present_pages = spanned - absent;
+#if defined(CONFIG_MEMORY_HOTPLUG)
+ zone->present_early_pages = zone->present_pages;
+#endif
+}
+
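+/* Split zone boundaries are aligned to 1GB (expressed in pages) */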
+#define SPLIT_ZONE_ALIGN_PAGES ((1UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
+
+#ifdef CONFIG_64BIT
+/* 254GB instead of 256GB to deal with ZONE_DMA32 and small memory holes */
+#define SPLIT_ZONE_PAGES ((254UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
+
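+/* Number of zone instances to create for one zone type */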
+static int split_zone_type_number(struct pglist_data *pgdat,
+ struct zone_type_struct *zts,
+ struct zone *zone)
+{
+ int nr, remaining;
+
+ if (zts->present_pages < SPLIT_ZONE_PAGES * 2)
+ return 1;
+
+ /* Number of remaining zone slots that can still be used for this zone type */
+ remaining = 1 + (MAX_NR_ZONES - MAX_NR_ZONE_TYPES) -
+ ((zone - pgdat->node_zones) - (zts - pgdat->node_zone_types));
+ nr = zts->present_pages / SPLIT_ZONE_PAGES;
+ nr = min3(nr, remaining, MAX_NR_ZONES_PER_TYPE);
+
+ return nr;
+}
+#else
+static int split_zone_type_number(struct pglist_data *pgdat,
+ struct zone_type_struct *zts,
+ struct zone *zone)
+{
+ return 1;
+}
+#endif
+
static void __init zones_init(struct pglist_data *pgdat)
{
- enum zone_type j;
+ enum zone_type zt;
struct zone_type_struct *zts;
- struct zone *zone;
+ struct zone *zone = pgdat->node_zones;
+ int split_nr;
- for (j = 0; j < MAX_NR_ZONE_TYPES; j++) {
- zts = pgdat->node_zone_types + j;
- zone = pgdat->node_zones + j;
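+ /* One zone type split MAX_NR_ZONES_PER_TYPE ways needs at most __MAX_NR_SPLIT_ZONES extra zone slots */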
+ BUILD_BUG_ON(MAX_NR_ZONES_PER_TYPE > __MAX_NR_SPLIT_ZONES + 1);
+ for (zt = 0; zt < MAX_NR_ZONE_TYPES; zt++) {
+ zts = pgdat->node_zone_types + zt;
- zts->start_zone_idx = zts->last_zone_idx = zone - pgdat->node_zones;
- zone->type = j;
- zone->zone_start_pfn = zts->zts_start_pfn;
- zone->spanned_pages = zts->spanned_pages;
- zone->present_pages = zts->present_pages;
+ zts->start_zone_idx = zone - pgdat->node_zones;
+ split_nr = split_zone_type_number(pgdat, zts, zone);
+ if (split_nr > 1) {
+ unsigned long split_span = zts->spanned_pages / split_nr;
+ unsigned long start_pfn = zts->zts_start_pfn;
+ unsigned long end_pfn;
+ unsigned long zts_end_pfn = zts->zts_start_pfn + zts->spanned_pages;
+ int i;
+
+ for (i = 0; i < split_nr && start_pfn < zts_end_pfn; i++) {
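+ /* The last split zone takes all remaining pages of the zone type */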
+ if (i == split_nr - 1) {
+ end_pfn = zts_end_pfn;
+ } else {
+ end_pfn = ALIGN(start_pfn + split_span,
+ SPLIT_ZONE_ALIGN_PAGES);
+ if (end_pfn > zts_end_pfn)
+ end_pfn = zts_end_pfn;
+ }
+ setup_zone_size(pgdat, zone, zt, start_pfn, end_pfn);
+ zone->type = zt;
+ start_pfn = end_pfn;
+ zone++;
+ }
+ } else {
+ zone->type = zt;
+ zone->zone_start_pfn = zts->zts_start_pfn;
+ zone->spanned_pages = zts->spanned_pages;
+ zone->present_pages = zts->present_pages;
#if defined(CONFIG_MEMORY_HOTPLUG)
- zone->present_early_pages = zts->present_early_pages;
+ zone->present_early_pages = zts->present_early_pages;
#endif
+ zone++;
+ }
+ zts->last_zone_idx = (zone - pgdat->node_zones) - 1;
}
}
More and more cores are packed into one physical CPU (usually one NUMA node as well). In 2023, a high-end server CPU has 56, 64, or more cores, and even more cores per physical CPU are planned for future generations. In most cases, all cores in one physical CPU contend for page allocation on one zone. This causes heavy zone lock contention in some workloads, and the situation will only get worse. For example, on a 2-socket Intel server machine with 224 logical CPUs, if the kernel is built with `make -j224`, the zone lock contention cycles% can reach about 12.7%.

To improve the scalability of zone lock contention, this patch creates one zone instance for roughly every 256 GB of memory of a zone type. In the next patch of the series, different logical CPUs will prefer different zone instances based on the logical CPU number, so the number of logical CPUs contending on any one zone is reduced and scalability improves.

Combined with the next patch in the series ("mm: prefer different zone list on different logical CPU"), the zone lock contention cycles% drops to less than 1.6% in the above kbuild test case when 4 zone instances are created for ZONE_NORMAL.

Also tested with will-it-scale/page_fault1 with 16 processes. With the optimization, the benchmark score increases by up to 18.2% and the zone lock contention reduces from 13.01% to 0.56%.

Another option for creating multiple zone instances per zone type would be to base the number of instances on the total number of logical CPUs. We choose memory size because it is easier to implement, and in most cases more cores come with more memory. In addition, on systems with larger memory, the performance requirement on the page allocator is usually higher.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
---
 include/linux/mmzone.h            |  13 +++-
 include/linux/page-flags-layout.h |   2 +
 mm/page_alloc.c                   | 104 ++++++++++++++++++++++++++----
 3 files changed, 107 insertions(+), 12 deletions(-)
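
(Note, not part of the patch: a minimal user-space sketch of the sizing rule described above. It mirrors split_zone_type_number(), reusing the 254 GB SPLIT_ZONE_PAGES threshold and the 4-instance MAX_NR_ZONES_PER_TYPE cap from this patch, but ignores the remaining-zone-slot clamp. The helper name and the example node sizes are made up for illustration.)

#include <stdio.h>

/* Mirrors SPLIT_ZONE_PAGES (254 GB) and MAX_NR_ZONES_PER_TYPE (4 on
 * 64-bit) from this patch, expressed in bytes for the illustration. */
#define SPLIT_ZONE_BYTES	(254ULL << 30)
#define MAX_NR_ZONES_PER_TYPE	4

/* How many zone instances would be created for a zone type of the
 * given size, ignoring the "remaining zone slots" limit for clarity. */
static int nr_zone_instances(unsigned long long zone_type_bytes)
{
	unsigned long long nr;

	if (zone_type_bytes < 2 * SPLIT_ZONE_BYTES)
		return 1;
	nr = zone_type_bytes / SPLIT_ZONE_BYTES;
	return nr < MAX_NR_ZONES_PER_TYPE ? (int)nr : MAX_NR_ZONES_PER_TYPE;
}

int main(void)
{
	/* Hypothetical per-node ZONE_NORMAL sizes. */
	unsigned long long sizes[] = {
		128ULL << 30, 512ULL << 30, 1024ULL << 30, 4096ULL << 30,
	};

	for (int i = 0; i < 4; i++)
		printf("%4llu GB -> %d zone instance(s)\n",
		       sizes[i] >> 30, nr_zone_instances(sizes[i]));
	return 0;
}

For these inputs it prints 1, 2, 4, and 4 instances respectively: a zone type smaller than 508 GB is never split, and 1 TB or more is capped at 4 instances.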