@@ -164,7 +164,11 @@ static inline int gfp_zonelist(gfp_t flags)
*/
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
- return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int li;
+
+ li = raw_smp_processor_id() % pgdat->max_nr_zones_per_type;
+ return pgdat->node_zonelists[li] + gfp_zonelist(flags);
}
#ifndef HAVE_ARCH_FREE_PAGE
@@ -1240,10 +1240,11 @@ typedef struct pglist_data {
* Generally the first zones will be references to this node's
* node_zones.
*/
- struct zonelist node_zonelists[MAX_ZONELISTS];
+ struct zonelist node_zonelists[MAX_NR_ZONES_PER_TYPE][MAX_ZONELISTS];
int nr_zones; /* number of populated zones in this node */
int nr_zone_types;
+ int max_nr_zones_per_type;
#ifdef CONFIG_FLATMEM /* means !SPARSEMEM */
struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
@@ -1699,7 +1700,7 @@ static inline bool movable_only_nodes(nodemask_t *nodes)
* at least one zone that can satisfy kernel allocations.
*/
nid = first_node(*nodes);
- zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
+ zonelist = &NODE_DATA(nid)->node_zonelists[0][ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes);
return (!z->zone) ? true : false;
}
@@ -1924,7 +1924,11 @@ unsigned int mempolicy_slab_node(void)
*/
struct zonelist *zonelist;
enum zone_type highest_zone_type = gfp_zone(GFP_KERNEL);
- zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
+ pg_data_t *pgdat = NODE_DATA(node);
+ int li;
+
+ li = raw_smp_processor_id() % pgdat->max_nr_zones_per_type;
+ zonelist = &pgdat->node_zonelists[li][ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, highest_zone_type,
&policy->nodes);
return z->zone ? zone_to_nid(z->zone) : node;
@@ -40,7 +40,7 @@ void __init mminit_verify_zonelist(void)
/* Identify the zone and nodelist */
zoneid = i % MAX_NR_ZONES;
listid = i / MAX_NR_ZONES;
- zonelist = &pgdat->node_zonelists[listid];
+ zonelist = &pgdat->node_zonelists[0][listid];
zone = &pgdat->node_zones[zoneid];
if (!populated_zone(zone))
continue;
@@ -6351,20 +6351,25 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
*
* Add all populated zones of a node to the zonelist.
*/
-static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
+static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs,
+ int zidx_in_type)
{
+ struct zone_type_struct *zts;
struct zone *zone;
- int zid = MAX_NR_ZONES;
- int nr_zones = 0;
+ int zt, i, nr, nr_zones = 0;
- do {
- zid--;
- zone = pgdat->node_zones + zid;
- if (populated_zone(zone)) {
+ for (zt = MAX_NR_ZONE_TYPES - 1; zt >= 0; zt--) {
+ zts = pgdat->node_zone_types + zt;
+ if (!zts->present_pages)
+ continue;
+ nr = zts->last_zone_idx - zts->start_zone_idx + 1;
+ for (i = 0; i < nr; i++) {
+ zone = pgdat->node_zones + zts->start_zone_idx;
+ zone += (zidx_in_type + i) % nr;
zoneref_set_zone(zone, &zonerefs[nr_zones++]);
check_highest_zone(zone_type_num(zone));
}
- } while (zid);
+ }
return nr_zones;
}
@@ -6462,27 +6467,48 @@ int find_next_best_node(int node, nodemask_t *used_node_mask)
}
+static void __build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
+ unsigned nr_nodes, int zidx_in_type)
+{
+ struct zoneref *zonerefs;
+ int i;
+
+ zonerefs = pgdat->node_zonelists[zidx_in_type][ZONELIST_FALLBACK]._zonerefs;
+
+ for (i = 0; i < nr_nodes; i++) {
+ int nr_zones;
+
+ pg_data_t *node = NODE_DATA(node_order[i]);
+
+ nr_zones = build_zonerefs_node(node, zonerefs, zidx_in_type);
+ zonerefs += nr_zones;
+ }
+ zonerefs->zone = NULL;
+ zonerefs->zone_type = 0;
+}
+
/*
* Build zonelists ordered by node and zones within node.
* This results in maximum locality--normal zone overflows into local
* DMA zone, if any--but risks exhausting DMA zone.
*/
static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
- unsigned nr_nodes)
+ unsigned nr_nodes)
{
- struct zoneref *zonerefs;
int i;
- zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+ for (i = 0; i < pgdat->max_nr_zones_per_type; i++)
+ __build_zonelists_in_node_order(pgdat, node_order, nr_nodes, i);
+}
- for (i = 0; i < nr_nodes; i++) {
- int nr_zones;
+static void __build_thisnode_zonelists(pg_data_t *pgdat, int zidx_in_type)
+{
+ struct zoneref *zonerefs;
+ int nr_zones;
- pg_data_t *node = NODE_DATA(node_order[i]);
-
- nr_zones = build_zonerefs_node(node, zonerefs);
- zonerefs += nr_zones;
- }
+ zonerefs = pgdat->node_zonelists[zidx_in_type][ZONELIST_NOFALLBACK]._zonerefs;
+ nr_zones = build_zonerefs_node(pgdat, zonerefs, zidx_in_type);
+ zonerefs += nr_zones;
zonerefs->zone = NULL;
zonerefs->zone_type = 0;
}
@@ -6492,14 +6518,10 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
*/
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
- struct zoneref *zonerefs;
- int nr_zones;
+ int i;
- zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
- nr_zones = build_zonerefs_node(pgdat, zonerefs);
- zonerefs += nr_zones;
- zonerefs->zone = NULL;
- zonerefs->zone_type = 0;
+ for (i = 0; i < pgdat->max_nr_zones_per_type; i++)
+ __build_thisnode_zonelists(pgdat, i);
}
/*
@@ -6565,7 +6587,7 @@ static void setup_min_unmapped_ratio(void);
static void setup_min_slab_ratio(void);
#else /* CONFIG_NUMA */
-static void build_zonelists(pg_data_t *pgdat)
+static void __build_zonelists(pg_data_t *pgdat, int zidx_in_type)
{
int node, local_node;
struct zoneref *zonerefs;
@@ -6573,8 +6595,8 @@ static void build_zonelists(pg_data_t *pgdat)
local_node = pgdat->node_id;
- zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
- nr_zones = build_zonerefs_node(pgdat, zonerefs);
+ zonerefs = pgdat->node_zonelists[zidx_in_type][ZONELIST_FALLBACK]._zonerefs;
+ nr_zones = build_zonerefs_node(pgdat, zonerefs, zidx_in_type);
zonerefs += nr_zones;
/*
@@ -6588,13 +6610,13 @@ static void build_zonelists(pg_data_t *pgdat)
for (node = local_node + 1; node < MAX_NUMNODES; node++) {
if (!node_online(node))
continue;
- nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+ nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs, zidx_in_type);
zonerefs += nr_zones;
}
for (node = 0; node < local_node; node++) {
if (!node_online(node))
continue;
- nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+ nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs, zidx_in_type);
zonerefs += nr_zones;
}
@@ -6602,6 +6624,14 @@ static void build_zonelists(pg_data_t *pgdat)
zonerefs->zone_type = 0;
}
+static void build_zonelists(pg_data_t *pgdat)
+{
+ int i;
+
+ for (i = 0; i < pgdat->max_nr_zones_per_type; i++)
+ __build_zonelists(pgdat, i);
+}
+
#endif /* CONFIG_NUMA */
/*
@@ -7899,6 +7929,7 @@ static void __init zones_init(struct pglist_data *pgdat)
int split_nr;
BUILD_BUG_ON(MAX_NR_ZONES_PER_TYPE > __MAX_NR_SPLIT_ZONES + 1);
+ pgdat->max_nr_zones_per_type = 1;
for (zt = 0; zt < MAX_NR_ZONE_TYPES; zt++) {
zts = pgdat->node_zone_types + zt;
@@ -7925,6 +7956,8 @@ static void __init zones_init(struct pglist_data *pgdat)
start_pfn = end_pfn;
zone++;
}
+ if (i > pgdat->max_nr_zones_per_type)
+ pgdat->max_nr_zones_per_type = i;
} else {
zone->type = zt;
zone->zone_start_pfn = zts->zts_start_pfn;
Originally, there was only one fallback and one no-fallback zone list for each NUMA node (pglist_data->node_zonelists). That is, all logical CPUs of a NUMA node used the same zone list during page allocation. This wasn't a problem before, because there was at most one zone instance for each zone type. Now, we may create multiple zone instances for one zone type. This makes it possible for different logical CPUs to prefer different zone instances of the same zone type, which improves zone lock scalability. So, in this patch, multiple fallback and multiple no-fallback zone lists can be created for each NUMA node, based on the maximum number of zone instances of any single zone type on that NUMA node. Different logical CPUs then prefer different zone lists, selected by the logical CPU number modulo that per-node maximum. Combined with the previous patches in the series, this effectively improves zone lock scalability in the kbuild test case. Details can be found in the description of the previous patch of the series. Signed-off-by: "Huang, Ying" <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: David Hildenbrand <david@redhat.com> Cc: Johannes Weiner <jweiner@redhat.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Pavel Tatashin <pasha.tatashin@soleen.com> Cc: Matthew Wilcox <willy@infradead.org> --- include/linux/gfp.h | 6 ++- include/linux/mmzone.h | 5 ++- mm/mempolicy.c | 6 ++- mm/mm_init.c | 2 +- mm/page_alloc.c | 93 ++++++++++++++++++++++++++++-------------- 5 files changed, 77 insertions(+), 35 deletions(-)