@@ -587,6 +587,12 @@ struct zone {
bool contiguous;
+	/*
+	 * Per-order trend data used for predicting this zone's
+	 * memory consumption
+	 */
+ struct lsq_struct mem_prediction[MAX_ORDER];
+
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
@@ -611,6 +617,9 @@ enum zone_flags {
ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks.
* Cleared when kswapd is woken.
*/
+	ZONE_POTENTIAL_FRAG,		/* zone predicted to have a potential
+					 * external fragmentation event.
+					 */
};
extern int mem_predict(struct frag_info *frag_vec, struct zone *zone);
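The struct frag_info and struct lsq_struct types and the MEMPREDICT_* flags used below are defined elsewhere in this series and do not appear in this diff. A minimal sketch of plausible definitions, inferred from how zone_trend_analysis() fills frag_vec and from the lsq_ (least-squares) prefix; everything beyond the two frag_info member names is an assumption:

/* Illustrative sketch only -- not the series' actual definitions. */
#define MEMPREDICT_RECLAIM	0x01	/* low-memory event predicted */
#define MEMPREDICT_COMPACT	0x02	/* fragmentation event predicted */

struct frag_info {
	unsigned long free_pages;	/* cumulative free pages in blocks
					 * below this order; [0] holds the
					 * zone-wide total */
	unsigned long time;		/* sample timestamp, in msecs */
};

#define LSQ_LOOKBACK	8		/* assumed sample-window size */

/* Assumed per-order running state for a least-squares line fit */
struct lsq_struct {
	int next;			/* next sample slot to fill */
	bool ready;			/* window full, fit is usable */
	u64 x[LSQ_LOOKBACK];		/* sample times */
	unsigned long y[LSQ_LOOKBACK];	/* free-page counts */
};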
@@ -1130,6 +1139,35 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
#define for_each_zone_zonelist(zone, z, zlist, highidx) \
for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
+extern int watermark_boost_factor;
+
+static inline void boost_watermark(struct zone *zone)
+{
+ unsigned long max_boost;
+
+ if (!watermark_boost_factor)
+ return;
+
+ max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
+ watermark_boost_factor, 10000);
+
+ /*
+ * high watermark may be uninitialised if fragmentation occurs
+ * very early in boot so do not boost. We do not fall
+ * through and boost by pageblock_nr_pages as failing
+ * allocations that early means that reclaim is not going
+ * to help and it may even be impossible to reclaim the
+ * boosted watermark resulting in a hang.
+ */
+ if (!max_boost)
+ return;
+
+ max_boost = max(pageblock_nr_pages, max_boost);
+
+ zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
+ max_boost);
+}
+
#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
#endif
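As a worked example of the cap computed above (the watermark is hypothetical, but 15000 is the kernel's default watermark_boost_factor):

/*
 * With watermark_boost_factor = 15000 and a high watermark of
 * 10000 pages:
 *
 *	max_boost = mult_frac(10000, 15000, 10000) = 15000 pages
 *
 * Each boost_watermark() call then raises watermark_boost by
 * pageblock_nr_pages (512 pages with 2MB pageblocks on x86-64)
 * until it saturates at 150% of the high watermark.
 */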
@@ -2351,33 +2351,6 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
return false;
}
-static inline void boost_watermark(struct zone *zone)
-{
- unsigned long max_boost;
-
- if (!watermark_boost_factor)
- return;
-
- max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
- watermark_boost_factor, 10000);
-
- /*
- * high watermark may be uninitialised if fragmentation occurs
- * very early in boot so do not boost. We do not fall
- * through and boost by pageblock_nr_pages as failing
- * allocations that early means that reclaim is not going
- * to help and it may even be impossible to reclaim the
- * boosted watermark resulting in a hang.
- */
- if (!max_boost)
- return;
-
- max_boost = max(pageblock_nr_pages, max_boost);
-
- zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
- max_boost);
-}
-
/*
* This function implements actual steal behaviour. If order is large enough,
* we can steal whole pageblock. If not, we first move freepages in this
@@ -51,6 +51,7 @@
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>
+#include <linux/jiffies.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
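For orientation before the vmscan.c hunks below: mem_predict() itself is not in this diff. Judging by the lsq_struct name, it presumably fits a least-squares line to each order's (time, free_pages) samples and extrapolates the trend. A sketch of the slope computation under that assumption, reusing the hypothetical lsq_struct fields from the earlier sketch (none of these names are the patchset's):

/*
 * Illustrative sketch only.  Ordinary least-squares slope of
 * free_pages (y) against time (x) over a fixed window:
 *
 *	m = (n*sum(xy) - sum(x)*sum(y)) / (n*sum(xx) - sum(x)^2)
 *
 * A falling order-0 line (total free pages) extrapolated below the
 * high watermark within the lookahead would yield MEMPREDICT_RECLAIM;
 * a rising line at a high order (pages fragmented below that order)
 * would yield MEMPREDICT_COMPACT.
 */
static bool lsq_slope(struct lsq_struct *lsq, s64 *m_num, s64 *m_den)
{
	s64 sx = 0, sy = 0, sxy = 0, sxx = 0;
	int i;

	if (!lsq->ready)		/* sample window not yet full */
		return false;

	for (i = 0; i < LSQ_LOOKBACK; i++) {
		sx  += lsq->x[i];
		sy  += lsq->y[i];
		sxy += (s64)lsq->x[i] * lsq->y[i];
		sxx += (s64)lsq->x[i] * lsq->x[i];
	}

	/* Return the slope as a fraction to avoid 64-bit division */
	*m_num = (s64)LSQ_LOOKBACK * sxy - sx * sy;
	*m_den = (s64)LSQ_LOOKBACK * sxx - sx * sx;
	return *m_den != 0;
}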
@@ -3397,14 +3398,82 @@ static void clear_pgdat_congested(pg_data_t *pgdat)
clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
}
+/*
+ * Update trend data and perform trend analysis for a zone to foresee
+ * an impending low-memory or severe-fragmentation event.
+ */
+static int zone_trend_analysis(struct zone *zone)
+{
+ struct frag_info frag_vec[MAX_ORDER];
+ int order, result;
+ unsigned long total_free_pages;
+ unsigned long curr_free_pages;
+
+ total_free_pages = frag_vec[0].free_pages = 0;
+ for (order = 0; order < MAX_ORDER; order++) {
+ curr_free_pages = zone->free_area[order].nr_free << order;
+ total_free_pages += curr_free_pages;
+
+ if (order < MAX_ORDER - 1) {
+ frag_vec[order + 1].free_pages =
+ frag_vec[order].free_pages + curr_free_pages;
+ frag_vec[order + 1].time =
+ jiffies64_to_msecs(get_jiffies_64()
+ - INITIAL_JIFFIES);
+ }
+ }
+ frag_vec[0].free_pages = total_free_pages;
+ frag_vec[0].time = frag_vec[MAX_ORDER - 1].time;
+
+ result = mem_predict(frag_vec, zone);
+
+ return result;
+}
+
+/*
+ * Perform memory-usage trend analysis for each zone in the node to
+ * detect upcoming low-memory or fragmented-memory conditions.
+ */
+static int node_trend_analysis(pg_data_t *pgdat, int classzone_idx)
+{
+ struct zone *zone = NULL;
+ int i, retval = 0;
+
+ for (i = 0; i <= classzone_idx; i++) {
+ int zoneval;
+
+ zone = pgdat->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+		/*
+		 * Check whether trend analysis predicts a potential
+		 * fragmentation or low-memory event in the near future.
+		 */
+ zoneval = zone_trend_analysis(zone);
+ if (zoneval & MEMPREDICT_COMPACT)
+ set_bit(ZONE_POTENTIAL_FRAG, &zone->flags);
+ if (zoneval & MEMPREDICT_RECLAIM)
+ boost_watermark(zone);
+ retval |= zoneval;
+ }
+
+ return retval;
+}
+
/*
* Prepare kswapd for sleeping. This verifies that there are no processes
* waiting in throttle_direct_reclaim() and that watermarks have been met.
+ * It also checks whether an external fragmentation event is predicted
+ * for this node, which could lead to direct reclaim/compaction stalls.
*
* Returns true if kswapd is ready to sleep
*/
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
+ int retval;
+
/*
* The throttled processes are normally woken up in balance_pgdat() as
* soon as allow_direct_reclaim() is true. But there is a potential
@@ -3425,6 +3494,21 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
return true;
+	/*
+	 * Check whether this node is likely to see memory exhaustion
+	 * in the near future. If trend analysis predicts such an
+	 * event, do not allow kswapd to sleep, so that reclamation
+	 * starts now to prevent it. If trend analysis predicts no
+	 * impending memory exhaustion but does predict severe
+	 * fragmentation, return true so that the caller wakes up
+	 * kcompactd.
+	 */
+ retval = node_trend_analysis(pgdat, classzone_idx);
+ if (retval & MEMPREDICT_RECLAIM)
+ return false;
+ if (retval & MEMPREDICT_COMPACT)
+ return true;
+
if (pgdat_balanced(pgdat, order, classzone_idx)) {
clear_pgdat_congested(pgdat);
return true;
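To summarize the new return paths (this assumes the stock kswapd_try_to_sleep() caller, which wakes kcompactd whenever prepare_kswapd_sleep() returns true):

/*
 * MEMPREDICT_RECLAIM set  -> return false: kswapd stays awake and runs
 *	another balance_pgdat() pass; boost_watermark() has already
 *	raised its reclaim target.
 * MEMPREDICT_COMPACT only -> return true: kswapd may go to sleep and
 *	the caller wakes kcompactd; ZONE_POTENTIAL_FRAG also makes the
 *	next balance_pgdat() pass finish with wakeup_kcompactd().
 * neither                 -> fall through to the usual
 *	pgdat_balanced() check below.
 */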
@@ -3498,6 +3582,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
unsigned long nr_boost_reclaim;
unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
bool boosted;
+	bool potential_frag = false;
+ bool need_compact;
struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
@@ -3524,9 +3610,27 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
nr_boost_reclaim += zone->watermark_boost;
zone_boosts[i] = zone->watermark_boost;
+
+		/*
+		 * Check whether any of the zones has been flagged with
+		 * a potential fragmentation event.
+		 */
+ if (test_bit(ZONE_POTENTIAL_FRAG, &zone->flags)) {
+			potential_frag = true;
+ clear_bit(ZONE_POTENTIAL_FRAG, &zone->flags);
+ }
}
boosted = nr_boost_reclaim;
+	/*
+	 * If kswapd was woken because of watermark boosting, or is
+	 * being forced through another balance_pgdat() pass because
+	 * a potential external fragmentation event was detected, run
+	 * compaction after reclaiming some pages. need_compact records
+	 * whether such compaction is required.
+	 */
+ need_compact = boosted || potential_frag;
+
restart:
sc.priority = DEF_PRIORITY;
do {
@@ -3645,7 +3749,6 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
*/
nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
-
/*
* If reclaim made no progress for a boost, stop reclaim as
* IO cannot be queued and it could be an infinite loop in
@@ -3676,13 +3779,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
spin_unlock_irqrestore(&zone->lock, flags);
}
+ }
- /*
- * As there is now likely space, wakeup kcompact to defragment
- * pageblocks.
- */
+ /*
+ * As there is now likely space, wakeup kcompactd to defragment
+ * pageblocks.
+ */
+ if (need_compact)
wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
- }
snapshot_refaults(NULL, pgdat);
__fs_reclaim_release();