@@ -355,6 +355,38 @@ struct per_cpu_nodestat {
#endif /* !__GENERATING_BOUNDS.H */
+/*
+ * Size of lookback window for the free memory exhaustion prediction
+ * algorithm. Keep it below 16 to keep the amount of data manageable.
+ */
+#define LSQ_LOOKBACK 8
+
+/*
+ * How far forward to look when determining if memory exhaustion would
+ * become an issue.
+ */
+extern unsigned long mempredict_threshold;
+
+/*
+ * Structure to keep track of the current values required to compute the
+ * best fit line using the method of least squares
+ */
+struct lsq_struct {
+ bool ready; /* true once the lookback window is full */
+ int next; /* next slot to fill (oldest sample once full) */
+ u64 x[LSQ_LOOKBACK]; /* sample times */
+ unsigned long y[LSQ_LOOKBACK]; /* free pages seen at those times */
+};
+
+struct frag_info {
+ unsigned long free_pages; /* free pages observed in this sample */
+ unsigned long time; /* time at which the sample was taken */
+};
+
+/* Possible bits to be set by mem_predict in its return value */
+#define MEMPREDICT_RECLAIM 0x01
+#define MEMPREDICT_COMPACT 0x02
+
enum zone_type {
#ifdef CONFIG_ZONE_DMA
/*
@@ -581,6 +613,8 @@ enum zone_flags {
*/
};
+extern int mem_predict(struct frag_info *frag_vec, struct zone *zone);
+
static inline unsigned long zone_managed_pages(struct zone *zone)
{
return (unsigned long)atomic_long_read(&zone->managed_pages);
@@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o \
- debug.o gup.o $(mmu-y)
+ debug.o gup.o lsq.o $(mmu-y)
# Give 'page_alloc' its own module-parameter namespace
page-alloc-y := page_alloc.o
new file mode 100644
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * lsq.c: Provide a prediction on whether free memory exhaustion is
+ * imminent or not by using a best fit line based upon method of
+ * least squares. Best fit line is based upon recent historical
+ * data. This historical data forms the lookback window for the
+ * algorithm.
+ *
+ * Author: Robert Harris
+ * Author: Khalid Aziz <khalid.aziz@oracle.com>
+ *
+ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/math64.h>
+
+/*
+ * How far forward to look when determining if fragmentation would
+ * become an issue. The unit for this is the same as the unit for the
+ * x-axis of the graph on which memory utilization samples are plotted.
+ * We start with a default value of 1000 units but can tweak it
+ * dynamically to get better prediction results. With memory data points
+ * gathered at millisecond granularity, this translates to a look-ahead
+ * of 1 second. If the system is 1 second away from severe fragmentation,
+ * start compaction now to avoid direct compaction.
+ */
+unsigned long mempredict_threshold = 1000;
+
+/*
+ * Threshold for the number of free pages that should trigger reclamation,
+ * expressed as a percentage of the total number of managed pages
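+ * (e.g. with the default of 20, a zone with 1,000,000 managed pages is
+ * considered for reclamation once fewer than 200,000 pages remain free)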
+ */
+#define MEMRECLAMATION_THRESHOLD 20
+
+/*
+ * This function inserts the given value into the list of most recently seen
+ * data and returns the parameters, m and c, of a straight line of the form
+ * y = (mx/100) + c that, according to the method of least squares,
+ * fits them best. This implementation looks at just the last few data points
+ * (the lookback window), which allows for a fixed amount of storage for
+ * data points and a nearly fixed time to calculate the best fit line. Using
+ * a line equation of the form y=(mx/100)+c instead of y=mx+c lets us
+ * avoid floating point operations, since m is often fractional.
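+ *
+ * With n = LSQ_LOOKBACK samples, the estimates computed below are the
+ * standard least squares ones, with the slope pre-scaled by 100:
+ *
+ * m = 100 * (n * sigma_xy - sigma_x * sigma_y) /
+ *     (n * sigma_xx - sigma_x * sigma_x)
+ * c = (sigma_y - (m / 100) * sigma_x) / n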
+ */
+static int
+lsq_fit(struct lsq_struct *lsq, unsigned long new_y, u64 new_x,
+ long long *m, long long *c)
+{
+ u64 sigma_x, sigma_y;
+ u64 sigma_xy, sigma_xx;
+ long long slope_divisor;
+ int i, next;
+ u64 x_offset;
+
+ next = lsq->next++;
+ lsq->x[next] = new_x;
+ lsq->y[next] = new_y;
+
+ if (lsq->next == LSQ_LOOKBACK) {
+ lsq->next = 0;
+ /*
+ * Lookback window is full, which means a reasonable
+ * best fit line can be computed. Flag that enough data
+ * is available in the lookback window now.
+ */
+ lsq->ready = true;
+ }
+
+ /*
+ * If the lookback window is not full, do not continue with
+ * computing the slope and intercept of the best fit line.
+ */
+ if (!lsq->ready)
+ return -1;
+
+ /*
+ * The lookback window is full, so compute the slope and intercept
+ * for the best fit line. In the process we need to compute squares
+ * of the values along the x-axis, and those squares can overflow
+ * 64 bits if the values are large. To avoid this, translate the
+ * points along the x-axis so the first point falls at x=0. Since
+ * lsq->x is a circular buffer, lsq->next points to the oldest entry
+ * in this buffer.
+ */
+ x_offset = lsq->x[lsq->next];
+ for (i = 0; i < LSQ_LOOKBACK; i++)
+ lsq->x[i] -= x_offset;
+
+ /*
+ * Compute the sums needed for the slope and intercept of
+ * the best fit line
+ */
+ sigma_x = sigma_y = sigma_xy = sigma_xx = 0;
+ for (i = 0; i < LSQ_LOOKBACK; i++) {
+ sigma_x += lsq->x[i];
+ sigma_y += lsq->y[i];
+ sigma_xy += (lsq->x[i] * lsq->y[i]);
+ sigma_xx += (lsq->x[i] * lsq->x[i]);
+ }
+
+ /*
+ * guard against divide-by-zero
+ */
+ slope_divisor = LSQ_LOOKBACK * sigma_xx - sigma_x * sigma_x;
+ if (slope_divisor == 0)
+ return -1;
+ *m = div64_s64(((LSQ_LOOKBACK * sigma_xy - sigma_x * sigma_y) * 100),
+ slope_divisor);
+
+ /*
+  * *m is scaled by 100, so scale it back before computing the
+  * intercept of y = (mx/100) + c.
+  */
+ *c = div64_long(sigma_y - div64_s64(*m * sigma_x, 100), LSQ_LOOKBACK);
+
+ /*
+ * Restore original values for x-axis
+ */
+ for (i = 0; i < LSQ_LOOKBACK; ++i)
+ lsq->x[i] += x_offset;
+
+ return 0;
+}
+
+/*
+ * This function determines whether it is necessary to begin
+ * reclamation/compaction now in order to avert exhaustion of any of the
+ * free lists.
+ *
+ * Its basis is a simple model in which the total free memory, f_T, is
+ * consumed at a constant rate, R_T, i.e.
+ *
+ * f_T(t) = R_T * t + f_T(0)
+ *
+ * For any given order, o > 0, members of subordinate lists constitute
+ * fragmented free memory, f_f(o): the blocks are notionally free but
+ * they are unavailable for allocation. The fragmented free memory is
+ * also assumed to behave linearly and in the absence of compaction is
+ * given by
+ *
+ * f_f(o, t) = R_f(o) * t + f_f(o, 0)
+ *
+ * For order 0, the fitted line instead represents the current trend
+ * for total free pages.
+ *
+ * It is assumed that all allocations will be made from contiguous
+ * memory, meaning that, under net memory pressure and with no change in
+ * fragmentation, f_T will become equal to f_f and subsequent allocations
+ * will stall in either direct compaction or reclaim. Preemptive compaction
+ * will delay the onset of exhaustion but, to be useful, must begin early
+ * enough and must proceed at a sufficient rate.
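+ *
+ * Under this model, order o allocations are projected to begin stalling
+ * at the time t_e at which f_T(t_e) = f_f(o, t_e), i.e. (provided the
+ * two lines are not parallel)
+ *
+ * t_e = (f_f(o, 0) - f_T(0)) / (R_T - R_f(o))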
+ *
+ * On each invocation, this function obtains estimates for the
+ * parameters f_T(0), R_T, f_f(o, 0) and R_f(o). Using the best fit
+ * lines, it then determines whether reclamation or compaction should be
+ * started now to avert free page exhaustion or severe fragmentation.
+ * The return value is a set of bits representing which conditions have
+ * been observed: potential free memory exhaustion and potential severe
+ * fragmentation.
+ */
+int mem_predict(struct frag_info *frag_vec, struct zone *zone)
+{
+ int order, retval = 0;
+ long long m[MAX_ORDER];
+ long long c[MAX_ORDER];
+ bool is_ready = true;
+ long long x_cross;
+ struct lsq_struct *lsq = zone->mem_prediction;
+
+ /*
+ * Compute the trend line for each page order. For order 0
+ * pages, the trend line shows the rate of consumption of free
+ * pages. For higher order pages, the trend line shows the
+ * loss/gain of pages of that order. When the trend line for,
+ * say, order n pages intersects the trend line for total free
+ * pages, all available pages are of order (n-1) or lower, i.e.
+ * order n pages are 100% fragmented. The kernel must compact
+ * pages at this point to gain new order n pages.
+ */
+ for (order = 0; order < MAX_ORDER; order++) {
+ if (lsq_fit(&lsq[order], frag_vec[order].free_pages,
+ frag_vec[order].time, &m[order],
+ &c[order]) == -1)
+ is_ready = false;
+ }
+
+ if (!is_ready)
+ return 0;
+
+ /*
+ * A trend line for each page order is available now. If the trend
+ * line for overall free pages is heading upwards (positive
+ * slope), there is no need to reclaim pages, but there may be a
+ * need to compact pages if the system is running out of
+ * contiguous higher order pages.
+ */
+ if (m[0] >= 0) {
+ for (order = 1; order < MAX_ORDER; order++) {
+ /*
+ * If lines are parallel, then they never intersect.
+ */
+ if (m[0] == m[order])
+ continue;
+ /*
+ * Find the point of intersection of the two lines.
+ * The point of intersection represents 100%
+ * fragmentation for this order.
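+ * Setting m[0] * x / 100 + c[0] equal to
+ * m[order] * x / 100 + c[order] and solving
+ * for x gives the value computed below:
+ * x_cross = 100 * (c[0] - c[order]) /
+ * (m[order] - m[0]).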
+ */
+ x_cross = div64_s64(((c[0] - c[order]) * 100),
+ (m[order] - m[0]));
+
+ /*
+ * If they intersect any time soon in the future
+ * or intersected recently in the past, then it
+ * is time for compaction and there is no need
+ * to continue evaluating the remaining orders.
+ *
+ * TODO: Instead of a fixed time threshold,
+ * track compaction rate on the system and compute
+ * how soon should compaction be started with the
+ * current compaction rate to avoid direct
+ * compaction
+ */
+ if ((x_cross < mempredict_threshold) &&
+ (x_cross > -mempredict_threshold)) {
+ retval |= MEMPREDICT_COMPACT;
+ return retval;
+ }
+ }
+ } else {
+ unsigned long threshold;
+
+ /*
+ * The trend line for overall free pages is showing a
+ * negative trend. Check if fewer than threshold
+ * pages are free. If so, start reclamation now to stave
+ * off memory exhaustion.
+ *
+ * TODO: This is not the best way to use trend analysis.
+ * The right way to determine if it is time to start
+ * reclamation to avoid memory exhaustion is to compute
+ * how far away exhaustion is (the least squares fit
+ * line can provide that) and what the average rate of
+ * memory reclamation is. Using those two quantities, compute how
+ * far in advance of exhaustion should reclamation be
+ * started to avoid exhaustion. This can be done after
+ * additional code has been added to keep track of current
+ * rate of reclamation.
+ */
+ threshold = (zone_managed_pages(zone)*MEMRECLAMATION_THRESHOLD)
+ /100;
+ if (frag_vec[0].free_pages < threshold)
+ retval |= MEMPREDICT_RECLAIM;
+ }
+
+ return retval;
+}