@@ -918,6 +918,15 @@ typedef struct pglist_data {
unsigned int nbp_rl_start;
/* number of promote candidate pages at start time of current rate limit period */
unsigned long nbp_rl_nr_cand;
+ /* promote threshold in ms */
+ unsigned int nbp_threshold;
+ /* start time in ms of current promote threshold adjustment period */
+ unsigned int nbp_th_start;
+ /*
+ * number of promote candidate pages at stat time of current promote
+ * threshold adjustment period
+ */
+ unsigned long nbp_th_nr_cand;
#endif
/* Fields commonly accessed by the page reclaim scanner */
@@ -4361,6 +4361,17 @@ void set_numabalancing_state(bool enabled)
}
#ifdef CONFIG_PROC_SYSCTL
+static void reset_memory_tiering(void)
+{
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat) {
+ pgdat->nbp_threshold = 0;
+ pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
+ }
+}
+
int sysctl_numa_balancing(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -4377,6 +4388,9 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
if (err < 0)
return err;
if (write) {
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ (state & NUMA_BALANCING_MEMORY_TIERING))
+ reset_memory_tiering();
sysctl_numa_balancing_mode = state;
__set_numabalancing_state(state);
}
@@ -1503,6 +1503,35 @@ static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
return false;
}
+#define NUMA_MIGRATION_ADJUST_STEPS 16
+
+static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
+ unsigned long rate_limit,
+ unsigned int ref_th)
+{
+ unsigned int now, start, th_period, unit_th, th;
+ unsigned long nr_cand, ref_cand, diff_cand;
+
+ now = jiffies_to_msecs(jiffies);
+ th_period = sysctl_numa_balancing_scan_period_max;
+ start = pgdat->nbp_th_start;
+ if (now - start > th_period &&
+ cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
+ ref_cand = rate_limit *
+ sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
+ unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
+ th = pgdat->nbp_threshold ? : ref_th;
+ if (diff_cand > ref_cand * 11 / 10)
+ th = max(th - unit_th, unit_th);
+ else if (diff_cand < ref_cand * 9 / 10)
+ th = min(th + unit_th, ref_th * 2);
+ pgdat->nbp_th_nr_cand = nr_cand;
+ pgdat->nbp_threshold = th;
+ }
+}
+
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
@@ -1517,19 +1546,26 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
!node_is_toptier(src_nid)) {
struct pglist_data *pgdat;
- unsigned long rate_limit, latency, th;
+ unsigned long rate_limit;
+ unsigned int latency, th, def_th;
pgdat = NODE_DATA(dst_nid);
- if (pgdat_free_space_enough(pgdat))
+ if (pgdat_free_space_enough(pgdat)) {
+ /* workload changed, reset hot threshold */
+ pgdat->nbp_threshold = 0;
return true;
+ }
+
+ def_th = sysctl_numa_balancing_hot_threshold;
+ rate_limit = sysctl_numa_balancing_promote_rate_limit << \
+ (20 - PAGE_SHIFT);
+ numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
- th = sysctl_numa_balancing_hot_threshold;
+ th = pgdat->nbp_threshold ? : def_th;
latency = numa_hint_fault_latency(page);
if (latency >= th)
return false;
- rate_limit = sysctl_numa_balancing_promote_rate_limit << \
- (20 - PAGE_SHIFT);
return !numa_promotion_rate_limit(pgdat, rate_limit,
thp_nr_pages(page));
}