@@ -81,7 +81,7 @@ struct shrinker {
int id;
#endif
/* objs pending delete, per node */
- atomic_long_t *nr_deferred;
+ atomic64_t *nr_deferred;
};
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
@@ -516,16 +516,16 @@ static int64_t shrink_scan_count(struct shrink_control *shrinkctl,
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
struct shrinker *shrinker, int priority)
{
- unsigned long freed = 0;
- long total_scan;
+ uint64_t freed = 0;
int64_t freeable_objects = 0;
int64_t scan_count;
- long nr;
- long new_nr;
+ int64_t scanned_objects = 0;
+ int64_t next_deferred = 0;
+ int64_t deferred_count = 0;
+ int64_t new_nr;
int nid = shrinkctl->nid;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
- long scanned = 0, next_deferred;
if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
nid = 0;
@@ -536,47 +536,51 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
return scan_count;
/*
- * copy the current shrinker scan count into a local variable
- * and zero it so that other concurrent shrinker invocations
- * don't also do this scanning work.
+ * If kswapd, we take all the deferred work and do it here. We don't let
+ * direct reclaim do this, because then it means some poor sod is going
+ * to have to do somebody else's GFP_NOFS reclaim, and it hides the real
+ * amount of reclaim work from concurrent kswapd operations. Hence we do
+ * the work in the wrong place, at the wrong time, and it's largely
+ * unpredictable.
+ *
+ * By doing the deferred work only in kswapd, we can schedule the work
+ * according to the reclaim priority - low priority reclaim will do
+ * less deferred work, hence we'll do more of the deferred work the more
+ * desperate we become for free memory. This removes the need to
+ * specifically guard against deferred work windup, as low amounts of
+ * memory pressure won't excessively trim caches anymore.
*/
- nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+ if (current_is_kswapd()) {
+ int64_t deferred_scan;
- total_scan = nr + scan_count;
- if (total_scan < 0) {
- pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
- shrinker->scan_objects, total_scan);
- total_scan = scan_count;
- next_deferred = nr;
- } else
- next_deferred = total_scan;
+ deferred_count = atomic64_xchg(&shrinker->nr_deferred[nid], 0);
- /*
- * We need to avoid excessive windup on filesystem shrinkers
- * due to large numbers of GFP_NOFS allocations causing the
- * shrinkers to return -1 all the time. This results in a large
- * nr being built up so when a shrink that can do some work
- * comes along it empties the entire cache due to nr >>>
- * freeable. This is bad for sustaining a working set in
- * memory.
- *
- * Hence only allow the shrinker to scan the entire cache when
- * a large delta change is calculated directly.
- */
- if (scan_count < freeable_objects / 4)
- total_scan = min_t(long, total_scan, freeable_objects / 2);
+ /* we want to scan 5-10% of the deferred work here at minimum */
+ deferred_scan = deferred_count;
+ if (priority)
+ do_div(deferred_scan, priority);
+ scan_count += deferred_scan;
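+ /* e.g. ~8% of the deferred work at DEF_PRIORITY (12), all of it at priority 1 */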
+
+ /*
+ * If there is more deferred work than the number of freeable
+ * items in the cache, limit the amount of work we will carry
+ * over to the next kswapd run on this cache. This prevents
+ * deferred work windup.
+ */
+ deferred_count = min(deferred_count, freeable_objects * 2);
+
+ }
/*
* Avoid risking looping forever due to too large nr value:
* never try to free more than twice the estimate number of
* freeable entries.
*/
- if (total_scan > freeable_objects * 2)
- total_scan = freeable_objects * 2;
+ scan_count = min(scan_count, freeable_objects * 2);
- trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+ trace_mm_shrink_slab_start(shrinker, shrinkctl, deferred_count,
freeable_objects, scan_count,
- total_scan, priority);
+ scan_count, priority);
/*
* If the shrinker can't run (e.g. due to gfp_mask constraints), then
@@ -600,10 +604,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
* scanning at high prio and therefore should try to reclaim as much as
* possible.
*/
- while (total_scan >= batch_size ||
- total_scan >= freeable_objects) {
+ while (scan_count >= batch_size ||
+ scan_count >= freeable_objects) {
unsigned long ret;
- unsigned long nr_to_scan = min(batch_size, total_scan);
+ unsigned long nr_to_scan = min_t(long, batch_size, scan_count);
shrinkctl->nr_to_scan = nr_to_scan;
shrinkctl->nr_scanned = nr_to_scan;
@@ -613,29 +617,29 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
freed += ret;
count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
- total_scan -= shrinkctl->nr_scanned;
- scanned += shrinkctl->nr_scanned;
+ scan_count -= shrinkctl->nr_scanned;
+ scanned_objects += shrinkctl->nr_scanned;
cond_resched();
}
-
done:
- if (next_deferred >= scanned)
- next_deferred -= scanned;
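+ /*
+ * kswapd carries over the deferred work it didn't get through;
+ * everyone else carries over their own unscanned remainder.
+ */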
+ if (deferred_count)
+ next_deferred = deferred_count - scanned_objects;
else
- next_deferred = 0;
+ next_deferred = scan_count;
/*
* move the unused scan count back into the shrinker in a
* manner that handles concurrent updates. If we exhausted the
* scan, there is no need to do an update.
*/
if (next_deferred > 0)
- new_nr = atomic_long_add_return(next_deferred,
+ new_nr = atomic64_add_return(next_deferred,
&shrinker->nr_deferred[nid]);
else
- new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+ new_nr = atomic64_read(&shrinker->nr_deferred[nid]);
- trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
+ trace_mm_shrink_slab_end(shrinker, nid, freed, deferred_count, new_nr,
+ scan_count);
return freed;
}
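
To make the two paths above concrete, here is a minimal, userspace-only
sketch of the accounting the new do_shrink_slab() performs. It is not kernel
code: plain int64_t variables stand in for the per-node atomic64_t counter,
fake_scan() is an invented stand-in for the ->scan_objects() callback,
SHRINK_BATCH and DEF_PRIORITY reuse the kernel's default values (128 and 12),
and main() simulates one GFP_NOFS deferral followed by a low-priority and
then a desperate kswapd pass. Only the priority division, the two clamps and
the next_deferred split mirror the hunk.

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SHRINK_BATCH	128	/* kernel default batch size */
#define DEF_PRIORITY	12	/* kernel default reclaim priority */

/* stands in for the per-node shrinker->nr_deferred[nid] counter */
static int64_t nr_deferred;

/* invented stand-in for ->scan_objects(): half of what is scanned gets freed */
static int64_t fake_scan(int64_t nr_to_scan)
{
	return nr_to_scan / 2;
}

static int64_t shrink_pass(bool is_kswapd, int priority,
			   int64_t scan_count, int64_t freeable_objects)
{
	int64_t deferred_count = 0;
	int64_t scanned_objects = 0;
	int64_t next_deferred;
	int64_t freed = 0;

	/* the real code bails out earlier when there is nothing freeable */
	if (freeable_objects <= 0)
		return 0;

	if (is_kswapd) {
		int64_t deferred_scan;

		/* take the backlog and add ~1/priority of it to this pass */
		deferred_count = nr_deferred;
		nr_deferred = 0;

		deferred_scan = deferred_count;
		if (priority)
			deferred_scan /= priority;	/* do_div() in the patch */
		scan_count += deferred_scan;

		/* limit the carry-over to prevent windup */
		if (deferred_count > freeable_objects * 2)
			deferred_count = freeable_objects * 2;
	}

	if (scan_count > freeable_objects * 2)
		scan_count = freeable_objects * 2;

	while (scan_count >= SHRINK_BATCH || scan_count >= freeable_objects) {
		int64_t nr_to_scan = scan_count < SHRINK_BATCH ?
					scan_count : SHRINK_BATCH;

		freed += fake_scan(nr_to_scan);
		scan_count -= nr_to_scan;
		scanned_objects += nr_to_scan;
	}

	/* kswapd carries over unfinished deferred work, others their remainder */
	if (deferred_count)
		next_deferred = deferred_count - scanned_objects;
	else
		next_deferred = scan_count;

	if (next_deferred > 0)
		nr_deferred += next_deferred;

	return freed;
}

int main(void)
{
	int64_t freed;

	/* a GFP_NOFS direct-reclaim pass that couldn't run defers its work */
	nr_deferred += 10000;

	/* low-pressure kswapd pass picks up ~1/12 of the backlog */
	freed = shrink_pass(true, DEF_PRIORITY, 1000, 50000);
	printf("freed %" PRId64 ", deferred now %" PRId64 "\n", freed, nr_deferred);

	/* desperate kswapd pass pulls in everything that is left */
	freed = shrink_pass(true, 1, 1000, 50000);
	printf("freed %" PRId64 ", deferred now %" PRId64 "\n", freed, nr_deferred);

	return 0;
}

Running it shows the low-priority pass shrinking the backlog only by what it
actually scanned, while the priority-1 pass drains it completely; direct
reclaim never takes from the backlog at all, it only ever adds to it.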