@@ -391,6 +391,7 @@ static inline void lru_cache_enable(void)
}
extern void lru_cache_disable(void);
+extern void maybe_lru_add_drain(void);
extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_cpu_zone(struct zone *zone);
@@ -1919,7 +1919,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
struct mmu_notifier_range range;
struct mmu_gather tlb;
- lru_add_drain();
+ maybe_lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
hugetlb_zap_begin(vma, &range.start, &range.end);
@@ -777,6 +777,24 @@ void lru_add_drain(void)
mlock_drain_local();
}
+/*
+ * should_lru_add_drain - cheap check whether an LRU drain is worthwhile.
+ *
+ * Adds up the folios queued in the lru_add, lru_deactivate,
+ * lru_deactivate_file and lru_lazyfree batches of the CPU we are currently
+ * running on, and returns true only when that total exceeds
+ * SWAP_CLUSTER_MAX.
+ *
+ * The result is purely a heuristic, and callers are not known to have
+ * preemption disabled, so the batches are looked up with raw_cpu_ptr():
+ * this_cpu_ptr() would trigger the CONFIG_DEBUG_PREEMPT
+ * smp_processor_id() check when used from preemptible context.  If the
+ * task migrates while counting, the total is merely stale, which at worst
+ * leads to one unnecessary or one skipped drain.
+ */
+static bool should_lru_add_drain(void)
+{
+	struct cpu_fbatches *fbatches = raw_cpu_ptr(&cpu_fbatches);
+	unsigned int pending;
+
+	pending = folio_batch_count(&fbatches->lru_add);
+	pending += folio_batch_count(&fbatches->lru_deactivate);
+	pending += folio_batch_count(&fbatches->lru_deactivate_file);
+	pending += folio_batch_count(&fbatches->lru_lazyfree);
+
+	/* Draining takes locks; skip it unless more than a cluster is queued. */
+	return pending > SWAP_CLUSTER_MAX;
+}
+
+/*
+ * maybe_lru_add_drain - drain the local LRU batches only when it pays off.
+ *
+ * Calls lru_add_drain() if should_lru_add_drain() reports that enough
+ * folios are pending on this CPU; otherwise returns without draining
+ * anything.  Callers must therefore tolerate folios that are still sitting
+ * in the per-CPU batches afterwards.
+ */
+void maybe_lru_add_drain(void)
+{
+	if (!should_lru_add_drain())
+		return;
+
+	lru_add_drain();
+}
+
/*
* It's called from per-cpu workqueue context in SMP case so
* lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on
@@ -317,7 +317,7 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
struct folio_batch folios;
unsigned int refs[PAGEVEC_SIZE];
- lru_add_drain();
+ maybe_lru_add_drain();
folio_batch_init(&folios);
for (int i = 0; i < nr; i++) {
struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
The lru_add_drain() call in zap_page_range_single() always takes some locks,
and will drain the buffers even when there is only a single page pending.

We probably don't need to do that, since we already deal fine with
zap_page_range encountering pages that are still in the buffers of other CPUs.

Add maybe_lru_add_drain(), which only drains when more than SWAP_CLUSTER_MAX
folios are pending in the current CPU's batches, and use it in
zap_page_range_single() and free_pages_and_swap_cache().

On an AMD Milan CPU, the performance of the will-it-scale tlb_flush2_threads
test with 36 threads (one for each core) increases from 526k to 730k loops per
second.

The overhead in this case was on the lruvec locks, taking the lock to flush a
single page. There may be other spots where this variant could be appropriate.

Signed-off-by: Rik van Riel <riel@surriel.com>
---
 include/linux/swap.h |  1 +
 mm/memory.c          |  2 +-
 mm/swap.c            | 18 ++++++++++++++++++
 mm/swap_state.c      |  2 +-
 4 files changed, 21 insertions(+), 2 deletions(-)