@@ -154,7 +154,6 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
}
long congestion_wait(int sync, long timeout);
-long wait_iff_congested(int sync, long timeout);
static inline bool mapping_can_writeback(struct address_space *mapping)
{
@@ -199,6 +199,7 @@ enum node_stat_item {
NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
NR_DIRTIED, /* page dirtyings since bootup */
NR_WRITTEN, /* page writings since bootup */
+ NR_THROTTLED_WRITTEN, /* NR_WRITTEN while reclaim throttled */
NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */
NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */
NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */
@@ -272,6 +273,10 @@ enum lru_list {
NR_LRU_LISTS
};
+enum vmscan_throttle_state {
+ VMSCAN_THROTTLE_WRITEBACK,
+};
+
#define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)
#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
@@ -841,6 +846,10 @@ typedef struct pglist_data {
int node_id;
wait_queue_head_t kswapd_wait;
wait_queue_head_t pfmemalloc_wait;
+ wait_queue_head_t reclaim_wait; /* wq for throttling reclaim */
+ atomic_t nr_reclaim_throttled; /* nr of throttled tasks */
+ unsigned long nr_reclaim_start; /* NR_THROTTLED_WRITTEN count sampled
+ * when throttling started. */
struct task_struct *kswapd; /* Protected by
mem_hotplug_begin/end() */
int kswapd_order;
@@ -27,6 +27,14 @@
{RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \
) : "RECLAIM_WB_NONE"
+#define _VMSCAN_THROTTLE_WRITEBACK (1 << VMSCAN_THROTTLE_WRITEBACK)
+
+#define show_throttle_flags(flags) \
+ (flags) ? __print_flags(flags, "|", \
+ {_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"} \
+ ) : "VMSCAN_THROTTLE_NONE"
+
+
#define trace_reclaim_flags(file) ( \
(file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
(RECLAIM_WB_ASYNC) \
@@ -454,6 +462,32 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end,
TP_ARGS(nr_reclaimed)
);
+TRACE_EVENT(mm_vmscan_throttled,
+
+ TP_PROTO(int nid, int usec_timeout, int usec_delayed, int reason),
+
+ TP_ARGS(nid, usec_timeout, usec_delayed, reason),
+
+ TP_STRUCT__entry(
+ __field(int, nid)
+ __field(int, usec_timeout)
+ __field(int, usec_delayed)
+ __field(int, reason)
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ __entry->usec_timeout = usec_timeout;
+ __entry->usec_delayed = usec_delayed;
+ __entry->reason = 1U << reason;
+ ),
+
+ TP_printk("nid=%d usec_timeout=%d usec_delayed=%d reason=%s",
+ __entry->nid,
+ __entry->usec_timeout,
+ __entry->usec_delayed,
+ show_throttle_flags(__entry->reason))
+);
#endif /* _TRACE_VMSCAN_H */
/* This part must be outside protection */
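
For reference, a minimal sketch of what show_throttle_flags() reduces to for the single reason defined so far; throttle_reason_str() is a hypothetical helper, not something this patch adds. The tracepoint stores 1U << reason, presumably so that additional throttle reasons can later be printed as a flag list by __print_flags().

/* Illustrative only: equivalent decoding of the stored reason bits. */
static const char *throttle_reason_str(unsigned int flags)
{
	if (flags & _VMSCAN_THROTTLE_WRITEBACK)
		return "VMSCAN_THROTTLE_WRITEBACK";
	return "VMSCAN_THROTTLE_NONE";
}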
@@ -763,13 +763,6 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait,
TP_ARGS(usec_timeout, usec_delayed)
);
-DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,
-
- TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
-
- TP_ARGS(usec_timeout, usec_delayed)
-);
-
DECLARE_EVENT_CLASS(writeback_single_inode_template,
TP_PROTO(struct inode *inode,
@@ -1041,51 +1041,3 @@ long congestion_wait(int sync, long timeout)
return ret;
}
EXPORT_SYMBOL(congestion_wait);
-
-/**
- * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
- * @sync: SYNC or ASYNC IO
- * @timeout: timeout in jiffies
- *
- * In the event of a congested backing_dev (any backing_dev) this waits
- * for up to @timeout jiffies for either a BDI to exit congestion of the
- * given @sync queue or a write to complete.
- *
- * The return value is 0 if the sleep is for the full timeout. Otherwise,
- * it is the number of jiffies that were still remaining when the function
- * returned. return_value == timeout implies the function did not sleep.
- */
-long wait_iff_congested(int sync, long timeout)
-{
- long ret;
- unsigned long start = jiffies;
- DEFINE_WAIT(wait);
- wait_queue_head_t *wqh = &congestion_wqh[sync];
-
- /*
- * If there is no congestion, yield if necessary instead
- * of sleeping on the congestion queue
- */
- if (atomic_read(&nr_wb_congested[sync]) == 0) {
- cond_resched();
-
- /* In case we scheduled, work out time remaining */
- ret = timeout - (jiffies - start);
- if (ret < 0)
- ret = 0;
-
- goto out;
- }
-
- /* Sleep until uncongested or a write happens */
- prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
- ret = io_schedule_timeout(timeout);
- finish_wait(wqh, &wait);
-
-out:
- trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
- jiffies_to_usecs(jiffies - start));
-
- return ret;
-}
-EXPORT_SYMBOL(wait_iff_congested);
@@ -1605,6 +1605,7 @@ void end_page_writeback(struct page *page)
smp_mb__after_atomic();
wake_up_page(page, PG_writeback);
+ acct_reclaim_writeback(page);
put_page(page);
}
EXPORT_SYMBOL(end_page_writeback);
@@ -34,6 +34,17 @@
void page_writeback_init(void);
+void __acct_reclaim_writeback(pg_data_t *pgdat, struct page *page,
+ int nr_throttled);
+static inline void acct_reclaim_writeback(struct page *page)
+{
+ pg_data_t *pgdat = page_pgdat(page);
+ int nr_throttled = atomic_read(&pgdat->nr_reclaim_throttled);
+
+ if (nr_throttled)
+ __acct_reclaim_writeback(pgdat, page, nr_throttled);
+}
+
vm_fault_t do_swap_page(struct vm_fault *vmf);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
@@ -7396,6 +7396,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
+ init_waitqueue_head(&pgdat->reclaim_wait);
pgdat_page_ext_init(pgdat);
lruvec_init(&pgdat->__lruvec);
@@ -1006,6 +1006,64 @@ static void handle_write_error(struct address_space *mapping,
unlock_page(page);
}
+static void
+reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason,
+ long timeout)
+{
+ wait_queue_head_t *wqh = &pgdat->reclaim_wait;
+ long ret;
+ DEFINE_WAIT(wait);
+
+ /*
+ * Do not throttle IO workers, kthreads other than kswapd or
+ * workqueues. They may be required for reclaim to make
+ * forward progress (e.g. journalling workqueues or kthreads).
+ */
+ if (!current_is_kswapd() &&
+ current->flags & (PF_IO_WORKER|PF_KTHREAD))
+ return;
+
+ if (atomic_inc_return(&pgdat->nr_reclaim_throttled) == 1) {
+ WRITE_ONCE(pgdat->nr_reclaim_start,
+ node_page_state(pgdat, NR_THROTTLED_WRITTEN));
+ }
+
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+ atomic_dec(&pgdat->nr_reclaim_throttled);
+
+ trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
+ jiffies_to_usecs(timeout - ret),
+ reason);
+}
+
+/*
+ * Account for pages written if tasks are throttled waiting on dirty
+ * pages to clean. If enough pages have been cleaned since throttling
+ * started then wakeup the throttled tasks.
+ */
+void __acct_reclaim_writeback(pg_data_t *pgdat, struct page *page,
+ int nr_throttled)
+{
+ unsigned long nr_written;
+
+ inc_node_page_state(page, NR_THROTTLED_WRITTEN);
+
+ /*
+ * This is an inaccurate read as the per-cpu deltas may not
+ * be synchronised. However, given that the system is
+ * writeback throttled, it is not worth taking the penalty
+ * of getting an accurate count. At worst, the throttle
+ * timeout guarantees forward progress.
+ */
+ nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
+ READ_ONCE(pgdat->nr_reclaim_start);
+
+ if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
+ wake_up_all(&pgdat->reclaim_wait);
+}
+
/* possible outcome of pageout() */
typedef enum {
/* failed to write page out, page is locked */
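
A worked example of the wake condition in __acct_reclaim_writeback() above, as a standalone sketch (enough_clean_pages_to_wake() is hypothetical, not part of the patch): with SWAP_CLUSTER_MAX == 32 and four tasks sleeping in reclaim_throttle(), wake_up_all() fires only after more than 128 pages have completed writeback since the nr_reclaim_start snapshot was taken.

/* Sketch of the wake threshold; the comparison mirrors the code above. */
static bool enough_clean_pages_to_wake(unsigned long nr_written, int nr_throttled)
{
	return nr_written > SWAP_CLUSTER_MAX * nr_throttled;
}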
@@ -1412,9 +1470,8 @@ static unsigned int shrink_page_list(struct list_head *page_list,
/*
* The number of dirty pages determines if a node is marked
- * reclaim_congested which affects wait_iff_congested. kswapd
- * will stall and start writing pages if the tail of the LRU
- * is all dirty unqueued pages.
+ * reclaim_congested. kswapd will stall and start writing
+ * pages if the tail of the LRU is all dirty unqueued pages.
*/
page_check_dirty_writeback(page, &dirty, &writeback);
if (dirty || writeback)
@@ -3180,19 +3237,19 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
* If kswapd scans pages marked for immediate
* reclaim and under writeback (nr_immediate), it
* implies that pages are cycling through the LRU
- * faster than they are written so also forcibly stall.
+ * faster than they are written so forcibly stall
+ * until some pages complete writeback.
*/
if (sc->nr.immediate)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK, HZ/10);
}
/*
- * Tag a node/memcg as congested if all the dirty pages
- * scanned were backed by a congested BDI and
- * wait_iff_congested will stall.
+ * Tag a node/memcg as congested if all the dirty pages were marked
+ * for writeback and immediate reclaim (counted in nr.congested).
*
* Legacy memcg will stall in page writeback so avoid forcibly
- * stalling in wait_iff_congested().
+ * stalling in reclaim_throttle().
*/
if ((current_is_kswapd() ||
(cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
@@ -3200,15 +3257,15 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
/*
- * Stall direct reclaim for IO completions if underlying BDIs
- * and node is congested. Allow kswapd to continue until it
+ * Stall direct reclaim for IO completions if the lruvec is
+ * congested. Allow kswapd to continue until it
* starts encountering unqueued dirty pages or cycling through
* the LRU too quickly.
*/
if (!current_is_kswapd() && current_may_throttle() &&
!sc->hibernation_mode &&
test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
- wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK, HZ/10);
if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc))
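
Condensed restatement of the direct-reclaim throttling condition above, for readability only; direct_reclaim_should_throttle() is a hypothetical helper and the logic is exactly the test in the hunk: only non-kswapd tasks that may throttle, outside hibernation, and only while the target lruvec is marked LRUVEC_CONGESTED, end up in reclaim_throttle().

/* Illustrative only: the gate that replaces the old wait_iff_congested() call. */
static bool direct_reclaim_should_throttle(struct scan_control *sc,
					   struct lruvec *target_lruvec)
{
	return !current_is_kswapd() && current_may_throttle() &&
	       !sc->hibernation_mode &&
	       test_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
}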
@@ -4286,6 +4343,7 @@ static int kswapd(void *p)
WRITE_ONCE(pgdat->kswapd_order, 0);
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
+ atomic_set(&pgdat->nr_reclaim_throttled, 0);
for ( ; ; ) {
bool ret;
@@ -1225,6 +1225,7 @@ const char * const vmstat_text[] = {
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
"nr_written",
+ "nr_throttled_written",
"nr_kernel_misc_reclaimable",
"nr_foll_pin_acquired",
"nr_foll_pin_released",