Message ID: 20220516164718.2419891-13-shr@fb.com (mailing list archive)
State: New
Series: io-uring/xfs: support async buffered writes
On Mon 16-05-22 09:47:14, Stefan Roesch wrote:
> This factors out the for loop in balance_dirty_pages() into a new
> function called _balance_dirty_pages(). By factoring out this function
> the async write code can determine if it has to wait to throttle writes
> or not. The function _balance_dirty_pages() returns the sleep time.
> If the sleep time is greater than 0, then the async write code needs to throttle.
>
> To maintain the context for consecutive calls of _balance_dirty_pages()
> and maintain the current behavior, a new data structure called bdp_ctx
> has been introduced.
>
> Signed-off-by: Stefan Roesch <shr@fb.com>

...

> ---
>  mm/page-writeback.c | 452 +++++++++++++++++++++++---------------------
>  1 file changed, 239 insertions(+), 213 deletions(-)
>
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 7e2da284e427..cbb74c0666c6 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -140,6 +140,29 @@ struct dirty_throttle_control {
> 	unsigned long		pos_ratio;
> };
>
> +/* context for consecutive calls to _balance_dirty_pages() */
> +struct bdp_ctx {
> +	long pause;
> +	unsigned long now;
> +	unsigned long start_time;
> +	unsigned long task_ratelimit;
> +	unsigned long dirty_ratelimit;
> +	unsigned long nr_reclaimable;
> +	int nr_dirtied_pause;
> +	bool dirty_exceeded;
> +
> +	struct dirty_throttle_control gdtc_stor;
> +	struct dirty_throttle_control mdtc_stor;
> +	struct dirty_throttle_control *sdtc;
> +} bdp_ctx;

Looking at how much context you propagate into _balance_dirty_pages(), I
don't think this suggestion was as great as I had hoped. I'm sorry for
that. We could actually significantly reduce the amount of context passed
in/out, but some things would be difficult to get rid of, and some
interactions between the code in _balance_dirty_pages() and the caller
are actually pretty subtle.

I think something like the attached three patches should make NOWAIT
support in balance_dirty_pages() reasonably readable.

								Honza
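Jan's three attached patches are not reproduced on this page. Purely as a
hedged illustration of the direction he describes, the waiting point inside
balance_dirty_pages() could take a "may not sleep" flag and bail out instead
of blocking; the BDP_ASYNC flag and the helper below are assumed names for
this sketch, not code from the attached patches.

#define BDP_ASYNC	0x0001

/*
 * Illustrative only: called at the point where balance_dirty_pages()
 * has decided it must wait for 'pause' jiffies.
 */
static int bdp_wait_or_bail(struct bdi_writeback *wb, long pause,
			    unsigned int flags)
{
	if (flags & BDP_ASYNC)
		return -EAGAIN;	/* punt; retry from a context that may block */

	__set_current_state(TASK_KILLABLE);
	wb->dirty_sleep = jiffies;
	io_schedule_timeout(pause);
	return 0;
}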
On 5/18/22 4:07 AM, Jan Kara wrote:
> On Mon 16-05-22 09:47:14, Stefan Roesch wrote:
>> This factors out the for loop in balance_dirty_pages() into a new
>> function called _balance_dirty_pages(). By factoring out this function
>> the async write code can determine if it has to wait to throttle writes
>> or not. The function _balance_dirty_pages() returns the sleep time.
>> If the sleep time is greater than 0, then the async write code needs to throttle.
>>
>> To maintain the context for consecutive calls of _balance_dirty_pages()
>> and maintain the current behavior, a new data structure called bdp_ctx
>> has been introduced.
>>
>> Signed-off-by: Stefan Roesch <shr@fb.com>
>
> ...
>
> Looking at how much context you propagate into _balance_dirty_pages(), I
> don't think this suggestion was as great as I had hoped. I'm sorry for
> that. We could actually significantly reduce the amount of context passed
> in/out, but some things would be difficult to get rid of, and some
> interactions between the code in _balance_dirty_pages() and the caller
> are actually pretty subtle.
>
> I think something like the attached three patches should make NOWAIT
> support in balance_dirty_pages() reasonably readable.

Thanks a lot Jan for the three patches. I integrated them in the patch
series and changed the callers accordingly.

> 								Honza
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7e2da284e427..cbb74c0666c6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -140,6 +140,29 @@ struct dirty_throttle_control { unsigned long pos_ratio; }; +/* context for consecutive calls to _balance_dirty_pages() */ +struct bdp_ctx { + long pause; + unsigned long now; + unsigned long start_time; + unsigned long task_ratelimit; + unsigned long dirty_ratelimit; + unsigned long nr_reclaimable; + int nr_dirtied_pause; + bool dirty_exceeded; + + struct dirty_throttle_control gdtc_stor; + struct dirty_throttle_control mdtc_stor; + struct dirty_throttle_control *sdtc; +} bdp_ctx; + +/* initialize _balance_dirty_pages() context */ +#define BDP_CTX_INIT(ctx, wb) \ + .gdtc_stor = { GDTC_INIT(wb) }, \ + .mdtc_stor = { MDTC_INIT(wb, &ctx.gdtc_stor) }, \ + .start_time = jiffies, \ + .dirty_exceeded = false + /* * Length of period for aging writeout fractions of bdis. This is an * arbitrarily chosen number. The longer the period, the slower fractions will @@ -1538,261 +1561,264 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) } } -/* - * balance_dirty_pages() must be called by processes which are generating dirty - * data. It looks at the number of dirty pages in the machine and will force - * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. - * If we're over `background_thresh' then the writeback threads are woken to - * perform some writeout. - */ -static void balance_dirty_pages(struct bdi_writeback *wb, - unsigned long pages_dirtied) +static inline int _balance_dirty_pages(struct bdi_writeback *wb, + unsigned long pages_dirtied, struct bdp_ctx *ctx) { - struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; - struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; - struct dirty_throttle_control * const gdtc = &gdtc_stor; - struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? - &mdtc_stor : NULL; - struct dirty_throttle_control *sdtc; - unsigned long nr_reclaimable; /* = file_dirty */ + struct dirty_throttle_control * const gdtc = &ctx->gdtc_stor; + struct dirty_throttle_control * const mdtc = mdtc_valid(&ctx->mdtc_stor) ? 
+ &ctx->mdtc_stor : NULL; long period; - long pause; long max_pause; long min_pause; - int nr_dirtied_pause; - bool dirty_exceeded = false; - unsigned long task_ratelimit; - unsigned long dirty_ratelimit; struct backing_dev_info *bdi = wb->bdi; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; - unsigned long start_time = jiffies; - for (;;) { - unsigned long now = jiffies; - unsigned long dirty, thresh, bg_thresh; - unsigned long m_dirty = 0; /* stop bogus uninit warnings */ - unsigned long m_thresh = 0; - unsigned long m_bg_thresh = 0; - - nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); - gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); + unsigned long dirty, thresh, bg_thresh; + unsigned long m_dirty = 0; /* stop bogus uninit warnings */ + unsigned long m_thresh = 0; + unsigned long m_bg_thresh = 0; - domain_dirty_limits(gdtc); + ctx->now = jiffies; + ctx->nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); + gdtc->avail = global_dirtyable_memory(); + gdtc->dirty = ctx->nr_reclaimable + global_node_page_state(NR_WRITEBACK); - if (unlikely(strictlimit)) { - wb_dirty_limits(gdtc); + domain_dirty_limits(gdtc); - dirty = gdtc->wb_dirty; - thresh = gdtc->wb_thresh; - bg_thresh = gdtc->wb_bg_thresh; - } else { - dirty = gdtc->dirty; - thresh = gdtc->thresh; - bg_thresh = gdtc->bg_thresh; - } + if (unlikely(strictlimit)) { + wb_dirty_limits(gdtc); - if (mdtc) { - unsigned long filepages, headroom, writeback; + dirty = gdtc->wb_dirty; + thresh = gdtc->wb_thresh; + bg_thresh = gdtc->wb_bg_thresh; + } else { + dirty = gdtc->dirty; + thresh = gdtc->thresh; + bg_thresh = gdtc->bg_thresh; + } - /* - * If @wb belongs to !root memcg, repeat the same - * basic calculations for the memcg domain. - */ - mem_cgroup_wb_stats(wb, &filepages, &headroom, - &mdtc->dirty, &writeback); - mdtc->dirty += writeback; - mdtc_calc_avail(mdtc, filepages, headroom); - - domain_dirty_limits(mdtc); - - if (unlikely(strictlimit)) { - wb_dirty_limits(mdtc); - m_dirty = mdtc->wb_dirty; - m_thresh = mdtc->wb_thresh; - m_bg_thresh = mdtc->wb_bg_thresh; - } else { - m_dirty = mdtc->dirty; - m_thresh = mdtc->thresh; - m_bg_thresh = mdtc->bg_thresh; - } - } + if (mdtc) { + unsigned long filepages, headroom, writeback; /* - * Throttle it only when the background writeback cannot - * catch-up. This avoids (excessively) small writeouts - * when the wb limits are ramping up in case of !strictlimit. - * - * In strictlimit case make decision based on the wb counters - * and limits. Small writeouts when the wb limits are ramping - * up are the price we consciously pay for strictlimit-ing. - * - * If memcg domain is in effect, @dirty should be under - * both global and memcg freerun ceilings. + * If @wb belongs to !root memcg, repeat the same + * basic calculations for the memcg domain. 
*/ - if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && - (!mdtc || - m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { - unsigned long intv; - unsigned long m_intv; + mem_cgroup_wb_stats(wb, &filepages, &headroom, + &mdtc->dirty, &writeback); + mdtc->dirty += writeback; + mdtc_calc_avail(mdtc, filepages, headroom); -free_running: - intv = dirty_poll_interval(dirty, thresh); - m_intv = ULONG_MAX; + domain_dirty_limits(mdtc); - current->dirty_paused_when = now; - current->nr_dirtied = 0; - if (mdtc) - m_intv = dirty_poll_interval(m_dirty, m_thresh); - current->nr_dirtied_pause = min(intv, m_intv); - break; + if (unlikely(strictlimit)) { + wb_dirty_limits(mdtc); + m_dirty = mdtc->wb_dirty; + m_thresh = mdtc->wb_thresh; + m_bg_thresh = mdtc->wb_bg_thresh; + } else { + m_dirty = mdtc->dirty; + m_thresh = mdtc->thresh; + m_bg_thresh = mdtc->bg_thresh; } + } - if (unlikely(!writeback_in_progress(wb))) - wb_start_background_writeback(wb); + /* + * Throttle it only when the background writeback cannot + * catch-up. This avoids (excessively) small writeouts + * when the wb limits are ramping up in case of !strictlimit. + * + * In strictlimit case make decision based on the wb counters + * and limits. Small writeouts when the wb limits are ramping + * up are the price we consciously pay for strictlimit-ing. + * + * If memcg domain is in effect, @dirty should be under + * both global and memcg freerun ceilings. + */ + if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && + (!mdtc || + m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { + unsigned long intv; + unsigned long m_intv; - mem_cgroup_flush_foreign(wb); +free_running: + intv = dirty_poll_interval(dirty, thresh); + m_intv = ULONG_MAX; + + current->dirty_paused_when = ctx->now; + current->nr_dirtied = 0; + if (mdtc) + m_intv = dirty_poll_interval(m_dirty, m_thresh); + current->nr_dirtied_pause = min(intv, m_intv); + return 0; + } + + if (unlikely(!writeback_in_progress(wb))) + wb_start_background_writeback(wb); + mem_cgroup_flush_foreign(wb); + + /* + * Calculate global domain's pos_ratio and select the + * global dtc by default. + */ + if (!strictlimit) { + wb_dirty_limits(gdtc); + + if ((current->flags & PF_LOCAL_THROTTLE) && + gdtc->wb_dirty < + dirty_freerun_ceiling(gdtc->wb_thresh, + gdtc->wb_bg_thresh)) + /* + * LOCAL_THROTTLE tasks must not be throttled + * when below the per-wb freerun ceiling. + */ + goto free_running; + } + + ctx->dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && + ((gdtc->dirty > gdtc->thresh) || strictlimit); + + wb_position_ratio(gdtc); + ctx->sdtc = gdtc; + + if (mdtc) { /* - * Calculate global domain's pos_ratio and select the - * global dtc by default. + * If memcg domain is in effect, calculate its + * pos_ratio. @wb should satisfy constraints from + * both global and memcg domains. Choose the one + * w/ lower pos_ratio. */ if (!strictlimit) { - wb_dirty_limits(gdtc); + wb_dirty_limits(mdtc); if ((current->flags & PF_LOCAL_THROTTLE) && - gdtc->wb_dirty < - dirty_freerun_ceiling(gdtc->wb_thresh, - gdtc->wb_bg_thresh)) + mdtc->wb_dirty < + dirty_freerun_ceiling(mdtc->wb_thresh, + mdtc->wb_bg_thresh)) /* - * LOCAL_THROTTLE tasks must not be throttled - * when below the per-wb freerun ceiling. + * LOCAL_THROTTLE tasks must not be + * throttled when below the per-wb + * freerun ceiling. 
*/ goto free_running; } + ctx->dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && + ((mdtc->dirty > mdtc->thresh) || strictlimit); - dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && - ((gdtc->dirty > gdtc->thresh) || strictlimit); + wb_position_ratio(mdtc); + if (mdtc->pos_ratio < gdtc->pos_ratio) + ctx->sdtc = mdtc; + } - wb_position_ratio(gdtc); - sdtc = gdtc; + if (ctx->dirty_exceeded && !wb->dirty_exceeded) + wb->dirty_exceeded = 1; - if (mdtc) { - /* - * If memcg domain is in effect, calculate its - * pos_ratio. @wb should satisfy constraints from - * both global and memcg domains. Choose the one - * w/ lower pos_ratio. - */ - if (!strictlimit) { - wb_dirty_limits(mdtc); - - if ((current->flags & PF_LOCAL_THROTTLE) && - mdtc->wb_dirty < - dirty_freerun_ceiling(mdtc->wb_thresh, - mdtc->wb_bg_thresh)) - /* - * LOCAL_THROTTLE tasks must not be - * throttled when below the per-wb - * freerun ceiling. - */ - goto free_running; - } - dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && - ((mdtc->dirty > mdtc->thresh) || strictlimit); + if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + + BANDWIDTH_INTERVAL)) + __wb_update_bandwidth(gdtc, mdtc, true); + + /* throttle according to the chosen dtc */ + ctx->dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); + ctx->task_ratelimit = ((u64)ctx->dirty_ratelimit * ctx->sdtc->pos_ratio) >> + RATELIMIT_CALC_SHIFT; + max_pause = wb_max_pause(wb, ctx->sdtc->wb_dirty); + min_pause = wb_min_pause(wb, max_pause, + ctx->task_ratelimit, ctx->dirty_ratelimit, + &ctx->nr_dirtied_pause); + + if (unlikely(ctx->task_ratelimit == 0)) { + period = max_pause; + ctx->pause = max_pause; + goto pause; + } + period = HZ * pages_dirtied / ctx->task_ratelimit; + ctx->pause = period; + if (current->dirty_paused_when) + ctx->pause -= ctx->now - current->dirty_paused_when; + /* + * For less than 1s think time (ext3/4 may block the dirtier + * for up to 800ms from time to time on 1-HDD; so does xfs, + * however at much less frequency), try to compensate it in + * future periods by updating the virtual time; otherwise just + * do a reset, as it may be a light dirtier. 
+ */ + if (ctx->pause < min_pause) { + trace_balance_dirty_pages(wb, + ctx->sdtc->thresh, + ctx->sdtc->bg_thresh, + ctx->sdtc->dirty, + ctx->sdtc->wb_thresh, + ctx->sdtc->wb_dirty, + ctx->dirty_ratelimit, + ctx->task_ratelimit, + pages_dirtied, + period, + min(ctx->pause, 0L), + ctx->start_time); + if (ctx->pause < -HZ) { + current->dirty_paused_when = ctx->now; + current->nr_dirtied = 0; + } else if (period) { + current->dirty_paused_when += period; + current->nr_dirtied = 0; + } else if (current->nr_dirtied_pause <= pages_dirtied) + current->nr_dirtied_pause += pages_dirtied; + return 0; + } + if (unlikely(ctx->pause > max_pause)) { + /* for occasional dropped task_ratelimit */ + ctx->now += min(ctx->pause - max_pause, max_pause); + ctx->pause = max_pause; + } - wb_position_ratio(mdtc); - if (mdtc->pos_ratio < gdtc->pos_ratio) - sdtc = mdtc; - } +pause: + trace_balance_dirty_pages(wb, + ctx->sdtc->thresh, + ctx->sdtc->bg_thresh, + ctx->sdtc->dirty, + ctx->sdtc->wb_thresh, + ctx->sdtc->wb_dirty, + ctx->dirty_ratelimit, + ctx->task_ratelimit, + pages_dirtied, + period, + ctx->pause, + ctx->start_time); + + return ctx->pause; +} - if (dirty_exceeded && !wb->dirty_exceeded) - wb->dirty_exceeded = 1; - - if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + - BANDWIDTH_INTERVAL)) - __wb_update_bandwidth(gdtc, mdtc, true); - - /* throttle according to the chosen dtc */ - dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); - task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> - RATELIMIT_CALC_SHIFT; - max_pause = wb_max_pause(wb, sdtc->wb_dirty); - min_pause = wb_min_pause(wb, max_pause, - task_ratelimit, dirty_ratelimit, - &nr_dirtied_pause); - - if (unlikely(task_ratelimit == 0)) { - period = max_pause; - pause = max_pause; - goto pause; - } - period = HZ * pages_dirtied / task_ratelimit; - pause = period; - if (current->dirty_paused_when) - pause -= now - current->dirty_paused_when; - /* - * For less than 1s think time (ext3/4 may block the dirtier - * for up to 800ms from time to time on 1-HDD; so does xfs, - * however at much less frequency), try to compensate it in - * future periods by updating the virtual time; otherwise just - * do a reset, as it may be a light dirtier. - */ - if (pause < min_pause) { - trace_balance_dirty_pages(wb, - sdtc->thresh, - sdtc->bg_thresh, - sdtc->dirty, - sdtc->wb_thresh, - sdtc->wb_dirty, - dirty_ratelimit, - task_ratelimit, - pages_dirtied, - period, - min(pause, 0L), - start_time); - if (pause < -HZ) { - current->dirty_paused_when = now; - current->nr_dirtied = 0; - } else if (period) { - current->dirty_paused_when += period; - current->nr_dirtied = 0; - } else if (current->nr_dirtied_pause <= pages_dirtied) - current->nr_dirtied_pause += pages_dirtied; +/* + * balance_dirty_pages() must be called by processes which are generating dirty + * data. It looks at the number of dirty pages in the machine and will force + * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. + * If we're over `background_thresh' then the writeback threads are woken to + * perform some writeout. 
+ */ +static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied) +{ + int ret; + struct bdp_ctx ctx = { BDP_CTX_INIT(ctx, wb) }; + + for (;;) { + ret = _balance_dirty_pages(wb, current->nr_dirtied, &ctx); + if (!ret) break; - } - if (unlikely(pause > max_pause)) { - /* for occasional dropped task_ratelimit */ - now += min(pause - max_pause, max_pause); - pause = max_pause; - } -pause: - trace_balance_dirty_pages(wb, - sdtc->thresh, - sdtc->bg_thresh, - sdtc->dirty, - sdtc->wb_thresh, - sdtc->wb_dirty, - dirty_ratelimit, - task_ratelimit, - pages_dirtied, - period, - pause, - start_time); __set_current_state(TASK_KILLABLE); - wb->dirty_sleep = now; - io_schedule_timeout(pause); + wb->dirty_sleep = ctx.now; + io_schedule_timeout(ctx.pause); - current->dirty_paused_when = now + pause; + current->dirty_paused_when = ctx.now + ctx.pause; current->nr_dirtied = 0; - current->nr_dirtied_pause = nr_dirtied_pause; + current->nr_dirtied_pause = ctx.nr_dirtied_pause; /* * This is typically equal to (dirty < thresh) and can also * keep "1000+ dd on a slow USB stick" under control. */ - if (task_ratelimit) + if (ctx.task_ratelimit) break; /* @@ -1805,14 +1831,14 @@ static void balance_dirty_pages(struct bdi_writeback *wb, * more page. However wb_dirty has accounting errors. So use * the larger and more IO friendly wb_stat_error. */ - if (sdtc->wb_dirty <= wb_stat_error()) + if (ctx.sdtc->wb_dirty <= wb_stat_error()) break; if (fatal_signal_pending(current)) break; } - if (!dirty_exceeded && wb->dirty_exceeded) + if (!ctx.dirty_exceeded && wb->dirty_exceeded) wb->dirty_exceeded = 0; if (writeback_in_progress(wb)) @@ -1829,7 +1855,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, if (laptop_mode) return; - if (nr_reclaimable > gdtc->bg_thresh) + if (ctx.nr_reclaimable > ctx.gdtc_stor.bg_thresh) wb_start_background_writeback(wb); }
This factors out the for loop in balance_dirty_pages() into a new
function called _balance_dirty_pages(). By factoring out this function
the async write code can determine if it has to wait to throttle writes
or not. The function _balance_dirty_pages() returns the sleep time.
If the sleep time is greater than 0, then the async write code needs to throttle.

To maintain the context for consecutive calls of _balance_dirty_pages()
and maintain the current behavior, a new data structure called bdp_ctx
has been introduced.

Signed-off-by: Stefan Roesch <shr@fb.com>
---
 mm/page-writeback.c | 452 +++++++++++++++++++++++---------------------
 1 file changed, 239 insertions(+), 213 deletions(-)
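For illustration, a minimal sketch of how an async write path could consume
the sleep time returned by the factored-out helper, using the bdp_ctx and
BDP_CTX_INIT() introduced above. The function name and the -EAGAIN
convention below are assumptions for this sketch, not part of the patch.

/*
 * Hypothetical async-side caller: run one pass of the throttling
 * calculation and, instead of sleeping, tell the caller to back off
 * when _balance_dirty_pages() reports a positive sleep time.
 */
static int balance_dirty_pages_async(struct bdi_writeback *wb,
				     unsigned long pages_dirtied)
{
	struct bdp_ctx ctx = { BDP_CTX_INIT(ctx, wb) };
	int sleep_time;

	sleep_time = _balance_dirty_pages(wb, pages_dirtied, &ctx);
	if (sleep_time > 0) {
		/*
		 * A blocking caller would sleep for 'sleep_time' jiffies
		 * here; an async caller instead reports -EAGAIN so the
		 * buffered write can be retried from a context that is
		 * allowed to block.
		 */
		return -EAGAIN;
	}
	return 0;
}

The blocking path keeps the current behavior: balance_dirty_pages() itself
loops, sleeping in io_schedule_timeout() whenever the helper returns a
positive pause.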