
[RFC,2/2] mm: introduce budgt control in readahead

Message ID 20240515012350.1166350-3-zhaoyang.huang@unisoc.com (mailing list archive)
State New
Series introduce precise blk-throttle control

Commit Message

zhaoyang.huang May 15, 2024, 1:23 a.m. UTC
From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>

Currently, the readahead size is decided mainly by the page cache's
status, such as hit/miss or hole size. When BLK_DEV_THROTTLING is on,
this can produce a bio that exceeds the size blk-throttle allows, and
that bio is then suspended on the throttle group's queue. Introduce
the budgt value here to keep each bio's size within the allowed limit.

Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
---
 mm/readahead.c | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)
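
[Editor's note: the blk_throttle_budgt() helper used in this patch is
added by patch 1/2 of the series, which is not shown on this page. For
orientation, a hedged sketch of the contract the readahead code relies
on; the signature matches the usage below, while the comment describes
assumed semantics only:]

/*
 * Sketch only: the real helper is added by patch 1/2 (in
 * block/blk-throttle.c). The readahead side merely assumes it returns
 * the byte budget a bio may consume without being throttled, or 0
 * when the device has no read-bps limit configured.
 */
unsigned long blk_throttle_budgt(struct block_device *bdev);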

Comments

Matthew Wilcox (Oracle) May 15, 2024, 4:09 a.m. UTC | #1
On Wed, May 15, 2024 at 09:23:50AM +0800, zhaoyang.huang wrote:
> +	unsigned long budgt = inode->i_sb->s_bdev ?
> +			blk_throttle_budgt(inode->i_sb->s_bdev) : 0;

NAK as previously explained.
Zhaoyang Huang May 15, 2024, 6:31 a.m. UTC | #2
On Wed, May 15, 2024 at 12:09 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Wed, May 15, 2024 at 09:23:50AM +0800, zhaoyang.huang wrote:
> > +     unsigned long budgt = inode->i_sb->s_bdev ?
> > +                     blk_throttle_budgt(inode->i_sb->s_bdev) : 0;
>
> NAK as previously explained.
OK. But this commit could still work by following the blk-throttle
configuration, as long as blk-throttle itself works on btrfs with
internal RAID enabled. Furthermore, this helps the blkcg meet the
desired BPS value precisely.
Tejun Heo May 15, 2024, 7:40 a.m. UTC | #3
Hello,

On Wed, May 15, 2024 at 09:23:50AM +0800, zhaoyang.huang wrote:
> +static unsigned long get_next_ra_size(struct readahead_control *ractl,
>  				      unsigned long max)
>  {
> +	unsigned long cur = ractl->ra->size;
> +	struct inode *inode = ractl->mapping->host;
> +	unsigned long budgt = inode->i_sb->s_bdev ?
> +			blk_throttle_budgt(inode->i_sb->s_bdev) : 0;

Technical correctness aside, I'm not convinced it's generally a good idea to
bubble up one specific IO control mechanism's detail all the way up to the RA
layer. Besides, what's the gain here? For a continuous IO stream, whether some
RA bios are oversized or not shouldn't matter, no? Doesn't this just affect
the accuracy of the last RA IO of a finite read stream?

Thanks.
Zhaoyang Huang May 15, 2024, 8:17 a.m. UTC | #4
On Wed, May 15, 2024 at 3:40 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Wed, May 15, 2024 at 09:23:50AM +0800, zhaoyang.huang wrote:
> > +static unsigned long get_next_ra_size(struct readahead_control *ractl,
> >                                     unsigned long max)
> >  {
> > +     unsigned long cur = ractl->ra->size;
> > +     struct inode *inode = ractl->mapping->host;
> > +     unsigned long budgt = inode->i_sb->s_bdev ?
> > +                     blk_throttle_budgt(inode->i_sb->s_bdev) : 0;
>
> Technical correctness aside, I'm not convinced it's generally a good idea to
> bubble up one specific IO control mechanism's detail all the way up to the RA
> layer. Besides, what's the gain here? For a continuous IO stream, whether some
> RA bios are oversized or not shouldn't matter, no? Doesn't this just affect
> the accuracy of the last RA IO of a finite read stream?
Thanks for the feedback. If I understand correctly, the oversized RA
bios of a finite read fail to dispatch and get queued on the tg's
queue, which in effect introduces a drop in IOPS.

submit_bio
    blk_throtl_bio
        if (!tg_may_dispatch())  /* failed: queue the bio on the tg's queue */
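
[A bio queued this way sits on the throttle group's service queue
until enough budget accrues for it to dispatch; that wait is
presumably where the very large max-latency outliers in the 'before'
numbers below come from.]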

What we get here is more precise BW for the throttled blkcg, as shown
below: the 'after' results exactly meet the configured bps value, and
throughput even improves a little, since there are no longer any hung
(oversized) bios.

blkio.throttle.read_bps_device = 20MB/s
fio ... -numjobs=8 ...

    before : IOPS=37.9k, BW=148MiB/s (155MB/s)(11.6GiB/80333msec)
    after  : IOPS=39.0k, BW=153MiB/s (160MB/s)(15.6GiB/104914msec)

    before : clat (usec): min=4, max=1056.6k, avg=197.23, stdev=10080.69
    after  : clat (usec): min=4, max=193481, avg=188.83, stdev=4651.29

    before : lat (usec): min=5, max=1056.6k, avg=200.48, stdev=10080.76
    after  : lat (usec): min=5, max=193483, avg=192.68, stdev=4651.87

blkio.throttle.read_bps_device = 30MB/s
fio ... -numjobs=8 ...

    before : IOPS=57.2k, BW=224MiB/s (234MB/s)(15.6GiB/71561msec)
    after  : IOPS=58.5k, BW=229MiB/s (240MB/s)(15.6GiB/69996msec)

    before : clat (usec): min=4, max=1105.5k, avg=126.20, stdev=6419.22
    after  : clat (usec): min=4, max=183956, avg=120.60, stdev=2957.28

    before : lat (usec): min=5, max=1105.5k, avg=129.45, stdev=6419.29
    after  : lat (usec): min=5, max=183958, avg=124.40, stdev=2958.18
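
[For reference, the aggregate numbers line up with the configured
limits: 8 x 20MB/s = 160MB/s and 8 x 30MB/s = 240MB/s, which the
'after' BW figures match exactly (presumably one throttled cgroup per
fio job).]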

>
> Thanks.

>
> --
> tejun

Patch

diff --git a/mm/readahead.c b/mm/readahead.c
index 130c0e7df99f..2b6120ced6f9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -128,6 +128,7 @@ 
 #include <linux/blk-cgroup.h>
 #include <linux/fadvise.h>
 #include <linux/sched/mm.h>
+#include <linux/minmax.h>
 
 #include "internal.h"
 
@@ -358,16 +359,23 @@  static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
  *  Get the previous window size, ramp it up, and
  *  return it as the new window size.
  */
-static unsigned long get_next_ra_size(struct file_ra_state *ra,
+static unsigned long get_next_ra_size(struct readahead_control *ractl,
 				      unsigned long max)
 {
-	unsigned long cur = ra->size;
+	unsigned long cur = ractl->ra->size;
+	struct inode *inode = ractl->mapping->host;
+	unsigned long budgt = inode->i_sb->s_bdev ?
+			blk_throttle_budgt(inode->i_sb->s_bdev) : 0;
+	unsigned long val = max;
 
 	if (cur < max / 16)
-		return 4 * cur;
+		val = 4 * cur;
-	if (cur <= max / 2)
+	else if (cur <= max / 2)
-		return 2 * cur;
-	return max;
+		val = 2 * cur;
+
+	val = budgt ? min(budgt / PAGE_SIZE, val) : val;
+
+	return val;
 }
 
 /*
@@ -437,6 +445,8 @@  static int try_context_readahead(struct address_space *mapping,
 				 unsigned long max)
 {
 	pgoff_t size;
+	unsigned long budgt = mapping->host->i_sb->s_bdev ?
+		blk_throttle_budgt(mapping->host->i_sb->s_bdev) : 0;
 
 	size = count_history_pages(mapping, index, max);
 
@@ -455,7 +465,7 @@  static int try_context_readahead(struct address_space *mapping,
 		size *= 2;
 
 	ra->start = index;
-	ra->size = min(size + req_size, max);
+	ra->size = min3(budgt ? budgt / PAGE_SIZE : max, size + req_size, max);
 	ra->async_size = 1;
 
 	return 1;
@@ -552,6 +562,8 @@  static void ondemand_readahead(struct readahead_control *ractl,
 	pgoff_t index = readahead_index(ractl);
 	pgoff_t expected, prev_index;
 	unsigned int order = folio ? folio_order(folio) : 0;
+	unsigned long budgt = ractl->mapping->host->i_sb->s_bdev ?
+		blk_throttle_budgt(ractl->mapping->host->i_sb->s_bdev) : 0;
 
 	/*
 	 * If the request exceeds the readahead window, allow the read to
@@ -574,7 +586,7 @@  static void ondemand_readahead(struct readahead_control *ractl,
 			1UL << order);
 	if (index == expected || index == (ra->start + ra->size)) {
 		ra->start += ra->size;
-		ra->size = get_next_ra_size(ra, max_pages);
+		ra->size = get_next_ra_size(ractl, max_pages);
 		ra->async_size = ra->size;
 		goto readit;
 	}
@@ -599,7 +611,7 @@  static void ondemand_readahead(struct readahead_control *ractl,
 		ra->start = start;
 		ra->size = start - index;	/* old async_size */
 		ra->size += req_size;
-		ra->size = get_next_ra_size(ra, max_pages);
+		ra->size = get_next_ra_size(ractl, max_pages);
 		ra->async_size = ra->size;
 		goto readit;
 	}
@@ -631,6 +643,9 @@  static void ondemand_readahead(struct readahead_control *ractl,
 	 * standalone, small random read
 	 * Read as is, and do not pollute the readahead state.
 	 */
+	if (budgt)
+		req_size = min(budgt / PAGE_SIZE, req_size);
+
 	do_page_cache_ra(ractl, req_size, 0);
 	return;
 
@@ -647,7 +662,7 @@  static void ondemand_readahead(struct readahead_control *ractl,
 	 * Take care of maximum IO pages as above.
 	 */
 	if (index == ra->start && ra->size == ra->async_size) {
-		add_pages = get_next_ra_size(ra, max_pages);
+		add_pages = get_next_ra_size(ractl, max_pages);
 		if (ra->size + add_pages <= max_pages) {
 			ra->async_size = add_pages;
 			ra->size += add_pages;
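
[For anyone who wants to poke at the new window logic outside the
kernel, here is a minimal userspace sketch of the patched
get_next_ra_size(); the 4KiB page size and the sample budgets are
assumptions for the demo, and the kernel's min() is open-coded:]

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096UL	/* assumed page size for the demo */

/* Mirrors the patched get_next_ra_size(): ramp the previous window up,
 * then clamp it to the blk-throttle byte budget (0 == unthrottled). */
static unsigned long next_ra_size(unsigned long cur, unsigned long max,
				  unsigned long budgt)
{
	unsigned long val = max;

	if (cur < max / 16)
		val = 4 * cur;
	else if (cur <= max / 2)
		val = 2 * cur;

	if (budgt && budgt / DEMO_PAGE_SIZE < val)
		val = budgt / DEMO_PAGE_SIZE;

	return val;
}

int main(void)
{
	/* unthrottled: an 8-page window ramps 4x to 32 pages */
	printf("%lu\n", next_ra_size(8, 512, 0));
	/* a 64KiB budget caps the same ramp at 16 pages */
	printf("%lu\n", next_ra_size(8, 512, 16 * DEMO_PAGE_SIZE));
	return 0;
}

[Note the else if: with two independent ifs, a window below max/16
would have its 4*cur ramp immediately overwritten by the 2*cur branch,
losing the fast ramp the original code had.]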