@@ -1703,7 +1703,7 @@ static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (cached)
goto out;
- ll_ras_enter(file);
+ ll_ras_enter(file, iocb->ki_pos, iov_iter_count(to));
result = ll_do_fast_read(iocb, to);
if (result < 0 || iov_iter_count(to) == 0)
@@ -654,11 +654,6 @@ struct ll_readahead_state {
*/
unsigned long ras_requests;
/*
- * Page index with respect to the current request, these value
- * will not be accurate when dealing with reads issued via mmap.
- */
- unsigned long ras_request_index;
- /*
* The following 3 items are used for detecting the stride I/O
* mode.
* In stride I/O mode,
@@ -681,6 +676,10 @@ struct ll_readahead_state {
unsigned long ras_consecutive_stride_requests;
/* index of the last page that async readahead starts */
pgoff_t ras_async_last_readpage;
+ /* whether we should increase readahead window */
+ bool ras_need_increase_window;
+ /* whether ra miss check should be skipped */
+ bool ras_no_miss_check;
};
struct ll_readahead_work {
@@ -778,7 +777,7 @@ static inline bool ll_sbi_has_file_heat(struct ll_sb_info *sbi)
return !!(sbi->ll_flags & LL_SBI_FILE_HEAT);
}
-void ll_ras_enter(struct file *f);
+void ll_ras_enter(struct file *f, unsigned long pos, unsigned long count);
/* llite/lcommon_misc.c */
int cl_ocd_update(struct obd_device *host, struct obd_device *watched,
@@ -131,12 +131,11 @@ void ll_ra_stats_inc(struct inode *inode, enum ra_stat which)
#define RAS_CDEBUG(ras) \
CDEBUG(D_READA, \
- "lre %lu cr %lu cb %lu ws %lu wl %lu nra %lu rpc %lu r %lu ri %lu csr %lu sf %lu sb %lu sl %lu lr %lu\n", \
+ "lre %lu cr %lu cb %lu ws %lu wl %lu nra %lu rpc %lu r %lu csr %lu sf %lu sb %lu sl %lu lr %lu\n", \
ras->ras_last_read_end, ras->ras_consecutive_requests, \
ras->ras_consecutive_bytes, ras->ras_window_start, \
ras->ras_window_len, ras->ras_next_readahead, \
- ras->ras_rpc_size, \
- ras->ras_requests, ras->ras_request_index, \
+ ras->ras_rpc_size, ras->ras_requests, \
ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
ras->ras_stride_bytes, ras->ras_stride_length, \
ras->ras_async_last_readpage)
@@ -154,18 +153,6 @@ static int pos_in_window(unsigned long pos, unsigned long point,
return start <= pos && pos <= end;
}
-void ll_ras_enter(struct file *f)
-{
- struct ll_file_data *fd = LUSTRE_FPRIVATE(f);
- struct ll_readahead_state *ras = &fd->fd_ras;
-
- spin_lock(&ras->ras_lock);
- ras->ras_requests++;
- ras->ras_request_index = 0;
- ras->ras_consecutive_requests++;
- spin_unlock(&ras->ras_lock);
-}
-
/**
* Initiates read-ahead of a page with given index.
*
@@ -311,15 +298,23 @@ static inline int stride_io_mode(struct ll_readahead_state *ras)
static int ria_page_count(struct ra_io_arg *ria)
{
- u64 length = ria->ria_end >= ria->ria_start ?
- ria->ria_end - ria->ria_start + 1 : 0;
- unsigned int bytes_count;
-
+ u64 length_bytes = ria->ria_end >= ria->ria_start ?
+ (ria->ria_end - ria->ria_start + 1) << PAGE_SHIFT : 0;
+ unsigned int bytes_count, pg_count;
+
+ if (ria->ria_length > ria->ria_bytes && ria->ria_bytes &&
+ (ria->ria_length % PAGE_SIZE || ria->ria_bytes % PAGE_SIZE ||
+ ria->ria_stoff % PAGE_SIZE)) {
+ /* Over-estimate the page count for an unaligned stride read */
+ pg_count = ((ria->ria_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
+ pg_count *= length_bytes / ria->ria_length + 1;
+
+ return pg_count;
+ }
bytes_count = stride_byte_count(ria->ria_stoff, ria->ria_length,
ria->ria_bytes, ria->ria_start,
- length << PAGE_SHIFT);
+ length_bytes);
return (bytes_count + PAGE_SIZE - 1) >> PAGE_SHIFT;
-
}
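
To sanity-check the over-estimate above, a minimal userspace sketch of the
same arithmetic, with hypothetical values (4 KiB pages, a 6000-byte chunk
every 20000 bytes, a 25-page window): each unaligned chunk can touch
ceil(bytes / PAGE_SIZE) + 1 pages, and the "+ 1" stride covers a partial
trailing stride.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
        unsigned long ria_start = 0, ria_end = 24;      /* pages */
        unsigned long ria_length = 20000;               /* stride size, bytes */
        unsigned long ria_bytes = 6000;                 /* data per stride */
        unsigned long length_bytes = (ria_end - ria_start + 1) << PAGE_SHIFT;
        unsigned long pg_count;

        /* pages per chunk, over-estimated for page misalignment */
        pg_count = ((ria_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
        /* times the number of strides, over-estimated by one */
        pg_count *= length_bytes / ria_length + 1;
        printf("over-estimated pages: %lu\n", pg_count); /* prints 18 */
        return 0;
}
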
static unsigned long ras_align(struct ll_readahead_state *ras,
@@ -333,16 +328,28 @@ static unsigned long ras_align(struct ll_readahead_state *ras,
}
/* Check whether the index is in the defined ra-window */
-static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
+static bool ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
{
+ unsigned long pos = idx << PAGE_SHIFT;
+ unsigned long offset;
+
/* If ria_length == ria_bytes, it means non-stride I/O mode;
 * idx should always be inside the read-ahead window in this case.
 * For stride I/O mode, just check whether the idx is inside
 * the stride data chunk (ria_bytes).
 */
- return ria->ria_length == 0 || ria->ria_length == ria->ria_bytes ||
- (idx >= ria->ria_stoff && (idx - ria->ria_stoff) %
- ria->ria_length < ria->ria_bytes);
+ if (ria->ria_length == 0 || ria->ria_length == ria->ria_bytes)
+ return true;
+
+ if (pos >= ria->ria_stoff) {
+ offset = (pos - ria->ria_stoff) % ria->ria_length;
+ if (offset < ria->ria_bytes ||
+ (ria->ria_length - offset) < PAGE_SIZE)
+ return true;
+ } else if (pos + PAGE_SIZE > ria->ria_stoff)
+ return true;
+
+ return false;
}
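
The byte-granular membership test above differs from the old page-index test
in that it also admits pages that merely overlap a chunk boundary. A
standalone sketch, with a hypothetical layout (6000-byte chunks every
20000 bytes, starting at byte 5000, 4 KiB pages):

#include <assert.h>
#include <stdbool.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static bool page_in_stride(unsigned long idx, unsigned long stoff,
                           unsigned long length, unsigned long bytes)
{
        unsigned long pos = idx << PAGE_SHIFT;
        unsigned long offset;

        if (pos >= stoff) {
                offset = (pos - stoff) % length;
                /* inside a data chunk, or the page tail reaches the next one */
                return offset < bytes || (length - offset) < PAGE_SIZE;
        }
        /* the page tail reaches the first chunk */
        return pos + PAGE_SIZE > stoff;
}

int main(void)
{
        /* page 1 (bytes 4096..8191) overlaps the chunk at byte 5000 */
        assert(page_in_stride(1, 5000, 20000, 6000));
        /* page 3 (bytes 12288..16383) lies entirely in the gap */
        assert(!page_in_stride(3, 5000, 20000, 6000));
        /* page 6 (bytes 24576..28671) reaches the next chunk at 25000 */
        assert(page_in_stride(6, 5000, 20000, 6000));
        return 0;
}
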
static unsigned long
@@ -351,7 +358,6 @@ static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
struct ra_io_arg *ria, pgoff_t *ra_end)
{
struct cl_read_ahead ra = { 0 };
- bool stride_ria;
pgoff_t page_idx;
int count = 0;
int rc;
@@ -359,7 +365,6 @@ static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
LASSERT(ria);
RIA_DEBUG(ria);
- stride_ria = ria->ria_length > ria->ria_bytes && ria->ria_bytes > 0;
for (page_idx = ria->ria_start;
page_idx <= ria->ria_end && ria->ria_reserved > 0; page_idx++) {
if (ras_inside_ra_window(page_idx, ria)) {
@@ -417,7 +422,7 @@ static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
ria->ria_reserved--;
count++;
}
- } else if (stride_ria) {
+ } else if (stride_io_mode(ras)) {
/* If it is not in the read-ahead window, and it is
* read-ahead mode, then check whether it should skip
* the stride gap.
@@ -428,7 +433,8 @@ static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
offset = (pos - ria->ria_stoff) % ria->ria_length;
if (offset >= ria->ria_bytes) {
pos += (ria->ria_length - offset);
- page_idx = (pos >> PAGE_SHIFT) - 1;
+ if ((pos >> PAGE_SHIFT) >= page_idx + 1)
+ page_idx = (pos >> PAGE_SHIFT) - 1;
CDEBUG(D_READA,
"Stride: jump %lu pages to %lu\n",
ria->ria_length - offset, page_idx);
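
The guard added above keeps page_idx from moving backwards when the jump
target rounds down to the current page (possible with sub-page strides). A
worked example of the skip under the same hypothetical layout:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        unsigned long stoff = 5000, length = 20000;     /* bytes */
        unsigned long page_idx = 4;                     /* a page in the gap */
        unsigned long pos = page_idx << PAGE_SHIFT;     /* 16384 */
        unsigned long offset = (pos - stoff) % length;  /* 11384 */

        pos += length - offset;                         /* 25000: next chunk */
        /* never step the loop index backwards */
        if ((pos >> PAGE_SHIFT) >= page_idx + 1)
                page_idx = (pos >> PAGE_SHIFT) - 1;
        /* the loop's page_idx++ then resumes at page 6 (bytes 24576..) */
        printf("resume at page %lu\n", page_idx + 1);
        return 0;
}
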
@@ -775,11 +781,10 @@ void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
* Check whether the read request is in the stride window.
* If it is in the stride window, return true, otherwise return false.
*/
-static bool index_in_stride_window(struct ll_readahead_state *ras,
- pgoff_t index)
+static bool read_in_stride_window(struct ll_readahead_state *ras,
+ unsigned long pos, unsigned long count)
{
unsigned long stride_gap;
- unsigned long pos = index << PAGE_SHIFT;
if (ras->ras_stride_length == 0 || ras->ras_stride_bytes == 0 ||
ras->ras_stride_bytes == ras->ras_stride_length)
@@ -789,12 +794,13 @@ static bool index_in_stride_window(struct ll_readahead_state *ras,
/* If it is contiguous read */
if (stride_gap == 0)
- return ras->ras_consecutive_bytes + PAGE_SIZE <=
+ return ras->ras_consecutive_bytes + count <=
ras->ras_stride_bytes;
/* Otherwise check the stride by itself */
return (ras->ras_stride_length - ras->ras_stride_bytes) == stride_gap &&
- ras->ras_consecutive_bytes == ras->ras_stride_bytes;
+ ras->ras_consecutive_bytes == ras->ras_stride_bytes &&
+ count <= ras->ras_stride_bytes;
}
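
With count now passed in, the check can reject reads that are individually
larger than one stride chunk, which the old per-page version could not. A
userspace sketch under hypothetical values (6000-byte chunks every
20000 bytes):

#include <assert.h>
#include <stdbool.h>

/* stride_length/stride_bytes: detected stride geometry, in bytes;
 * consec: bytes read contiguously so far; last_end: last read end. */
static bool read_in_stride(unsigned long stride_length,
                           unsigned long stride_bytes,
                           unsigned long consec, unsigned long last_end,
                           unsigned long pos, unsigned long count)
{
        unsigned long stride_gap;

        if (stride_length == 0 || stride_bytes == 0 ||
            stride_bytes == stride_length)
                return false;

        stride_gap = pos - last_end - 1;
        if (stride_gap == 0)    /* contiguous: must stay within one chunk */
                return consec + count <= stride_bytes;

        /* otherwise the gap itself must match the detected stride */
        return stride_length - stride_bytes == stride_gap &&
               consec == stride_bytes && count <= stride_bytes;
}

int main(void)
{
        /* 6000 bytes at 25000 after ending at 10999: gap = 20000 - 6000 */
        assert(read_in_stride(20000, 6000, 6000, 10999, 25000, 6000));
        /* a 7000-byte read exceeds one chunk and is rejected */
        assert(!read_in_stride(20000, 6000, 6000, 10999, 25000, 7000));
        return 0;
}
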
static void ras_init_stride_detector(struct ll_readahead_state *ras,
@@ -802,13 +808,6 @@ static void ras_init_stride_detector(struct ll_readahead_state *ras,
{
unsigned long stride_gap = pos - ras->ras_last_read_end - 1;
- if ((stride_gap != 0 || ras->ras_consecutive_stride_requests == 0) &&
- !stride_io_mode(ras)) {
- ras->ras_stride_bytes = ras->ras_consecutive_bytes;
- ras->ras_stride_length = ras->ras_consecutive_bytes +
- stride_gap;
- }
- LASSERT(ras->ras_request_index == 0);
LASSERT(ras->ras_consecutive_stride_requests == 0);
if (pos <= ras->ras_last_read_end) {
@@ -819,6 +818,8 @@ static void ras_init_stride_detector(struct ll_readahead_state *ras,
ras->ras_stride_bytes = ras->ras_consecutive_bytes;
ras->ras_stride_length = stride_gap + ras->ras_consecutive_bytes;
+ ras->ras_consecutive_stride_requests++;
+ ras->ras_stride_offset = pos;
RAS_CDEBUG(ras);
}
@@ -895,49 +896,97 @@ static void ras_increase_window(struct inode *inode,
}
}
-static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
- struct ll_readahead_state *ras, unsigned long index,
- enum ras_update_flags flags)
+/**
+ * Seeks within 8 pages are considered sequential reads for now.
+ */
+static inline bool is_loose_seq_read(struct ll_readahead_state *ras,
+ unsigned long pos)
{
- struct ll_ra_info *ra = &sbi->ll_ra_info;
- int zero = 0, stride_detect = 0, ra_miss = 0;
- unsigned long pos = index << PAGE_SHIFT;
- bool hit = flags & LL_RAS_HIT;
-
- spin_lock(&ras->ras_lock);
-
- if (!hit)
- CDEBUG(D_READA, DFID " pages at %lu miss.\n",
- PFID(ll_inode2fid(inode)), index);
+ return pos_in_window(pos, ras->ras_last_read_end,
+ 8 << PAGE_SHIFT, 8 << PAGE_SHIFT);
+}
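
A standalone rendering of the slack this helper allows, assuming 4 KiB
pages; the clamp of the window start at zero is an assumption here, since
the diff only shows the final comparison in pos_in_window():

#include <assert.h>
#include <stdbool.h>

#define PAGE_SHIFT 12

static bool pos_in_window(unsigned long pos, unsigned long point,
                          unsigned long before, unsigned long after)
{
        unsigned long start = point > before ? point - before : 0;
        unsigned long end = point + after;

        return start <= pos && pos <= end;
}

int main(void)
{
        unsigned long last_end = 100UL << PAGE_SHIFT;

        /* a seek of 8 pages (32 KiB) forward still counts as sequential */
        assert(pos_in_window(last_end + (8UL << PAGE_SHIFT), last_end,
                             8UL << PAGE_SHIFT, 8UL << PAGE_SHIFT));
        /* 9 pages away does not */
        assert(!pos_in_window(last_end + (9UL << PAGE_SHIFT), last_end,
                              8UL << PAGE_SHIFT, 8UL << PAGE_SHIFT));
        return 0;
}
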
- ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS);
+static void ras_detect_read_pattern(struct ll_readahead_state *ras,
+ struct ll_sb_info *sbi,
+ unsigned long pos, unsigned long count,
+ bool mmap)
+{
+ bool stride_detect = false;
+ unsigned long index = pos >> PAGE_SHIFT;
- /* reset the read-ahead window in two cases. First when the app seeks
- * or reads to some other part of the file. Secondly if we get a
- * read-ahead miss that we think we've previously issued. This can
- * be a symptom of there being so many read-ahead pages that the VM is
- * reclaiming it before we get to it.
+ /*
+ * Reset the read-ahead window in two cases. First when the app seeks
+ * or reads to some other part of the file. Secondly if we get a
+ * read-ahead miss that we think we've previously issued. This can
+ * be a symptom of there being so many read-ahead pages that the VM
+ * is reclaiming them before we get to them.
*/
- if (!pos_in_window(pos, ras->ras_last_read_end,
- 8 << PAGE_SHIFT, 8 << PAGE_SHIFT)) {
- zero = 1;
+ if (!is_loose_seq_read(ras, pos)) {
+ /* Check whether it is in stride I/O mode */
+ if (!read_in_stride_window(ras, pos, count)) {
+ if (ras->ras_consecutive_stride_requests == 0)
+ ras_init_stride_detector(ras, pos, count);
+ else
+ ras_stride_reset(ras);
+ ras->ras_consecutive_bytes = 0;
+ ras_reset(ras, index);
+ } else {
+ ras->ras_consecutive_bytes = 0;
+ ras->ras_consecutive_requests = 0;
+ if (++ras->ras_consecutive_stride_requests > 1)
+ stride_detect = true;
+ RAS_CDEBUG(ras);
+ }
ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE);
- } else if (!hit && ras->ras_window_len &&
- index < ras->ras_next_readahead &&
- pos_in_window(index, ras->ras_window_start, 0,
- ras->ras_window_len)) {
- ra_miss = 1;
- ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW);
+ } else if (stride_io_mode(ras)) {
+ /*
+ * If this is a contiguous read but we are currently in
+ * stride I/O mode, check whether the stride step is still
+ * valid; if not, reset the stride read-ahead window to zero.
+ */
+ if (!read_in_stride_window(ras, pos, count)) {
+ ras_stride_reset(ras);
+ ras->ras_window_len = 0;
+ ras->ras_next_readahead = index;
+ }
}
- /* On the second access to a file smaller than the tunable
+ ras->ras_consecutive_bytes += count;
+ if (mmap) {
+ unsigned int idx = (ras->ras_consecutive_bytes >> PAGE_SHIFT);
+
+ if ((idx >= 4 && idx % 4 == 0) || stride_detect)
+ ras->ras_need_increase_window = true;
+ } else if (ras->ras_consecutive_requests > 1 || stride_detect) {
+ ras->ras_need_increase_window = true;
+ }
+
+ ras->ras_last_read_end = pos + count - 1;
+}
+
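
The mmap branch above grows the window once every four consecutively faulted
pages rather than per request, since mmap reads never pass through
ll_ras_enter(). A sketch of that cadence (hypothetical standalone rendering,
4 KiB pages):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
        unsigned long consecutive_bytes = 0;
        unsigned int page;

        for (page = 1; page <= 12; page++) {
                unsigned int idx;

                consecutive_bytes += PAGE_SIZE; /* one faulted page */
                idx = consecutive_bytes >> PAGE_SHIFT;
                if (idx >= 4 && idx % 4 == 0)   /* pages 4, 8, 12 */
                        printf("grow window after page %u\n", page);
        }
        return 0;
}
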
+void ll_ras_enter(struct file *f, unsigned long pos, unsigned long count)
+{
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(f);
+ struct ll_readahead_state *ras = &fd->fd_ras;
+ struct inode *inode = file_inode(f);
+ unsigned long index = pos >> PAGE_SHIFT;
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+
+ spin_lock(&ras->ras_lock);
+ ras->ras_requests++;
+ ras->ras_consecutive_requests++;
+ ras->ras_need_increase_window = false;
+ ras->ras_no_miss_check = false;
+ /*
+ * On the second access to a file smaller than the tunable
* ra_max_read_ahead_whole_pages trigger RA on all pages in the
* file up to ra_max_pages_per_file. This is simply a best effort
- * and only occurs once per open file. Normal RA behavior is reverted
- * to for subsequent IO. The mmap case does not increment
- * ras_requests and thus can never trigger this behavior.
+ * and only occurs once per open file. Normal RA behavior resumes
+ * for subsequent IO.
*/
- if (ras->ras_requests >= 2 && !ras->ras_request_index) {
+ if (ras->ras_requests >= 2) {
+ struct ll_ra_info *ra = &sbi->ll_ra_info;
u64 kms_pages;
kms_pages = (i_size_read(inode) + PAGE_SIZE - 1) >>
@@ -952,73 +1001,111 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
ras->ras_window_start = 0;
ras->ras_next_readahead = index + 1;
ras->ras_window_len = min(ra->ra_max_pages_per_file,
- ra->ra_max_read_ahead_whole_pages);
+ ra->ra_max_read_ahead_whole_pages);
+ ras->ras_no_miss_check = true;
goto out_unlock;
}
}
- if (zero) {
- /* check whether it is in stride I/O mode*/
- if (!index_in_stride_window(ras, index)) {
- if (ras->ras_consecutive_stride_requests == 0 &&
- ras->ras_request_index == 0) {
- ras_init_stride_detector(ras, pos, PAGE_SIZE);
- ras->ras_consecutive_stride_requests++;
- } else {
- ras_stride_reset(ras);
- }
+ ras_detect_read_pattern(ras, sbi, pos, count, false);
+out_unlock:
+ spin_unlock(&ras->ras_lock);
+}
+
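
For the whole-file case above, a sketch of the arithmetic with hypothetical
sizes and tunables; the small-file gate shown is an assumption, as the diff
elides the lines between kms_pages and the window setup:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long long isize = 10000;       /* hypothetical i_size */
        unsigned long kms_pages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT;
        unsigned long ra_max_pages_per_file = 8192;
        unsigned long ra_max_read_ahead_whole_pages = 2048;

        /* 10000 bytes round up to 3 pages; 3 <= 2048, so read it whole */
        if (kms_pages && kms_pages <= ra_max_read_ahead_whole_pages)
                printf("window_len = %lu pages\n",
                       min_ul(ra_max_pages_per_file,
                              ra_max_read_ahead_whole_pages));
        return 0;
}
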
+static bool index_in_stride_window(struct ll_readahead_state *ras,
+ unsigned int index)
+{
+ unsigned long pos = index << PAGE_SHIFT;
+ unsigned long offset;
+
+ if (ras->ras_stride_length == 0 || ras->ras_stride_bytes == 0 ||
+ ras->ras_stride_bytes == ras->ras_stride_length)
+ return false;
+
+ if (pos >= ras->ras_stride_offset) {
+ offset = (pos - ras->ras_stride_offset) %
+ ras->ras_stride_length;
+ if (offset < ras->ras_stride_bytes ||
+ ras->ras_stride_length - offset < PAGE_SIZE)
+ return true;
+ } else if (ras->ras_stride_offset - pos < PAGE_SIZE) {
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * ll_ras_enter() is used to detect the read pattern from
+ * pos and count.
+ *
+ * ras_update() is used to detect cache misses and to reset
+ * or grow the read-ahead window accordingly.
+ */
+static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+ struct ll_readahead_state *ras, unsigned long index,
+ enum ras_update_flags flags)
+{
+ struct ll_ra_info *ra = &sbi->ll_ra_info;
+ bool hit = flags & LL_RAS_HIT;
+
+ spin_lock(&ras->ras_lock);
+
+ if (!hit)
+ CDEBUG(D_READA, DFID " pages at %lu miss.\n",
+ PFID(ll_inode2fid(inode)), index);
+ ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS);
+
+ /*
+ * If the readahead window has been expanded to cover the whole
+ * file, we don't care whether a RA miss happens or not, because
+ * the whole file will be read into the page cache even if some
+ * pages are missed.
+ */
+ if (ras->ras_no_miss_check)
+ goto out_unlock;
+
+ if (flags & LL_RAS_MMAP)
+ ras_detect_read_pattern(ras, sbi, index << PAGE_SHIFT,
+ PAGE_SIZE, true);
+
+ if (!hit && ras->ras_window_len &&
+ index < ras->ras_next_readahead &&
+ pos_in_window(index, ras->ras_window_start, 0,
+ ras->ras_window_len)) {
+ ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW);
+ ras->ras_need_increase_window = false;
+
+ if (index_in_stride_window(ras, index) &&
+ stride_io_mode(ras)) {
+ /*
+ * if (index != (ras->ras_last_read_end >> PAGE_SHIFT) + 1)
+ * ras->ras_consecutive_bytes = 0;
+ */
ras_reset(ras, index);
- ras->ras_consecutive_bytes += PAGE_SIZE;
- goto out_unlock;
- } else {
- ras->ras_consecutive_bytes = 0;
- ras->ras_consecutive_requests = 0;
- if (++ras->ras_consecutive_stride_requests > 1)
- stride_detect = 1;
- RAS_CDEBUG(ras);
- }
- } else {
- if (ra_miss) {
- if (index_in_stride_window(ras, index) &&
- stride_io_mode(ras)) {
- if (index != (ras->ras_last_read_end >>
- PAGE_SHIFT) + 1)
- ras->ras_consecutive_bytes = 0;
- ras_reset(ras, index);
-
- /* If stride-RA hit cache miss, the stride
- * detector will not be reset to avoid the
- * overhead of redetecting read-ahead mode,
- * but on the condition that the stride window
- * is still intersect with normal sequential
- * read-ahead window.
- */
- if (ras->ras_window_start <
- (ras->ras_stride_offset >> PAGE_SHIFT))
- ras_stride_reset(ras);
- RAS_CDEBUG(ras);
- } else {
- /* Reset both stride window and normal RA
- * window
- */
- ras_reset(ras, index);
- ras->ras_consecutive_bytes += PAGE_SIZE;
- ras_stride_reset(ras);
- goto out_unlock;
- }
- } else if (stride_io_mode(ras)) {
- /* If this is contiguous read but in stride I/O mode
- * currently, check whether stride step still is valid,
- * if invalid, it will reset the stride ra window
+ /*
+ * If stride-RA hits a cache miss, the stride
+ * detector is not reset, to avoid the overhead
+ * of re-detecting the read-ahead mode, but only
+ * on the condition that the stride window still
+ * intersects the normal sequential read-ahead
+ * window.
+ */
- if (!index_in_stride_window(ras, index)) {
- /* Shrink stride read-ahead window to be zero */
+ if (ras->ras_window_start <
+ ras->ras_stride_offset)
ras_stride_reset(ras);
- ras->ras_window_len = 0;
- ras->ras_next_readahead = index;
- }
+ RAS_CDEBUG(ras);
+ } else {
+ /*
+ * Reset both stride window and normal RA
+ * window.
+ */
+ ras_reset(ras, index);
+ ras->ras_consecutive_bytes = 0;
+ ras_stride_reset(ras);
+ goto out_unlock;
}
}
- ras->ras_consecutive_bytes += PAGE_SIZE;
ras_set_start(ras, index);
if (stride_io_mode(ras)) {
@@ -1037,44 +1124,13 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
if (!hit)
ras->ras_next_readahead = index + 1;
}
- RAS_CDEBUG(ras);
- /* Trigger RA in the mmap case where ras_consecutive_requests
- * is not incremented and thus can't be used to trigger RA
- */
- if (ras->ras_consecutive_bytes >= (4 << PAGE_SHIFT) &&
- flags & LL_RAS_MMAP) {
+ if (ras->ras_need_increase_window) {
ras_increase_window(inode, ras, ra);
- /*
- * reset consecutive pages so that the readahead window can
- * grow gradually.
- */
- ras->ras_consecutive_bytes = 0;
- goto out_unlock;
- }
-
- /* Initially reset the stride window offset to next_readahead*/
- if (ras->ras_consecutive_stride_requests == 2 && stride_detect) {
- /**
- * Once stride IO mode is detected, next_readahead should be
- * reset to make sure next_readahead > stride offset
- */
- ras->ras_next_readahead = max(index, ras->ras_next_readahead);
- ras->ras_stride_offset = index << PAGE_SHIFT;
- ras->ras_window_start = max(index, ras->ras_window_start);
+ ras->ras_need_increase_window = false;
}
- /* The initial ras_window_len is set to the request size. To avoid
- * uselessly reading and discarding pages for random IO the window is
- * only increased once per consecutive request received.
- */
- if ((ras->ras_consecutive_requests > 1 || stride_detect) &&
- !ras->ras_request_index)
- ras_increase_window(inode, ras, ra);
out_unlock:
- RAS_CDEBUG(ras);
- ras->ras_request_index++;
- ras->ras_last_read_end = pos + PAGE_SIZE - 1;
spin_unlock(&ras->ras_lock);
}