@@ -1974,7 +1974,15 @@ struct cl_io {
* the read IO will check the status of the to-be-read OSCs and quickly
* switch to another mirror if some of the OSTs are not healthy.
*/
- ci_tried_all_mirrors:1;
+ ci_tried_all_mirrors:1,
+ /**
+ * Random read hint; readahead will be disabled.
+ */
+ ci_rand_read:1,
+ /**
+ * Sequential read hint.
+ */
+ ci_seq_read:1;
/**
* Bypass quota check
*/
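The two hint bits get set from the mmap fault path below and consumed by the readahead code. As a minimal sketch of the intended wiring (the helper name is hypothetical, not part of the patch):

/* Hypothetical helper mirroring what ll_fault_io_init() below does:
 * translate VMA access hints into the new cl_io bits. */
static inline void cl_io_set_read_hints(struct cl_io *io,
					unsigned long vm_flags)
{
	if (vm_flags & VM_SEQ_READ)
		io->ci_seq_read = 1;	/* keep readahead aggressive */
	else if (vm_flags & VM_RAND_READ)
		io->ci_rand_read = 1;	/* readahead will be skipped */
}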
@@ -736,6 +736,8 @@ static int ll_local_open(struct file *file, struct lookup_intent *it,
file->private_data = fd;
ll_readahead_init(inode, &fd->fd_ras);
fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
+ /* turn off the kernel's read-ahead; llite does its own readahead */
+ file->f_ra.ra_pages = 0;
/* ll_cl_context initialize */
rwlock_init(&fd->fd_lock);
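Clearing f_ra.ra_pages relies on the generic VM readahead entry points bailing out when the per-file window is zero, so only llite's own readahead remains active for Lustre files. A paraphrased sketch of that pattern (not the exact mm/readahead.c code):

/* Paraphrase of the generic-kernel check this change relies on:
 * a zero per-file window means no kernel readahead for this file. */
static void generic_readahead_sketch(struct file_ra_state *ra)
{
	if (!ra->ra_pages)
		return;	/* disabled, e.g. by ll_local_open() above */
	/* ... otherwise the kernel would size and submit a window ... */
}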
@@ -84,13 +84,11 @@ struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
* @vma virtual memory area that triggered the page fault
* @env corresponding lu_env for processing
* @index page index corresponding to the fault.
- * @ra_flags vma readahead flags.
*
- * \return error codes from cl_io_init.
+ * Return: error codes from cl_io_init.
*/
static struct cl_io *
-ll_fault_io_init(struct lu_env *env, struct vm_area_struct *vma,
- pgoff_t index, unsigned long *ra_flags)
+ll_fault_io_init(struct lu_env *env, struct vm_area_struct *vma, pgoff_t index)
{
struct file *file = vma->vm_file;
struct inode *inode = file_inode(file);
@@ -110,18 +108,15 @@ struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
fio->ft_index = index;
fio->ft_executable = vma->vm_flags & VM_EXEC;
- /*
- * disable VM_SEQ_READ and use VM_RAND_READ to make sure that
- * the kernel will not read other pages not covered by ldlm in
- * filemap_nopage. we do our readahead in ll_readpage.
- */
- if (ra_flags)
- *ra_flags = vma->vm_flags & (VM_RAND_READ | VM_SEQ_READ);
- vma->vm_flags &= ~VM_SEQ_READ;
- vma->vm_flags |= VM_RAND_READ;
+ CDEBUG(D_MMAP,
+ DFID": vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu\n",
+ PFID(&ll_i2info(inode)->lli_fid), vma, vma->vm_start,
+ vma->vm_end, vma->vm_flags, fio->ft_index);
- CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
- fio->ft_index, fio->ft_executable);
+ if (vma->vm_flags & VM_SEQ_READ)
+ io->ci_seq_read = 1;
+ else if (vma->vm_flags & VM_RAND_READ)
+ io->ci_rand_read = 1;
rc = cl_io_init(env, io, CIT_FAULT, io->ci_obj);
if (rc == 0) {
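For context, VM_SEQ_READ and VM_RAND_READ are exactly the flags madvise(2) sets on a mapping, so with this change the hints flow from userspace straight into the cl_io. A minimal userspace sketch (the file path is hypothetical):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/lustre/file", O_RDONLY);	/* hypothetical path */
	void *p;

	if (fd < 0)
		return 1;
	p = mmap(NULL, 1 << 20, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/* sets VM_SEQ_READ on the vma: faults will see io->ci_seq_read = 1 */
	madvise(p, 1 << 20, MADV_SEQUENTIAL);
	/* MADV_RANDOM would set VM_RAND_READ -> io->ci_rand_read = 1 */

	munmap(p, 1 << 20);
	close(fd);
	return 0;
}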
@@ -161,7 +156,7 @@ static int __ll_page_mkwrite(struct vm_area_struct *vma, struct page *vmpage,
if (IS_ERR(env))
return PTR_ERR(env);
- io = ll_fault_io_init(env, vma, vmpage->index, NULL);
+ io = ll_fault_io_init(env, vma, vmpage->index);
if (IS_ERR(io)) {
result = PTR_ERR(io);
goto out;
@@ -277,7 +272,6 @@ static vm_fault_t __ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct cl_io *io;
struct vvp_io *vio = NULL;
struct page *vmpage;
- unsigned long ra_flags;
int result = 0;
vm_fault_t fault_ret = 0;
u16 refcheck;
@@ -314,7 +308,7 @@ static vm_fault_t __ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
fault_ret = 0;
}
- io = ll_fault_io_init(env, vma, vmf->pgoff, &ra_flags);
+ io = ll_fault_io_init(env, vma, vmf->pgoff);
if (IS_ERR(io)) {
fault_ret = to_fault_error(PTR_ERR(io));
goto out;
@@ -350,8 +344,6 @@ static vm_fault_t __ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
cl_io_fini(env, io);
- vma->vm_flags |= ra_flags;
-
out:
cl_env_put(env, &refcheck);
if (result != 0 && !(fault_ret & VM_FAULT_RETRY))
@@ -375,6 +367,10 @@ static vm_fault_t ll_fault(struct vm_fault *vmf)
if (cached)
goto out;
+ CDEBUG(D_MMAP, DFID": vma=%p start=%#lx end=%#lx vm_flags=%#lx\n",
+ PFID(&ll_i2info(file_inode(vma->vm_file))->lli_fid),
+ vma, vma->vm_start, vma->vm_end, vma->vm_flags);
+
/* Only SIGKILL and SIGTERM are allowed for fault/nopage/mkwrite
 * so that the task can be killed by an admin but cannot be made
 * to segfault by other signals.
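The actual blocking sits just below this comment (outside the hunk); the usual kernel pattern for "allow only SIGKILL/SIGTERM" looks like this sketch built on <linux/signal.h> helpers, not the exact llite code:

sigset_t new_set, old_set;

/* block every signal except SIGKILL and SIGTERM around the fault */
siginitsetinv(&new_set, sigmask(SIGKILL) | sigmask(SIGTERM));
sigprocmask(SIG_BLOCK, &new_set, &old_set);
/* ... fault handling runs here ... */
sigprocmask(SIG_SETMASK, &old_set, NULL);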
@@ -385,6 +381,7 @@ static vm_fault_t ll_fault(struct vm_fault *vmf)
/* make sure offset is not a negative number */
if (vmf->pgoff > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
return VM_FAULT_SIGBUS;
+
restart:
result = __ll_fault(vmf->vma, vmf);
if (vmf->page &&
@@ -545,6 +542,11 @@ int ll_file_mmap(struct file *file, struct vm_area_struct *vma)
bool cached;
int rc;
+ CDEBUG(D_VFSTRACE | D_MMAP,
+ "VFS_Op: fid="DFID" vma=%p start=%#lx end=%#lx vm_flags=%#lx\n",
+ PFID(&ll_i2info(inode)->lli_fid),
+ vma, vma->vm_start, vma->vm_end, vma->vm_flags);
+
if (ll_file_nolock(file))
return -EOPNOTSUPP;
@@ -1255,7 +1255,7 @@ static bool index_in_stride_window(struct ll_readahead_state *ras,
*/
static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
struct ll_readahead_state *ras, pgoff_t index,
- enum ras_update_flags flags)
+ enum ras_update_flags flags, struct cl_io *io)
{
struct ll_ra_info *ra = &sbi->ll_ra_info;
bool hit = flags & LL_RAS_HIT;
@@ -1276,6 +1276,18 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
if (ras->ras_no_miss_check)
goto out_unlock;
+ if (io && io->ci_rand_read)
+ goto out_unlock;
+
+ if (io && io->ci_seq_read) {
+ if (!hit) {
+ /* to avoid many small read RPCs here */
+ ras->ras_window_pages = sbi->ll_ra_info.ra_range_pages;
+ ll_ra_stats_inc_sbi(sbi, RA_STAT_MMAP_RANGE_READ);
+ }
+ goto skip;
+ }
+
if (flags & LL_RAS_MMAP) {
unsigned long ra_pages;
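Reduced to its decision tree, the hint handling added above behaves like this standalone sketch (names are illustrative only):

/* Illustrative summary of the ras_update() hint handling above. */
enum ra_action { RA_SKIP, RA_MAX_WINDOW, RA_KEEP, RA_HEURISTIC };

static enum ra_action ras_hint_action(bool rand_read, bool seq_read,
				      bool hit)
{
	if (rand_read)
		return RA_SKIP;		/* no state update, no readahead */
	if (seq_read)
		/* widen the window to ra_range_pages on a miss, keep it
		 * as-is on a hit; either way skip the mmap miss checks */
		return hit ? RA_KEEP : RA_MAX_WINDOW;
	return RA_HEURISTIC;		/* normal mmap miss checking */
}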
@@ -1594,7 +1606,7 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
flags |= LL_RAS_HIT;
if (!vio->vui_ra_valid)
flags |= LL_RAS_MMAP;
- ras_update(sbi, inode, ras, vvp_index(vpg), flags);
+ ras_update(sbi, inode, ras, vvp_index(vpg), flags, io);
}
cl_2queue_init(queue);
@@ -1613,7 +1625,7 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
io_start_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos);
io_end_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos +
io->u.ci_rw.crw_count - 1);
- if (ll_readahead_enabled(sbi) && ras) {
+ if (ll_readahead_enabled(sbi) && ras && !io->ci_rand_read) {
pgoff_t skip_index = 0;
if (ras->ras_next_readahead_idx < vvp_index(vpg))
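cl_index() is the byte-offset to page-index conversion used here to clamp readahead to the range of the issuing IO; conceptually (sketch, assuming PAGE_SIZE granularity):

/* Conceptual equivalent of the cl_index() calls above. */
static inline pgoff_t pos_to_index(loff_t pos)
{
	return (pgoff_t)(pos >> PAGE_SHIFT);
}

/* e.g. a 1 MiB read at offset 0 with 4 KiB pages covers page indices
 * pos_to_index(0) == 0 through pos_to_index((1 << 20) - 1) == 255 */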
@@ -1802,7 +1814,7 @@ int ll_readpage(struct file *file, struct page *vmpage)
* if the page is hit in cache, because the non-cached page
* case will be handled later by the slow read path.
*/
- ras_update(sbi, inode, ras, vvp_index(vpg), flags);
+ ras_update(sbi, inode, ras, vvp_index(vpg), flags, io);
/* avoid duplicate ras_update() call */
vpg->vpg_ra_updated = 1;