@@ -392,6 +392,7 @@ extern pgprot_t protection_map[16];
#define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */
#define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */
#define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */
+#define FAULT_FLAG_CACHED 0x200 /* Only look at the page cache */
#define FAULT_FLAG_TRACE \
{ FAULT_FLAG_WRITE, "WRITE" }, \
@@ -402,7 +403,8 @@ extern pgprot_t protection_map[16];
{ FAULT_FLAG_TRIED, "TRIED" }, \
{ FAULT_FLAG_USER, "USER" }, \
{ FAULT_FLAG_REMOTE, "REMOTE" }, \
- { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }
+ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \
+ { FAULT_FLAG_CACHED, "CACHED" }
/*
 * vm_fault is filled by the pagefault handler and passed to the vma's
@@ -2383,7 +2383,7 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
* the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT
* is supposed to work. We have way too many special cases..
*/
- if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+ if (vmf->flags & (FAULT_FLAG_RETRY_NOWAIT | FAULT_FLAG_CACHED))
return 0;
*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
@@ -2460,26 +2460,28 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
* so we want to possibly extend the readahead further. We return the file that
 * was pinned if we have to drop the mmap_sem in order to do IO.
 */
*/
-static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
- struct page *page)
+static vm_fault_t do_async_mmap_readahead(struct vm_fault *vmf,
+ struct page *page,
+ struct file **fpin)
{
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
- struct file *fpin = NULL;
pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ)
- return fpin;
+ return 0;
if (ra->mmap_miss > 0)
ra->mmap_miss--;
if (PageReadahead(page)) {
- fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+ if (vmf->flags & FAULT_FLAG_CACHED)
+ return VM_FAULT_RETRY;
+ *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
page_cache_async_readahead(mapping, ra, file,
page, offset, ra->ra_pages);
}
- return fpin;
+ return 0;
}
/**
@@ -2495,8 +2497,11 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
*
* vma->vm_mm->mmap_sem must be held on entry.
*
- * If our return value has VM_FAULT_RETRY set, it's because the mmap_sem
- * may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
+ * This function may drop the mmap_sem before doing I/O or waiting for a page
+ * lock; this is indicated by the VM_FAULT_RETRY flag in our return value.
+ * Setting FAULT_FLAG_CACHED or FAULT_FLAG_RETRY_NOWAIT in vmf->flags will
+ * prevent dropping the mmap_sem; in that case, VM_FAULT_RETRY indicates that
+ * the mmap_sem would have been dropped.
*
* If our return value does not have VM_FAULT_RETRY set, the mmap_sem
* has not been released.
@@ -2518,9 +2523,15 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
struct page *page;
vm_fault_t ret = 0;
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off))
- return VM_FAULT_SIGBUS;
+ /*
+ * FAULT_FLAG_CACHED indicates that the inode size is only guaranteed
+ * to be valid when the page we are looking for is in the page cache.
+ */
+ if (!(vmf->flags & FAULT_FLAG_CACHED)) {
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= max_off))
+ return VM_FAULT_SIGBUS;
+ }
/*
* Do we have something in the page cache already?
@@ -2531,8 +2542,14 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
* We found the page, so try async readahead before
* waiting for the lock.
*/
- fpin = do_async_mmap_readahead(vmf, page);
+ ret = do_async_mmap_readahead(vmf, page, &fpin);
+ if (ret) {
+ put_page(page);
+ return ret;
+ }
} else if (!page) {
+ if (vmf->flags & FAULT_FLAG_CACHED)
+ goto out_retry;
/* No page in the page cache at all */
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
Add a FAULT_FLAG_CACHED flag which indicates to filemap_fault that it
should only look at the page cache, without triggering filesystem I/O
for the actual request or for readahead.  When filesystem I/O would be
triggered, VM_FAULT_RETRY is returned instead.

This allows the caller to tentatively satisfy a minor page fault out of
the page cache, and to retry the operation after taking the necessary
steps when that isn't possible.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 include/linux/mm.h |  4 +++-
 mm/filemap.c       | 43 ++++++++++++++++++++++++++++++-------------
 2 files changed, 33 insertions(+), 14 deletions(-)
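
As an illustration of the intended calling convention, here is a minimal
sketch of how a filesystem ->fault handler might use FAULT_FLAG_CACHED.
It is not part of this patch, and fs_fault(), fs_fault_lock() and
fs_fault_unlock() are hypothetical placeholders rather than existing
kernel interfaces:

/*
 * Illustrative sketch only.  fs_fault_lock() / fs_fault_unlock() stand
 * in for whatever lock a filesystem must hold before filemap_fault()
 * may do I/O (for example, a cluster lock).
 */
#include <linux/fs.h>
#include <linux/mm.h>

extern int fs_fault_lock(struct file *file);	/* hypothetical */
extern void fs_fault_unlock(struct file *file);	/* hypothetical */

static vm_fault_t fs_fault(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	vm_fault_t ret;

	/*
	 * Fast path: with FAULT_FLAG_CACHED set, filemap_fault() only
	 * looks at the page cache and returns VM_FAULT_RETRY instead of
	 * starting I/O or readahead, with the mmap_sem still held.
	 */
	vmf->flags |= FAULT_FLAG_CACHED;
	ret = filemap_fault(vmf);
	vmf->flags &= ~FAULT_FLAG_CACHED;
	if (!(ret & VM_FAULT_RETRY))
		return ret;

	/*
	 * Slow path: take the filesystem lock that makes the inode size
	 * and data valid, then let filemap_fault() do I/O and drop the
	 * mmap_sem as usual.
	 */
	if (fs_fault_lock(file))
		return VM_FAULT_SIGBUS;
	ret = filemap_fault(vmf);
	fs_fault_unlock(file);
	return ret;
}

Whether the filesystem lock may be acquired with the mmap_sem still held
depends on the filesystem's lock ordering; the alternative is to return
VM_FAULT_RETRY to the caller so the fault is retried after the lock has
been taken.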