@@ -195,6 +195,13 @@ struct btrfs_inode {
*/
struct rw_semaphore dio_sem;
+ /*
+ * To serialise page faults with truncate/punch_hole operations.
+ * We have to make sure that a new page cannot be faulted in over a
+ * section of the inode that is being punched.
+ */
+ struct rw_semaphore mmap_sem;
+
struct inode vfs_inode;
};
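At a glance, the protocol this field establishes is: fault paths take
the lock shared, truncate and hole punching take it exclusive. A
minimal sketch (not part of the patch; both function names are
hypothetical):

/* Read side: every page fault path. */
static void fault_side_sketch(struct inode *inode)
{
	down_read(&BTRFS_I(inode)->mmap_sem);
	/* ... fault the page in ... */
	up_read(&BTRFS_I(inode)->mmap_sem);
}

/* Write side: truncate and punch_hole, so that no fault can
 * reinstantiate a page while the range is being torn down. */
static void shrink_side_sketch(struct inode *inode)
{
	down_write(&BTRFS_I(inode)->mmap_sem);
	/* ... drop pagecache pages and release extents ... */
	up_write(&BTRFS_I(inode)->mmap_sem);
}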
@@ -2298,11 +2298,12 @@ static int btrfs_filemap_page_mkwrite(struct vm_area_struct *vma,
goto out;
}
+ down_read(&BTRFS_I(inode)->mmap_sem);
if (IS_DAX(inode))
ret = iomap_dax_fault(vma, vmf, &btrfs_iomap_ops);
else
ret = btrfs_page_mkwrite(vma, vmf);
-
+ up_read(&BTRFS_I(inode)->mmap_sem);
out:
sb_end_pagefault(inode->i_sb);
return ret;
@@ -2316,10 +2317,12 @@ static int btrfs_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode))
return btrfs_filemap_page_mkwrite(vma, vmf);
+ down_read(&BTRFS_I(inode)->mmap_sem);
if (IS_DAX(inode))
ret = iomap_dax_fault(vma, vmf, &btrfs_iomap_ops);
else
ret = filemap_fault(vma, vmf);
+ up_read(&BTRFS_I(inode)->mmap_sem);
return ret;
}
@@ -2335,17 +2338,13 @@ static int btrfs_filemap_pfn_mkwrite(struct vm_area_struct *vma,
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
- /*
- * How to serialise against truncate/hole punch similar to page_mkwrite?
- * For truncate, we firstly update isize and then truncate pagecache in
- * order to avoid race against page fault.
- * For punch_hole, we use lock_extent and truncate pagecache.
- */
+ down_read(&BTRFS_I(inode)->mmap_sem);
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (vmf->pgoff >= size)
ret = VM_FAULT_SIGBUS;
else
ret = dax_pfn_mkwrite(vma, vmf);
+ up_read(&BTRFS_I(inode)->mmap_sem);
sb_end_pagefault(sb);
return ret;
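All three fault handlers (page_mkwrite, fault, pfn_mkwrite) now share
the same shape: take mmap_sem shared around the actual fault work. A
condensed sketch of the hunks above, under a hypothetical handler name:

static int fault_shape_sketch(struct vm_area_struct *vma,
			      struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	int ret;

	/* mmap_sem shared: a concurrent truncate/punch holding it
	 * exclusive blocks us until its range is fully torn down. */
	down_read(&BTRFS_I(inode)->mmap_sem);
	if (IS_DAX(inode))
		ret = iomap_dax_fault(vma, vmf, &btrfs_iomap_ops);
	else
		ret = filemap_fault(vma, vmf);
	up_read(&BTRFS_I(inode)->mmap_sem);
	return ret;
}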
@@ -2576,6 +2575,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
BTRFS_I(inode)->root->sectorsize) - 1;
same_block = (BTRFS_BYTES_TO_BLKS(root->fs_info, offset))
== (BTRFS_BYTES_TO_BLKS(root->fs_info, offset + len - 1));
+
+ /*
+ * Prevent page faults from reinstantiating pages we have released
+ * from page cache.
+ */
+ down_write(&BTRFS_I(inode)->mmap_sem);
+
/*
* We needn't truncate any block which is beyond the end of the file
* because we are sure there is no data there.
@@ -2591,17 +2597,15 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
} else {
ret = 0;
}
- goto out_only_mutex;
+ goto out_mmap;
}
/* zero back part of the first block */
if (offset < ino_size) {
truncated_block = true;
ret = btrfs_truncate_block(inode, offset, 0, 0);
- if (ret) {
- inode_unlock(inode);
- return ret;
- }
+ if (ret)
+ goto out_mmap;
}
/* Check the aligned pages after the first unaligned page,
@@ -2614,10 +2618,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
offset = lockstart;
ret = find_first_non_hole(inode, &offset, &len);
if (ret < 0)
- goto out_only_mutex;
+ goto out_mmap;
if (ret && !len) {
ret = 0;
- goto out_only_mutex;
+ goto out_mmap;
}
lockstart = offset;
}
@@ -2628,7 +2632,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (tail_len) {
ret = find_first_non_hole(inode, &tail_start, &tail_len);
if (unlikely(ret < 0))
- goto out_only_mutex;
+ goto out_mmap;
if (!ret) {
/* zero the front end of the last page */
if (tail_start + tail_len < ino_size) {
@@ -2637,14 +2641,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
tail_start + tail_len,
0, 1);
if (ret)
- goto out_only_mutex;
+ goto out_mmap;
}
}
}
if (lockend < lockstart) {
ret = 0;
- goto out_only_mutex;
+ goto out_mmap;
}
while (1) {
@@ -2814,6 +2818,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state, GFP_NOFS);
+out_mmap:
+ up_write(&BTRFS_I(inode)->mmap_sem);
out_only_mutex:
if (!updated_inode && truncated_block && !ret && !err) {
/*
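With the new out_mmap label, teardown in btrfs_punch_hole mirrors
acquisition order. A condensed control-flow sketch of the function
after this patch, with the bodies elided:

	inode_lock(inode);
	down_write(&BTRFS_I(inode)->mmap_sem);	/* new in this patch */
	/* ... zero partial blocks, lock the extent range, punch ... */
out:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			     &cached_state, GFP_NOFS);
out_mmap:
	up_write(&BTRFS_I(inode)->mmap_sem);
out_only_mutex:
	/* ... update the inode if needed ... */
	inode_unlock(inode);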
@@ -5069,14 +5069,21 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
if (ret)
return ret;
- /* we don't support swapfiles, so vmtruncate shouldn't fail */
- truncate_setsize(inode, newsize);
+ /*
+ * Update isize first so that concurrent unlocked DIO reads beyond
+ * the new isize won't race with the truncate.
+ */
+ i_size_write(inode, newsize);
/* Disable nonlocked read DIO to avoid the end less truncate */
btrfs_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
btrfs_inode_resume_unlocked_dio(inode);
+ down_write(&BTRFS_I(inode)->mmap_sem);
+ /* we don't support swapfiles, so vmtruncate shouldn't fail */
+ truncate_pagecache(inode, newsize);
+
ret = btrfs_truncate(inode);
if (ret && inode->i_nlink) {
int err;
@@ -5089,6 +5096,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
*/
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
+ up_write(&BTRFS_I(inode)->mmap_sem);
btrfs_orphan_del(NULL, inode);
return ret;
}
@@ -5109,6 +5117,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
if (IS_DAX(inode))
ret = btrfs_truncate_block(inode, newsize, 0, 0);
}
+ up_write(&BTRFS_I(inode)->mmap_sem);
}
return ret;
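Taken together, the btrfs_setsize hunks order a shrinking truncate as
follows (condensed sketch; orphan and transaction error handling
elided):

	i_size_write(inode, newsize);		/* publish the new size first */
	btrfs_inode_block_unlocked_dio(inode);	/* quiesce nonlocked DIO */
	inode_dio_wait(inode);
	btrfs_inode_resume_unlocked_dio(inode);
	down_write(&BTRFS_I(inode)->mmap_sem);	/* block new page faults */
	truncate_pagecache(inode, newsize);	/* now safe to drop pages */
	ret = btrfs_truncate(inode);
	/* ... error paths also release mmap_sem before returning ... */
	up_write(&BTRFS_I(inode)->mmap_sem);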
@@ -9877,6 +9886,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
init_rwsem(&ei->dio_sem);
+ init_rwsem(&ei->mmap_sem);
return inode;
}
How do we serialise page faults against truncate/hole punch? For
truncate, we first update isize and then truncate the pagecache, in
order to avoid racing against page faults. For punch_hole, we use
lock_extent and truncate the pagecache. Although these rules avoid
the race, it is not easy to see how they do so. This adds a new
rw_semaphore, mmap_sem, to the btrfs inode, and takes it for writing
across truncate and hole punching, and for reading across page
faults.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  7 +++++++
 fs/btrfs/file.c        | 40 +++++++++++++++++++++++-----------------
 fs/btrfs/inode.c       | 14 ++++++++++++--
 3 files changed, 42 insertions(+), 19 deletions(-)
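For context, the window the semaphore closes when neither path holds
it, shown as an illustrative interleaving (not taken from the patch):

/*
 *   punch_hole                      page fault
 *   ----------                      ----------
 *   truncate pagecache range
 *                                   fault a page back in from the
 *                                   old extent mapping
 *   free the extents
 *                                   page now maps freed space
 *
 * With mmap_sem, down_read() in the fault path blocks until the
 * down_write() section has dropped both the pagecache pages and the
 * extents, so the fault finds a hole instead.
 */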