Message ID | afd3c1009172a4a1cfa10e73a64caf35c631a6d4.1416563833.git.osandov@osandov.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Nov 21, 2014 at 02:08:31AM -0800, Omar Sandoval wrote: > Implement the swap file a_ops on btrfs. Activation simply checks for a usable > swap file: it must be fully allocated (no holes), support direct I/O (so no > compressed or inline extents) and should be nocow (I'm not sure about that last > one). > > Signed-off-by: Omar Sandoval <osandov@osandov.com> > --- > fs/btrfs/inode.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 71 insertions(+) > > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > index d23362f..b8fd36b 100644 > --- a/fs/btrfs/inode.c > +++ b/fs/btrfs/inode.c > @@ -9442,6 +9442,75 @@ out_inode: > > } > > +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, > + sector_t *span) > +{ > + struct inode *inode = file_inode(file); > + struct btrfs_inode *ip = BTRFS_I(inode); 'ip' looks strange in context of a filesystem, please pick a different name (eg. 'inode' or whatever). > + int ret = 0; > + u64 isize = inode->i_size; > + struct extent_state *cached_state = NULL; > + struct extent_map *em; > + u64 start, len; > + > + if (ip->flags & BTRFS_INODE_COMPRESS) { > + /* Can't do direct I/O on a compressed file. */ > + pr_err("BTRFS: swapfile is compressed"); Please use the btrfs_err macros everywhere. > + return -EINVAL; > + } > + if (!(ip->flags & BTRFS_INODE_NODATACOW)) { > + /* The swap file can't be copy-on-write. */ > + pr_err("BTRFS: swapfile is copy-on-write"); > + return -EINVAL; > + } > + > + lock_extent_bits(&ip->io_tree, 0, isize - 1, 0, &cached_state); > + > + /* > + * All of the extents must be allocated and support direct I/O. Inline > + * extents and compressed extents fall back to buffered I/O, so those > + * are no good. > + */ > + start = 0; > + while (start < isize) { > + len = isize - start; > + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); > + if (IS_ERR(em)) { > + ret = PTR_ERR(em); > + goto out; > + } > + > + if (test_bit(EXTENT_FLAG_VACANCY, &em->flags) || > + em->block_start == EXTENT_MAP_HOLE) { If the no-holes feature is enabled on the filesystem, there won't be any such extent representing the hole. You have to check that each two consecutive extents are adjacent. > + pr_err("BTRFS: swapfile has holes"); > + ret = -EINVAL; > + goto out; > + } > + if (em->block_start == EXTENT_MAP_INLINE) { > + pr_err("BTRFS: swapfile is inline"); While the test is valid, this would mean that the file is smaller than the inline limit, which is now one page. I think the generic swap code would refuse such a small file anyway. > + ret = -EINVAL; > + goto out; > + } > + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { > + pr_err("BTRFS: swapfile is compresed"); > + ret = -EINVAL; > + goto out; > + } I think the preallocated extents should be refused as well. This means the filesystem has enough space to hold the data but it would still have to go through the allocation and could in turn stress the memory management code that triggered the swapping activity in the first place. Though it's probably still possible to reach such corner case even with fully allocated nodatacow file, this should be reviewed anyway. > + > + start = extent_map_end(em); > + free_extent_map(em); > + } > + > +out: > + unlock_extent_cached(&ip->io_tree, 0, isize - 1, &cached_state, > + GFP_NOFS); > + return ret; > +} -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Nov 21, 2014 at 6:00 PM, David Sterba <dsterba@suse.cz> wrote: > On Fri, Nov 21, 2014 at 02:08:31AM -0800, Omar Sandoval wrote: >> Implement the swap file a_ops on btrfs. Activation simply checks for a usable >> swap file: it must be fully allocated (no holes), support direct I/O (so no >> compressed or inline extents) and should be nocow (I'm not sure about that last >> one). >> >> Signed-off-by: Omar Sandoval <osandov@osandov.com> >> --- >> fs/btrfs/inode.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 71 insertions(+) >> >> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c >> index d23362f..b8fd36b 100644 >> --- a/fs/btrfs/inode.c >> +++ b/fs/btrfs/inode.c >> @@ -9442,6 +9442,75 @@ out_inode: >> >> } >> >> +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, >> + sector_t *span) >> +{ >> + struct inode *inode = file_inode(file); >> + struct btrfs_inode *ip = BTRFS_I(inode); > > 'ip' looks strange in context of a filesystem, please pick a different > name (eg. 'inode' or whatever). > >> + int ret = 0; >> + u64 isize = inode->i_size; >> + struct extent_state *cached_state = NULL; >> + struct extent_map *em; >> + u64 start, len; >> + >> + if (ip->flags & BTRFS_INODE_COMPRESS) { >> + /* Can't do direct I/O on a compressed file. */ >> + pr_err("BTRFS: swapfile is compressed"); > > Please use the btrfs_err macros everywhere. > >> + return -EINVAL; >> + } >> + if (!(ip->flags & BTRFS_INODE_NODATACOW)) { >> + /* The swap file can't be copy-on-write. */ >> + pr_err("BTRFS: swapfile is copy-on-write"); >> + return -EINVAL; >> + } >> + >> + lock_extent_bits(&ip->io_tree, 0, isize - 1, 0, &cached_state); >> + >> + /* >> + * All of the extents must be allocated and support direct I/O. Inline >> + * extents and compressed extents fall back to buffered I/O, so those >> + * are no good. >> + */ >> + start = 0; >> + while (start < isize) { >> + len = isize - start; >> + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); >> + if (IS_ERR(em)) { >> + ret = PTR_ERR(em); >> + goto out; >> + } >> + >> + if (test_bit(EXTENT_FLAG_VACANCY, &em->flags) || >> + em->block_start == EXTENT_MAP_HOLE) { > > If the no-holes feature is enabled on the filesystem, there won't be any > such extent representing the hole. You have to check that each two > consecutive extents are adjacent. If no-holes is enabled it means file extent items aren't used to represent holes. But extent maps are still used to represent holes in memory, and that's what the code is looking for and therefore it's correct. > >> + pr_err("BTRFS: swapfile has holes"); >> + ret = -EINVAL; >> + goto out; >> + } >> + if (em->block_start == EXTENT_MAP_INLINE) { >> + pr_err("BTRFS: swapfile is inline"); > > While the test is valid, this would mean that the file is smaller than > the inline limit, which is now one page. I think the generic swap code > would refuse such a small file anyway. > >> + ret = -EINVAL; >> + goto out; >> + } >> + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { >> + pr_err("BTRFS: swapfile is compresed"); >> + ret = -EINVAL; >> + goto out; >> + } > > I think the preallocated extents should be refused as well. This means > the filesystem has enough space to hold the data but it would still have > to go through the allocation and could in turn stress the memory > management code that triggered the swapping activity in the first place. > > Though it's probably still possible to reach such corner case even with > fully allocated nodatacow file, this should be reviewed anyway. > >> + >> + start = extent_map_end(em); >> + free_extent_map(em); >> + } >> + >> +out: >> + unlock_extent_cached(&ip->io_tree, 0, isize - 1, &cached_state, >> + GFP_NOFS); >> + return ret; >> +} > -- > To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Nov 21, 2014 at 07:00:45PM +0100, David Sterba wrote: > > + pr_err("BTRFS: swapfile has holes"); > > + ret = -EINVAL; > > + goto out; > > + } > > + if (em->block_start == EXTENT_MAP_INLINE) { > > + pr_err("BTRFS: swapfile is inline"); > > While the test is valid, this would mean that the file is smaller than > the inline limit, which is now one page. I think the generic swap code > would refuse such a small file anyway. > Sure. This test doesn't really cost us anything, so I think I'd feel a little better just leaving it in. I'll add a comment for the next close reader. Besides that and Filipe's response, I'll address everything you mentioned here and in your other email in the next version, thanks. > > + ret = -EINVAL; > > + goto out; > > + } > > + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { > > + pr_err("BTRFS: swapfile is compresed"); > > + ret = -EINVAL; > > + goto out; > > + } > > I think the preallocated extents should be refused as well. This means > the filesystem has enough space to hold the data but it would still have > to go through the allocation and could in turn stress the memory > management code that triggered the swapping activity in the first place. > > Though it's probably still possible to reach such corner case even with > fully allocated nodatacow file, this should be reviewed anyway. > I'll definitely take a closer look at this. In particular, btrfs_get_blocks_direct and btrfs_get_extent do allocations in some cases which I'll look into.
On Sat, Nov 22, 2014 at 12:03:57PM -0800, Omar Sandoval wrote: > On Fri, Nov 21, 2014 at 07:00:45PM +0100, David Sterba wrote: > > > + ret = -EINVAL; > > > + goto out; > > > + } > > > + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { > > > + pr_err("BTRFS: swapfile is compresed"); > > > + ret = -EINVAL; > > > + goto out; > > > + } > > > > I think the preallocated extents should be refused as well. This means > > the filesystem has enough space to hold the data but it would still have > > to go through the allocation and could in turn stress the memory > > management code that triggered the swapping activity in the first place. > > > > Though it's probably still possible to reach such corner case even with > > fully allocated nodatacow file, this should be reviewed anyway. > > > I'll definitely take a closer look at this. In particular, > btrfs_get_blocks_direct and btrfs_get_extent do allocations in some cases which > I'll look into. > Alright, I took a look at this. My understanding is that a PREALLOC extent represents a region on disk that has already been allocated but isn't in use yet, but please correct me if I'm wrong. Judging by this comment in btrfs_get_blocks_direct, we don't have to worry about PREALLOC extents in general: /* * We don't allocate a new extent in the following cases * * 1) The inode is marked as NODATACOW. In this case we'll just use the * existing extent. * 2) The extent is marked as PREALLOC. We're good to go here and can * just use the extent. * */ A couple of other considerations that cropped up: - btrfs_get_extent does a bunch of extra work if the extent is not cached in the extent map tree that would be nice to avoid when swapping - We might still have to do a COW if the swap file is in a snapshot We can avoid the btrfs_get_extent by pinning the extents in memory one way or another in btrfs_swap_activate. The snapshot issue is a little tricker to resolve. I see a few options: 1. Just do the COW and hope for the best 2. As part of btrfs_swap_activate, COW any shared extents. If a snapshot happens while a swap file is active, we'll fall back to 1. 3. Clobber any swap file extents which are in a snapshot, i.e., always use the existing extent. I'm partial to 3, as it's the simplest approach, and I don't think it makes much sense for a swap file to be in a snapshot anyways. I'd appreciate any comments that anyone might have.
On 2014/11/25 00:03, Omar Sandoval wrote: > [snip] > > The snapshot issue is a little tricker to resolve. I see a few options: > > 1. Just do the COW and hope for the best > 2. As part of btrfs_swap_activate, COW any shared extents. If a snapshot > happens while a swap file is active, we'll fall back to 1. > 3. Clobber any swap file extents which are in a snapshot, i.e., always use the > existing extent. > > I'm partial to 3, as it's the simplest approach, and I don't think it makes > much sense for a swap file to be in a snapshot anyways. I'd appreciate any > comments that anyone might have. > Personally, 3 seems pragmatic - but not necessarily "correct". :-/
On Mon, Nov 24, 2014 at 02:03:02PM -0800, Omar Sandoval wrote: > Alright, I took a look at this. My understanding is that a PREALLOC extent > represents a region on disk that has already been allocated but isn't in use > yet, but please correct me if I'm wrong. Judging by this comment in > btrfs_get_blocks_direct, we don't have to worry about PREALLOC extents in > general: > > /* > * We don't allocate a new extent in the following cases > * > * 1) The inode is marked as NODATACOW. In this case we'll just use the > * existing extent. > * 2) The extent is marked as PREALLOC. We're good to go here and can > * just use the extent. > * > */ Ok, thanks for checking. > A couple of other considerations that cropped up: > > - btrfs_get_extent does a bunch of extra work if the extent is not cached in > the extent map tree that would be nice to avoid when swapping > - We might still have to do a COW if the swap file is in a snapshot > > We can avoid the btrfs_get_extent by pinning the extents in memory one way or > another in btrfs_swap_activate. That's preferrable, the overhead should be small. > The snapshot issue is a little tricker to resolve. I see a few options: > > 1. Just do the COW and hope for the best Snapshotting of NODATACOW files work, the extents are COWed on the first write, the original owner sees no change. > 2. As part of btrfs_swap_activate, COW any shared extents. If a snapshot > happens while a swap file is active, we'll fall back to 1. So we should make sure that any write to the swapfile will not lead to the first-COW, this would require to scan all the extents and see if we're the owner and COW eventually. Doing that automatically is IMO better from the user perspective, compared to an erroring out and requesting a manual "dd" over the file. Possibly, the implied COW could create preallocated extents so we do not have to rewrite the data. > 3. Clobber any swap file extents which are in a snapshot, i.e., always use the > existing extent. Easy to implement but could lead to bad suprises. More thoughts: There are some guards in the generic code to prevent unwanted modifications to the swapfiles (eg. no truncate, no fallocate, no deletion). Further we should audit btrfs ioctls that may interfere with swap and forbid any action (notably the cloning and deduplication ones). -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d23362f..b8fd36b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9442,6 +9442,75 @@ out_inode: } +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + struct inode *inode = file_inode(file); + struct btrfs_inode *ip = BTRFS_I(inode); + int ret = 0; + u64 isize = inode->i_size; + struct extent_state *cached_state = NULL; + struct extent_map *em; + u64 start, len; + + if (ip->flags & BTRFS_INODE_COMPRESS) { + /* Can't do direct I/O on a compressed file. */ + pr_err("BTRFS: swapfile is compressed"); + return -EINVAL; + } + if (!(ip->flags & BTRFS_INODE_NODATACOW)) { + /* The swap file can't be copy-on-write. */ + pr_err("BTRFS: swapfile is copy-on-write"); + return -EINVAL; + } + + lock_extent_bits(&ip->io_tree, 0, isize - 1, 0, &cached_state); + + /* + * All of the extents must be allocated and support direct I/O. Inline + * extents and compressed extents fall back to buffered I/O, so those + * are no good. + */ + start = 0; + while (start < isize) { + len = isize - start; + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (test_bit(EXTENT_FLAG_VACANCY, &em->flags) || + em->block_start == EXTENT_MAP_HOLE) { + pr_err("BTRFS: swapfile has holes"); + ret = -EINVAL; + goto out; + } + if (em->block_start == EXTENT_MAP_INLINE) { + pr_err("BTRFS: swapfile is inline"); + ret = -EINVAL; + goto out; + } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + pr_err("BTRFS: swapfile is compresed"); + ret = -EINVAL; + goto out; + } + + start = extent_map_end(em); + free_extent_map(em); + } + +out: + unlock_extent_cached(&ip->io_tree, 0, isize - 1, &cached_state, + GFP_NOFS); + return ret; +} + +static void btrfs_swap_deactivate(struct file *file) +{ +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -9519,6 +9588,8 @@ static const struct address_space_operations btrfs_aops = { .releasepage = btrfs_releasepage, .set_page_dirty = btrfs_set_page_dirty, .error_remove_page = generic_error_remove_page, + .swap_activate = btrfs_swap_activate, + .swap_deactivate = btrfs_swap_deactivate, }; static const struct address_space_operations btrfs_symlink_aops = {
Implement the swap file a_ops on btrfs. Activation simply checks for a usable swap file: it must be fully allocated (no holes), support direct I/O (so no compressed or inline extents) and should be nocow (I'm not sure about that last one). Signed-off-by: Omar Sandoval <osandov@osandov.com> --- fs/btrfs/inode.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+)