diff mbox

[4/4] btrfs: qgroup: Fix qgroup data leaking by using subtree tracing

Message ID 20161018013129.23331-5-quwenruo@cn.fujitsu.com (mailing list archive)
State Accepted
Headers show

Commit Message

Qu Wenruo Oct. 18, 2016, 1:31 a.m. UTC
Commit 62b99540a1d91e464 (btrfs: relocation: Fix leaking qgroups numbers
on data extents) only fixes the problem partly.

The previous fix is to trace all new data extents at transaction commit
time when balance finishes.

However balance is not done in a large transaction, every path
replacement can happen in its own transaction.
This makes the fix useless if transaction commits during relocation.

For example:
relocate_block_group()
|-merge_reloc_roots()
|  |- merge_reloc_root()
|     |- btrfs_start_transaction()         <- Trans X
|     |- replace_path()                    <- Cause leak
|     |- btrfs_end_transaction_throttle()  <- Trans X commits here
|     |                                       Leak not fixed
|     |
|     |- btrfs_start_transaction()         <- Trans Y
|     |- replace_path()                    <- Cause leak
|     |- btrfs_end_transaction_throttle()  <- Trans Y ends
|                                             but not committed
|-btrfs_join_transaction()                 <- Still trans Y
|-qgroup_fix()                             <- Only fixes data leak
|                                             in trans Y
|-btrfs_commit_transaction()               <- Trans Y commits

In that case, qgroup fixup can only fix data leak in trans Y, data leak
in trans X is out of fix.

So the correct fix should happens in the same transaction of
replace_path().

This patch fixes it by tracing both subtrees of tree block swap, so it
can fix the problem and ensure all leaking and fix are in the same
transaction, so no leak again.

Reported-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/relocation.c | 119 ++++++++++----------------------------------------
 1 file changed, 23 insertions(+), 96 deletions(-)

Comments

Goldwyn Rodrigues Nov. 3, 2016, 6:50 p.m. UTC | #1
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>

On 10/17/2016 08:31 PM, Qu Wenruo wrote:
> Commit 62b99540a1d91e464 (btrfs: relocation: Fix leaking qgroups numbers
> on data extents) only fixes the problem partly.
> 
> The previous fix is to trace all new data extents at transaction commit
> time when balance finishes.
> 
> However balance is not done in a large transaction, every path
> replacement can happen in its own transaction.
> This makes the fix useless if transaction commits during relocation.
> 
> For example:
> relocate_block_group()
> |-merge_reloc_roots()
> |  |- merge_reloc_root()
> |     |- btrfs_start_transaction()         <- Trans X
> |     |- replace_path()                    <- Cause leak
> |     |- btrfs_end_transaction_throttle()  <- Trans X commits here
> |     |                                       Leak not fixed
> |     |
> |     |- btrfs_start_transaction()         <- Trans Y
> |     |- replace_path()                    <- Cause leak
> |     |- btrfs_end_transaction_throttle()  <- Trans Y ends
> |                                             but not committed
> |-btrfs_join_transaction()                 <- Still trans Y
> |-qgroup_fix()                             <- Only fixes data leak
> |                                             in trans Y
> |-btrfs_commit_transaction()               <- Trans Y commits
> 
> In that case, qgroup fixup can only fix data leak in trans Y, data leak
> in trans X is out of fix.
> 
> So the correct fix should happens in the same transaction of
> replace_path().
> 
> This patch fixes it by tracing both subtrees of tree block swap, so it
> can fix the problem and ensure all leaking and fix are in the same
> transaction, so no leak again.
> 
> Reported-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> ---
>  fs/btrfs/relocation.c | 119 ++++++++++----------------------------------------
>  1 file changed, 23 insertions(+), 96 deletions(-)
> 
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index 33cde30..540f225 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -1901,6 +1901,29 @@ again:
>  		BUG_ON(ret);
>  
>  		/*
> +		 * Info qgroup to trace both subtrees.
> +		 *
> +		 * We must trace both trees.
> +		 * 1) Tree reloc subtree
> +		 *    If not traced, we will leak data numbers
> +		 * 2) Fs subtree
> +		 *    If not traced, we will double count old data
> +		 *    and tree block numbers, if current trans doesn't free
> +		 *    data reloc tree inode.
> +		 */
> +		ret = btrfs_qgroup_trace_subtree(trans, src, parent,
> +				btrfs_header_generation(parent),
> +				btrfs_header_level(parent));
> +		if (ret < 0)
> +			break;
> +		ret = btrfs_qgroup_trace_subtree(trans, dest,
> +				path->nodes[level],
> +				btrfs_header_generation(path->nodes[level]),
> +				btrfs_header_level(path->nodes[level]));
> +		if (ret < 0)
> +			break;
> +
> +		/*
>  		 * swap blocks in fs tree and reloc tree.
>  		 */
>  		btrfs_set_node_blockptr(parent, slot, new_bytenr);
> @@ -3942,90 +3965,6 @@ int prepare_to_relocate(struct reloc_control *rc)
>  	return 0;
>  }
>  
> -/*
> - * Qgroup fixer for data chunk relocation.
> - * The data relocation is done in the following steps
> - * 1) Copy data extents into data reloc tree
> - * 2) Create tree reloc tree(special snapshot) for related subvolumes
> - * 3) Modify file extents in tree reloc tree
> - * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks
> - *
> - * The problem is, data and tree reloc tree are not accounted to qgroup,
> - * and 4) will only info qgroup to track tree blocks change, not file extents
> - * in the tree blocks.
> - *
> - * The good news is, related data extents are all in data reloc tree, so we
> - * only need to info qgroup to track all file extents in data reloc tree
> - * before commit trans.
> - */
> -static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans,
> -					     struct reloc_control *rc)
> -{
> -	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
> -	struct inode *inode = rc->data_inode;
> -	struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root;
> -	struct btrfs_path *path;
> -	struct btrfs_key key;
> -	int ret = 0;
> -
> -	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
> -		return 0;
> -
> -	/*
> -	 * Only for stage where we update data pointers the qgroup fix is
> -	 * valid.
> -	 * For MOVING_DATA stage, we will miss the timing of swapping tree
> -	 * blocks, and won't fix it.
> -	 */
> -	if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found))
> -		return 0;
> -
> -	path = btrfs_alloc_path();
> -	if (!path)
> -		return -ENOMEM;
> -	key.objectid = btrfs_ino(inode);
> -	key.type = BTRFS_EXTENT_DATA_KEY;
> -	key.offset = 0;
> -
> -	ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0);
> -	if (ret < 0)
> -		goto out;
> -
> -	lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1);
> -	while (1) {
> -		struct btrfs_file_extent_item *fi;
> -
> -		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
> -		if (key.objectid > btrfs_ino(inode))
> -			break;
> -		if (key.type != BTRFS_EXTENT_DATA_KEY)
> -			goto next;
> -		fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
> -				    struct btrfs_file_extent_item);
> -		if (btrfs_file_extent_type(path->nodes[0], fi) !=
> -				BTRFS_FILE_EXTENT_REG)
> -			goto next;
> -		ret = btrfs_qgroup_trace_extent(trans, fs_info,
> -			btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
> -			btrfs_file_extent_disk_num_bytes(path->nodes[0], fi),
> -			GFP_NOFS);
> -		if (ret < 0)
> -			break;
> -next:
> -		ret = btrfs_next_item(data_reloc_root, path);
> -		if (ret < 0)
> -			break;
> -		if (ret > 0) {
> -			ret = 0;
> -			break;
> -		}
> -	}
> -	unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1);
> -out:
> -	btrfs_free_path(path);
> -	return ret;
> -}
> -
>  static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
>  {
>  	struct rb_root blocks = RB_ROOT;
> @@ -4216,13 +4155,6 @@ restart:
>  		err = PTR_ERR(trans);
>  		goto out_free;
>  	}
> -	ret = qgroup_fix_relocated_data_extents(trans, rc);
> -	if (ret < 0) {
> -		btrfs_abort_transaction(trans, ret);
> -		if (!err)
> -			err = ret;
> -		goto out_free;
> -	}
>  	btrfs_commit_transaction(trans, rc->extent_root);
>  out_free:
>  	btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
> @@ -4591,11 +4523,6 @@ int btrfs_recover_relocation(struct btrfs_root *root)
>  		err = PTR_ERR(trans);
>  		goto out_free;
>  	}
> -	err = qgroup_fix_relocated_data_extents(trans, rc);
> -	if (err < 0) {
> -		btrfs_abort_transaction(trans, err);
> -		goto out_free;
> -	}
>  	err = btrfs_commit_transaction(trans, rc->extent_root);
>  out_free:
>  	kfree(rc);
>
diff mbox

Patch

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 33cde30..540f225 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1901,6 +1901,29 @@  again:
 		BUG_ON(ret);
 
 		/*
+		 * Info qgroup to trace both subtrees.
+		 *
+		 * We must trace both trees.
+		 * 1) Tree reloc subtree
+		 *    If not traced, we will leak data numbers
+		 * 2) Fs subtree
+		 *    If not traced, we will double count old data
+		 *    and tree block numbers, if current trans doesn't free
+		 *    data reloc tree inode.
+		 */
+		ret = btrfs_qgroup_trace_subtree(trans, src, parent,
+				btrfs_header_generation(parent),
+				btrfs_header_level(parent));
+		if (ret < 0)
+			break;
+		ret = btrfs_qgroup_trace_subtree(trans, dest,
+				path->nodes[level],
+				btrfs_header_generation(path->nodes[level]),
+				btrfs_header_level(path->nodes[level]));
+		if (ret < 0)
+			break;
+
+		/*
 		 * swap blocks in fs tree and reloc tree.
 		 */
 		btrfs_set_node_blockptr(parent, slot, new_bytenr);
@@ -3942,90 +3965,6 @@  int prepare_to_relocate(struct reloc_control *rc)
 	return 0;
 }
 
-/*
- * Qgroup fixer for data chunk relocation.
- * The data relocation is done in the following steps
- * 1) Copy data extents into data reloc tree
- * 2) Create tree reloc tree(special snapshot) for related subvolumes
- * 3) Modify file extents in tree reloc tree
- * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks
- *
- * The problem is, data and tree reloc tree are not accounted to qgroup,
- * and 4) will only info qgroup to track tree blocks change, not file extents
- * in the tree blocks.
- *
- * The good news is, related data extents are all in data reloc tree, so we
- * only need to info qgroup to track all file extents in data reloc tree
- * before commit trans.
- */
-static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans,
-					     struct reloc_control *rc)
-{
-	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
-	struct inode *inode = rc->data_inode;
-	struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root;
-	struct btrfs_path *path;
-	struct btrfs_key key;
-	int ret = 0;
-
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
-		return 0;
-
-	/*
-	 * Only for stage where we update data pointers the qgroup fix is
-	 * valid.
-	 * For MOVING_DATA stage, we will miss the timing of swapping tree
-	 * blocks, and won't fix it.
-	 */
-	if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found))
-		return 0;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	key.objectid = btrfs_ino(inode);
-	key.type = BTRFS_EXTENT_DATA_KEY;
-	key.offset = 0;
-
-	ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0);
-	if (ret < 0)
-		goto out;
-
-	lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1);
-	while (1) {
-		struct btrfs_file_extent_item *fi;
-
-		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
-		if (key.objectid > btrfs_ino(inode))
-			break;
-		if (key.type != BTRFS_EXTENT_DATA_KEY)
-			goto next;
-		fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
-				    struct btrfs_file_extent_item);
-		if (btrfs_file_extent_type(path->nodes[0], fi) !=
-				BTRFS_FILE_EXTENT_REG)
-			goto next;
-		ret = btrfs_qgroup_trace_extent(trans, fs_info,
-			btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
-			btrfs_file_extent_disk_num_bytes(path->nodes[0], fi),
-			GFP_NOFS);
-		if (ret < 0)
-			break;
-next:
-		ret = btrfs_next_item(data_reloc_root, path);
-		if (ret < 0)
-			break;
-		if (ret > 0) {
-			ret = 0;
-			break;
-		}
-	}
-	unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1);
-out:
-	btrfs_free_path(path);
-	return ret;
-}
-
 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 {
 	struct rb_root blocks = RB_ROOT;
@@ -4216,13 +4155,6 @@  restart:
 		err = PTR_ERR(trans);
 		goto out_free;
 	}
-	ret = qgroup_fix_relocated_data_extents(trans, rc);
-	if (ret < 0) {
-		btrfs_abort_transaction(trans, ret);
-		if (!err)
-			err = ret;
-		goto out_free;
-	}
 	btrfs_commit_transaction(trans, rc->extent_root);
 out_free:
 	btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
@@ -4591,11 +4523,6 @@  int btrfs_recover_relocation(struct btrfs_root *root)
 		err = PTR_ERR(trans);
 		goto out_free;
 	}
-	err = qgroup_fix_relocated_data_extents(trans, rc);
-	if (err < 0) {
-		btrfs_abort_transaction(trans, err);
-		goto out_free;
-	}
 	err = btrfs_commit_transaction(trans, rc->extent_root);
 out_free:
 	kfree(rc);