diff mbox

Btrfs: allow delayed refs to be merged

Message ID 1341946331-18058-1-git-send-email-jbacik@fusionio.com (mailing list archive)
State New, archived
Headers show

Commit Message

Josef Bacik July 10, 2012, 6:52 p.m. UTC
Daniel Blueman reported a bug with fio+balance on a ramdisk setup.
Basically what happens is the balance relocates a tree block which will drop
the implicit refs for all of its children and adds a full backref.  Once the
block is relocated we have to add the implicit refs back, so when we cow the
block again we add the implicit refs for its children back.  The problem
comes when the original drop ref doesn't get run before we add the implicit
refs back.  The delayed ref stuff will specifically prefer ADD operations
over DROP to keep us from freeing up an extent that will have references to
it, so we try to add the implicit ref before it is actually removed and we
panic.  This worked fine before because the add would have just canceled the
drop out and we would have been fine.  But the backref walking work needs to
be able to freeze the delayed ref stuff in time so we have this ever
increasing sequence number that gets attached to all new delayed ref updates
which makes us not merge refs and we run into this issue.

So since the backref walking stuff doesn't get run all that often we just
ignore the sequence updates until somebody actually tries to do the freeze.
Then if we try to run the delayed refs we go back and try to merge them in
case we get a sequence like this again so we do not panic.

I need the consumers of the backref resolution code to test this heavily and
make sure it doesn't break them.  It makes Daniels original problem go away.
Thanks,

Reported-by: Daniel J Blueman <daniel@quora.org>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/ctree.c       |    2 +-
 fs/btrfs/ctree.h       |    4 +-
 fs/btrfs/delayed-ref.c |  146 ++++++++++++++++++++++++++++++++++++++++--------
 fs/btrfs/delayed-ref.h |    6 ++-
 fs/btrfs/extent-tree.c |   20 +++++-
 5 files changed, 146 insertions(+), 32 deletions(-)

Comments

Arne Jansen July 10, 2012, 7:39 p.m. UTC | #1
On 07/10/2012 08:52 PM, Josef Bacik wrote:
> Daniel Blueman reported a bug with fio+balance on a ramdisk setup.
> Basically what happens is the balance relocates a tree block which will drop
> the implicit refs for all of its children and adds a full backref.  Once the
> block is relocated we have to add the implicit refs back, so when we cow the
> block again we add the implicit refs for its children back.  The problem
> comes when the original drop ref doesn't get run before we add the implicit
> refs back.  The delayed ref stuff will specifically prefer ADD operations
> over DROP to keep us from freeing up an extent that will have references to
> it, so we try to add the implicit ref before it is actually removed and we
> panic.  This worked fine before because the add would have just canceled the
> drop out and we would have been fine.  But the backref walking work needs to
> be able to freeze the delayed ref stuff in time so we have this ever
> increasing sequence number that gets attached to all new delayed ref updates
> which makes us not merge refs and we run into this issue.
> 
> So since the backref walking stuff doesn't get run all that often we just
> ignore the sequence updates until somebody actually tries to do the freeze.
> Then if we try to run the delayed refs we go back and try to merge them in
> case we get a sequence like this again so we do not panic.

Subvolume quota will also use it, so it might get used _very_ often.
Please give me some time to understand the problem deeper. This patch
adds a lot of complexity, and I'd prefer to find a solution that adds
none :)

Thanks,
Arne

> 
> I need the consumers of the backref resolution code to test this heavily and
> make sure it doesn't break them.  It makes Daniels original problem go away.
> Thanks,
> 
> Reported-by: Daniel J Blueman <daniel@quora.org>
> Signed-off-by: Josef Bacik <jbacik@fusionio.com>
> ---
>  fs/btrfs/ctree.c       |    2 +-
>  fs/btrfs/ctree.h       |    4 +-
>  fs/btrfs/delayed-ref.c |  146 ++++++++++++++++++++++++++++++++++++++++--------
>  fs/btrfs/delayed-ref.h |    6 ++-
>  fs/btrfs/extent-tree.c |   20 +++++-
>  5 files changed, 146 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 8206b39..a914580 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -846,7 +846,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
>  		if (new_flags != 0) {
>  			ret = btrfs_set_disk_extent_flags(trans, root,
>  							  buf->start,
> -							  buf->len,
> +							  buf->len, owner,
>  							  new_flags, 0);
>  			if (ret)
>  				return ret;
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index fa5c45b..1b527bc 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -2574,8 +2574,8 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>  		  struct extent_buffer *buf, int full_backref, int for_cow);
>  int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
>  				struct btrfs_root *root,
> -				u64 bytenr, u64 num_bytes, u64 flags,
> -				int is_data);
> +				u64 bytenr, u64 num_bytes, u64 ref_root,
> +				u64 flags, int is_data);
>  int btrfs_free_extent(struct btrfs_trans_handle *trans,
>  		      struct btrfs_root *root,
>  		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
> index 13ae7b0..93b7df1 100644
> --- a/fs/btrfs/delayed-ref.c
> +++ b/fs/btrfs/delayed-ref.c
> @@ -85,7 +85,8 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
>   * type of the delayed backrefs and content of delayed backrefs.
>   */
>  static int comp_entry(struct btrfs_delayed_ref_node *ref2,
> -		      struct btrfs_delayed_ref_node *ref1)
> +		      struct btrfs_delayed_ref_node *ref1,
> +		      bool compare_seq)
>  {
>  	if (ref1->bytenr < ref2->bytenr)
>  		return -1;
> @@ -102,10 +103,12 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
>  	if (ref1->type > ref2->type)
>  		return 1;
>  	/* merging of sequenced refs is not allowed */
> -	if (ref1->seq < ref2->seq)
> -		return -1;
> -	if (ref1->seq > ref2->seq)
> -		return 1;
> +	if (compare_seq) {
> +		if (ref1->seq < ref2->seq)
> +			return -1;
> +		if (ref1->seq > ref2->seq)
> +			return 1;
> +	}
>  	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
>  	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
>  		return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
> @@ -139,7 +142,7 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
>  		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
>  				 rb_node);
>  
> -		cmp = comp_entry(entry, ins);
> +		cmp = comp_entry(entry, ins, 1);
>  		if (cmp < 0)
>  			p = &(*p)->rb_left;
>  		else if (cmp > 0)
> @@ -233,6 +236,101 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
>  	return 0;
>  }
>  
> +static void inline drop_delayed_ref(struct btrfs_trans_handle *trans,
> +				    struct btrfs_delayed_ref_root *delayed_refs,
> +				    struct btrfs_delayed_ref_node *ref)
> +{
> +	rb_erase(&ref->rb_node, &delayed_refs->root);
> +	ref->in_tree = 0;
> +	btrfs_put_delayed_ref(ref);
> +	delayed_refs->num_entries--;
> +	if (trans->delayed_ref_updates)
> +		trans->delayed_ref_updates--;
> +}
> +
> +static int merge_ref(struct btrfs_trans_handle *trans,
> +		     struct btrfs_delayed_ref_root *delayed_refs,
> +		     struct btrfs_delayed_ref_node *ref,
> +		     u64 seq)
> +{
> +	struct rb_node *node;
> +	int merged = 0;
> +
> +again:
> +	node = rb_prev(&ref->rb_node);
> +	while (node) {
> +		struct btrfs_delayed_ref_node *next;
> +
> +		next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
> +		node = rb_prev(node);
> +		if (next->bytenr != ref->bytenr)
> +			break;
> +		if (seq && next->seq > seq)
> +			break;
> +		if (comp_entry(ref, next, 0))
> +			continue;
> +
> +		/*
> +		 * We don't care about multiple adds, we just want to cancel out
> +		 * a drop+add
> +		 */
> +		if (ref->action == next->action)
> +			continue;
> +
> +		merged++;
> +		drop_delayed_ref(trans, delayed_refs, next);
> +		if (--ref->ref_mod == 0) {
> +			drop_delayed_ref(trans, delayed_refs, ref);
> +			break;
> +		}
> +		goto again;
> +	}
> +
> +	return merged;
> +}
> +
> +int btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
> +			     struct btrfs_delayed_ref_root *delayed_refs,
> +			     struct btrfs_delayed_ref_head *head)
> +{
> +	struct rb_node *node;
> +	int merged = 0;
> +	u64 seq = 0;
> +
> +	/* We didn't span seq counts for this ref head */
> +	if (likely(!head->need_merge))
> +		return 0;
> +
> +	if (!list_empty(&delayed_refs->seq_head)) {
> +		struct seq_list *elem;
> +
> +		elem = list_first_entry(&delayed_refs->seq_head,
> +					struct seq_list, list);
> +		seq = elem->seq;
> +	}
> +
> +	node = rb_prev(&head->node.rb_node);
> +	while (node) {
> +		struct btrfs_delayed_ref_node *ref;
> +
> +		ref = rb_entry(node, struct btrfs_delayed_ref_node,
> +			       rb_node);
> +		if (ref->bytenr != head->node.bytenr)
> +			break;
> +		/* We can't merge refs that are outside of our seq count */
> +		if (seq && ref->seq > seq)
> +			break;
> +		if (merge_ref(trans, delayed_refs, ref, seq)) {
> +			merged = 1;
> +			node = rb_prev(&head->node.rb_node);
> +		} else {
> +			node = rb_prev(node);
> +		}
> +	}
> +
> +	return merged;
> +}
> +
>  int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
>  			    u64 seq)
>  {
> @@ -332,18 +430,11 @@ update_existing_ref(struct btrfs_trans_handle *trans,
>  		 * every changing the extent allocation tree.
>  		 */
>  		existing->ref_mod--;
> -		if (existing->ref_mod == 0) {
> -			rb_erase(&existing->rb_node,
> -				 &delayed_refs->root);
> -			existing->in_tree = 0;
> -			btrfs_put_delayed_ref(existing);
> -			delayed_refs->num_entries--;
> -			if (trans->delayed_ref_updates)
> -				trans->delayed_ref_updates--;
> -		} else {
> +		if (existing->ref_mod == 0)
> +			drop_delayed_ref(trans, delayed_refs, existing);
> +		else
>  			WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
>  				existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
> -		}
>  	} else {
>  		WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
>  			existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
> @@ -423,7 +514,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
>  static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
>  					struct btrfs_trans_handle *trans,
>  					struct btrfs_delayed_ref_node *ref,
> -					u64 bytenr, u64 num_bytes,
> +					u64 bytenr, u64 num_bytes, u64 ref_root,
>  					int action, int is_data)
>  {
>  	struct btrfs_delayed_ref_node *existing;
> @@ -431,6 +522,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
>  	struct btrfs_delayed_ref_root *delayed_refs;
>  	int count_mod = 1;
>  	int must_insert_reserved = 0;
> +	int need_merge = 0;
>  
>  	/*
>  	 * the head node stores the sum of all the mods, so dropping a ref
> @@ -469,6 +561,8 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
>  	ref->is_head = 1;
>  	ref->in_tree = 1;
>  	ref->seq = 0;
> +	if (is_fstree(ref_root) && !list_empty(&delayed_refs->seq_head))
> +		need_merge = 1;
>  
>  	head_ref = btrfs_delayed_node_to_head(ref);
>  	head_ref->must_insert_reserved = must_insert_reserved;
> @@ -488,12 +582,16 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
>  		 * allocated ref
>  		 */
>  		kfree(head_ref);
> +		head_ref = btrfs_delayed_node_to_head(existing);
>  	} else {
>  		delayed_refs->num_heads++;
>  		delayed_refs->num_heads_ready++;
>  		delayed_refs->num_entries++;
>  		trans->delayed_ref_updates++;
>  	}
> +
> +	if (!head_ref->need_merge)
> +		head_ref->need_merge = need_merge;
>  }
>  
>  /*
> @@ -525,7 +623,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
>  	ref->is_head = 0;
>  	ref->in_tree = 1;
>  
> -	if (is_fstree(ref_root))
> +	if (is_fstree(ref_root) && !list_empty(&delayed_refs->seq_head))
>  		seq = inc_delayed_seq(delayed_refs);
>  	ref->seq = seq;
>  
> @@ -584,7 +682,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
>  	ref->is_head = 0;
>  	ref->in_tree = 1;
>  
> -	if (is_fstree(ref_root))
> +	if (is_fstree(ref_root) && !list_empty(&delayed_refs->seq_head))
>  		seq = inc_delayed_seq(delayed_refs);
>  	ref->seq = seq;
>  
> @@ -653,7 +751,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
>  	 * the spin lock
>  	 */
>  	add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
> -				   num_bytes, action, 0);
> +				   num_bytes, ref_root, action, 0);
>  
>  	add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
>  				   num_bytes, parent, ref_root, level, action,
> @@ -702,7 +800,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
>  	 * the spin lock
>  	 */
>  	add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
> -				   num_bytes, action, 1);
> +				   num_bytes, ref_root, action, 1);
>  
>  	add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
>  				   num_bytes, parent, ref_root, owner, offset,
> @@ -717,7 +815,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
>  
>  int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
>  				struct btrfs_trans_handle *trans,
> -				u64 bytenr, u64 num_bytes,
> +				u64 bytenr, u64 num_bytes, u64 ref_root,
>  				struct btrfs_delayed_extent_op *extent_op)
>  {
>  	struct btrfs_delayed_ref_head *head_ref;
> @@ -733,8 +831,8 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
>  	spin_lock(&delayed_refs->lock);
>  
>  	add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
> -				   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
> -				   extent_op->is_data);
> +			     num_bytes, ref_root, BTRFS_UPDATE_DELAYED_HEAD,
> +			     extent_op->is_data);
>  
>  	if (waitqueue_active(&delayed_refs->seq_wait))
>  		wake_up(&delayed_refs->seq_wait);
> diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
> index 413927f..f3cad4a 100644
> --- a/fs/btrfs/delayed-ref.h
> +++ b/fs/btrfs/delayed-ref.h
> @@ -97,6 +97,7 @@ struct btrfs_delayed_ref_head {
>  	 */
>  	unsigned int must_insert_reserved:1;
>  	unsigned int is_data:1;
> +	unsigned int need_merge:1;
>  };
>  
>  struct btrfs_delayed_tree_ref {
> @@ -185,8 +186,11 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
>  			       int for_cow);
>  int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
>  				struct btrfs_trans_handle *trans,
> -				u64 bytenr, u64 num_bytes,
> +				u64 bytenr, u64 num_bytes, u64 ref_root,
>  				struct btrfs_delayed_extent_op *extent_op);
> +int btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
> +			     struct btrfs_delayed_ref_root *delayed_refs,
> +			     struct btrfs_delayed_ref_head *head);
>  
>  struct btrfs_delayed_ref_head *
>  btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 5e552f9..fa25ca9 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2271,6 +2271,16 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
>  		}
>  
>  		/*
> +		 * We need to try and merge add/drops of the same ref since we
> +		 * can run into issues with relocate dropping the implicit ref
> +		 * and then it being added back again before the drop can
> +		 * finish.  If we merged anything we need to re-loop so we can
> +		 * get a good ref.
> +		 */
> +		if (btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref))
> +			continue;
> +
> +		/*
>  		 * record the must insert reserved flag before we
>  		 * drop the spin lock.
>  		 */
> @@ -2507,8 +2517,8 @@ out:
>  
>  int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
>  				struct btrfs_root *root,
> -				u64 bytenr, u64 num_bytes, u64 flags,
> -				int is_data)
> +				u64 bytenr, u64 num_bytes, u64 ref_root,
> +				u64 flags, int is_data)
>  {
>  	struct btrfs_delayed_extent_op *extent_op;
>  	int ret;
> @@ -2523,7 +2533,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
>  	extent_op->is_data = is_data ? 1 : 0;
>  
>  	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
> -					  num_bytes, extent_op);
> +					  num_bytes, ref_root, extent_op);
>  	if (ret)
>  		kfree(extent_op);
>  	return ret;
> @@ -6464,7 +6474,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
>  		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
>  		BUG_ON(ret); /* -ENOMEM */
>  		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
> -						  eb->len, flag, 0);
> +						  eb->len,
> +						  btrfs_header_owner(eb),
> +						  flag, 0);
>  		BUG_ON(ret); /* -ENOMEM */
>  		wc->flags[level] |= flag;
>  	}

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik July 10, 2012, 7:52 p.m. UTC | #2
On Tue, Jul 10, 2012 at 01:39:42PM -0600, Arne Jansen wrote:
> On 07/10/2012 08:52 PM, Josef Bacik wrote:
> > Daniel Blueman reported a bug with fio+balance on a ramdisk setup.
> > Basically what happens is the balance relocates a tree block which will drop
> > the implicit refs for all of its children and adds a full backref.  Once the
> > block is relocated we have to add the implicit refs back, so when we cow the
> > block again we add the implicit refs for its children back.  The problem
> > comes when the original drop ref doesn't get run before we add the implicit
> > refs back.  The delayed ref stuff will specifically prefer ADD operations
> > over DROP to keep us from freeing up an extent that will have references to
> > it, so we try to add the implicit ref before it is actually removed and we
> > panic.  This worked fine before because the add would have just canceled the
> > drop out and we would have been fine.  But the backref walking work needs to
> > be able to freeze the delayed ref stuff in time so we have this ever
> > increasing sequence number that gets attached to all new delayed ref updates
> > which makes us not merge refs and we run into this issue.
> >
> > So since the backref walking stuff doesn't get run all that often we just
> > ignore the sequence updates until somebody actually tries to do the freeze.
> > Then if we try to run the delayed refs we go back and try to merge them in
> > case we get a sequence like this again so we do not panic.
> 
> Subvolume quota will also use it, so it might get used _very_ often.
> Please give me some time to understand the problem deeper. This patch
> adds a lot of complexity, and I'd prefer to find a solution that adds
> none :)
> 

If you've got a better idea then go for it, but I'm coming up short.  One way or
another we need these operations to cancel out of they are both on the same ref
head at the same time.  We may be able to do something like make sure the full
backrefs are added first, then let implicit ref deletes happen, and then let
implicit ref adds happen, but then you are adding even more weird logic to what
can be run when.

The other option is to make relocate not do this dance at all, and I'm not
entirely sure how you would go about this.  I think we are ok leaving the
implicit ref because frankly the children all still belong to the original root,
but I don't understand the relocate code enough to decide if thats ok.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Schmidt July 11, 2012, 1:19 p.m. UTC | #3
Hi Josef,

I hit a warning with this patch on top of the current cmason/for-linus
branch. Takes about 15 minutes to produce when running xfstest 278 in
a loop and, in another shell, doing fsstress on the same volume to
force metadata modifications.

fs/btrfs/extent-tree.c
...
5032         } else if (ret == -ENOENT) {
5033                 btrfs_print_leaf(extent_root, path->nodes[0]);
5034                 WARN_ON(1);
5035                 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
5036                        "parent %llu root %llu  owner %llu offset %llu\n",
5037                        (unsigned long long)bytenr,
5038                        (unsigned long long)parent,
5039                        (unsigned long long)root_objectid,
5040                        (unsigned long long)owner_objectid,
5041                        (unsigned long long)owner_offset);
5042         } else {

<6>[469120.918975] leaf 133349376 total ptrs 36 free space 1187
<6>[469120.918978]      item 0 key (1070149632 a8 53248) itemoff 3942 itemsize 53
<6>[469120.918981]              extent refs 1 gen 358 flags 1
<6>[469120.918983]              extent data backref root 5 objectid 40286 offset 0 count 1
<6>[469120.918986]      item 1 key (1070202880 a8 53248) itemoff 3889 itemsize 53
<6>[469120.918988]              extent refs 1 gen 358 flags 1
<6>[469120.918989]              extent data backref root 5 objectid 40287 offset 0 count 1
<6>[469120.918992]      item 2 key (1070256128 a8 53248) itemoff 3836 itemsize 53
<6>[469120.918994]              extent refs 1 gen 358 flags 1
<6>[469120.918996]              extent data backref root 5 objectid 40288 offset 0 count 1
<6>[469120.918998]      item 3 key (1070309376 a8 53248) itemoff 3783 itemsize 53
<6>[469120.919000]              extent refs 1 gen 358 flags 1
<6>[469120.919002]              extent data backref root 5 objectid 40289 offset 0 count 1
<6>[469120.919004]      item 4 key (1070362624 a8 53248) itemoff 3730 itemsize 53
<6>[469120.919006]              extent refs 1 gen 358 flags 1
<6>[469120.919008]              extent data backref root 5 objectid 40290 offset 0 count 1
<6>[469120.919010]      item 5 key (1070415872 a8 53248) itemoff 3677 itemsize 53
<6>[469120.919012]              extent refs 1 gen 358 flags 1
<6>[469120.919014]              extent data backref root 5 objectid 40291 offset 0 count 1
<6>[469120.919016]      item 6 key (1070469120 a8 53248) itemoff 3624 itemsize 53
<6>[469120.919018]              extent refs 1 gen 358 flags 1
<6>[469120.919020]              extent data backref root 5 objectid 40292 offset 0 count 1
<6>[469120.919022]      item 7 key (1070522368 a8 53248) itemoff 3571 itemsize 53
<6>[469120.919024]              extent refs 1 gen 358 flags 1
<6>[469120.919026]              extent data backref root 5 objectid 40293 offset 0 count 1
<6>[469120.919028]      item 8 key (1070575616 a8 53248) itemoff 3518 itemsize 53
<6>[469120.919030]              extent refs 1 gen 358 flags 1
<6>[469120.919032]              extent data backref root 5 objectid 40294 offset 0 count 1
<6>[469120.919034]      item 9 key (1070628864 a8 53248) itemoff 3465 itemsize 53
<6>[469120.919036]              extent refs 1 gen 358 flags 1
<6>[469120.919038]              extent data backref root 5 objectid 40295 offset 0 count 1
<6>[469120.919040]      item 10 key (1070682112 a8 53248) itemoff 3412 itemsize 53
<6>[469120.919042]              extent refs 1 gen 358 flags 1
<6>[469120.919044]              extent data backref root 5 objectid 40296 offset 0 count 1
<6>[469120.919046]      item 11 key (1070735360 a8 53248) itemoff 3359 itemsize 53
<6>[469120.919048]              extent refs 1 gen 358 flags 1
<6>[469120.919050]              extent data backref root 5 objectid 40297 offset 0 count 1
<6>[469120.919052]      item 12 key (1071714304 a8 32768) itemoff 3306 itemsize 53
<6>[469120.919054]              extent refs 1 gen 791704 flags 1
<6>[469120.919056]              extent data backref root 5 objectid 14760797 offset 475136 count 1
<6>[469120.919059]      item 13 key (1071755264 a8 49152) itemoff 3253 itemsize 53
<6>[469120.919061]              extent refs 1 gen 791702 flags 1
<6>[469120.919062]              extent data backref root 5 objectid 14760670 offset 651264 count 1
<6>[469120.919065]      item 14 key (1072099328 a8 258048) itemoff 3200 itemsize 53
<6>[469120.919067]              extent refs 1 gen 791694 flags 1
<6>[469120.919069]              extent data backref root 5 objectid 14760709 offset 548864 count 1
<6>[469120.919071]      item 15 key (1072410624 a8 94208) itemoff 3147 itemsize 53
<6>[469120.919073]              extent refs 1 gen 791690 flags 1
<6>[469120.919075]              extent data backref root 5 objectid 14760670 offset 4096 count 1
<6>[469120.919077]      item 16 key (1072635904 a8 86016) itemoff 3094 itemsize 53
<6>[469120.919079]              extent refs 1 gen 791697 flags 1
<6>[469120.919081]              extent data backref root 5 objectid 14760652 offset 864256 count 1
<6>[469120.919084]      item 17 key (1072734208 a8 102400) itemoff 3041 itemsize 53
<6>[469120.919086]              extent refs 1 gen 791690 flags 1
<6>[469120.919088]              extent data backref root 5 objectid 14760665 offset 937984 count 1
<6>[469120.919090]      item 18 key (1072836608 a8 163840) itemoff 2988 itemsize 53
<6>[469120.919092]              extent refs 1 gen 791697 flags 1
<6>[469120.919094]              extent data backref root 5 objectid 14760641 offset 339968 count 1
<6>[469120.919096]      item 19 key (1073119232 a8 106496) itemoff 2935 itemsize 53
<6>[469120.919098]              extent refs 1 gen 791697 flags 1
<6>[469120.919100]              extent data backref root 5 objectid 14760665 offset 688128 count 1
<6>[469120.919103]      item 20 key (1073225728 a8 45056) itemoff 2882 itemsize 53
<6>[469120.919104]              extent refs 1 gen 791699 flags 1
<6>[469120.919106]              extent data backref root 5 objectid 14760674 offset 909312 count 1
<6>[469120.919109]      item 21 key (1073270784 a8 69632) itemoff 2829 itemsize 53
<6>[469120.919111]              extent refs 1 gen 791700 flags 1
<6>[469120.919113]              extent data backref root 5 objectid 14760665 offset 3403776 count 1
<6>[469120.919115]      item 22 key (1073356800 a8 73728) itemoff 2776 itemsize 53
<6>[469120.919117]              extent refs 1 gen 791699 flags 1
<6>[469120.919119]              extent data backref root 5 objectid 14760683 offset 1835008 count 1
<6>[469120.919121]      item 23 key (1073430528 a8 57344) itemoff 2723 itemsize 53
<6>[469120.919123]              extent refs 1 gen 791699 flags 1
<6>[469120.919125]              extent data backref root 5 objectid 14760713 offset 266240 count 1
<6>[469120.919128]      item 24 key (1073487872 a8 65536) itemoff 2670 itemsize 53
<6>[469120.919130]              extent refs 1 gen 791700 flags 1
<6>[469120.919131]              extent data backref root 1 objectid 260 offset 0 count 1
<6>[469120.919134]      item 25 key (1073553408 a8 40960) itemoff 2617 itemsize 53
<6>[469120.919136]              extent refs 1 gen 791701 flags 1
<6>[469120.919138]              extent data backref root 5 objectid 14760660 offset 1241088 count 1
<6>[469120.919140]      item 26 key (1073602560 a8 131072) itemoff 2564 itemsize 53
<6>[469120.919142]              extent refs 1 gen 791691 flags 1
<6>[469120.919144]              extent data backref root 5 objectid 14760686 offset 212992 count 1
<6>[469120.919146]      item 27 key (1073733632 a8 180224) itemoff 2511 itemsize 53
<6>[469120.919148]              extent refs 1 gen 791691 flags 1
<6>[469120.919150]              extent data backref root 5 objectid 14760686 offset 32768 count 1
<6>[469120.919153]      item 28 key (1073913856 a8 245760) itemoff 2458 itemsize 53
<6>[469120.919155]              extent refs 3 gen 791691 flags 1
<6>[469120.919157]              extent data backref root 5 objectid 14760686 offset 344064 count 3
<6>[469120.919159]      item 29 key (1074159616 a8 131072) itemoff 2405 itemsize 53
<6>[469120.919161]              extent refs 1 gen 791691 flags 1
<6>[469120.919163]              extent data backref root 5 objectid 14760655 offset 147456 count 1
<6>[469120.919165]      item 30 key (1074290688 a8 724992) itemoff 2352 itemsize 53
<6>[469120.919167]              extent refs 1 gen 791693 flags 1
<6>[469120.919169]              extent data backref root 5 objectid 14760673 offset 2031616 count 1
<6>[469120.919172]      item 31 key (1075015680 a8 270336) itemoff 2299 itemsize 53
<6>[469120.919174]              extent refs 1 gen 791704 flags 1
<6>[469120.919175]              extent data backref root 5 objectid 14760728 offset 413696 count 1
<6>[469120.919178]      item 32 key (1075625984 a8 126976) itemoff 2246 itemsize 53
<6>[469120.919180]              extent refs 1 gen 791704 flags 1
<6>[469120.919182]              extent data backref root 5 objectid 14760670 offset 1712128 count 1
<6>[469120.919184]      item 33 key (1075867648 a8 61440) itemoff 2193 itemsize 53
<6>[469120.919186]              extent refs 1 gen 791704 flags 1
<6>[469120.919188]              extent data backref root 5 objectid 14760743 offset 512000 count 1
<6>[469120.919190]      item 34 key (1075953664 a8 131072) itemoff 2140 itemsize 53
<6>[469120.919192]              extent refs 1 gen 791698 flags 1
<6>[469120.919194]              extent data backref root 5 objectid 14760673 offset 1437696 count 1
<6>[469120.919197]      item 35 key (1076084736 a8 118784) itemoff 2087 itemsize 53
<6>[469120.919199]              extent refs 1 gen 791704 flags 1
<6>[469120.919201]              extent data backref root 5 objectid 14760683 offset 561152 count 1
<4>[469120.919202] ------------[ cut here ]------------
<4>[469120.919216] WARNING: at fs/btrfs/extent-tree.c:5034 __btrfs_free_extent+0x675/0x740 [btrfs]()
<4>[469120.919218] Hardware name: X8SIL
<4>[469120.919220] Modules linked in: btrfs mpt2sas scsi_transport_sas raid_class [last unloaded: btrfs]
<4>[469120.919228] Pid: 31351, comm: rm Tainted: G        W    3.4.0+ #25
<4>[469120.919230] Call Trace:
<4>[469120.919235]  [<ffffffff8108101a>] warn_slowpath_common+0x7a/0xb0
<4>[469120.919238]  [<ffffffff81081065>] warn_slowpath_null+0x15/0x20
<4>[469120.919249]  [<ffffffffa0136bc5>] __btrfs_free_extent+0x675/0x740 [btrfs]
<4>[469120.919260]  [<ffffffffa01374fd>] run_clustered_refs+0x86d/0x9d0 [btrfs]
<4>[469120.919270]  [<ffffffffa0137802>] btrfs_run_delayed_refs+0x1a2/0x500 [btrfs]
<4>[469120.919283]  [<ffffffffa01485e5>] __btrfs_end_transaction+0xa5/0x270 [btrfs]
<4>[469120.919295]  [<ffffffffa0148810>] btrfs_end_transaction+0x10/0x20 [btrfs]
<4>[469120.919308]  [<ffffffffa014ed14>] btrfs_evict_inode+0x254/0x300 [btrfs]
<4>[469120.919312]  [<ffffffff811ab34a>] evict+0x9a/0x190
<4>[469120.919315]  [<ffffffff811ab54c>] iput+0x10c/0x200
<4>[469120.919318]  [<ffffffff811a1347>] do_unlinkat+0x167/0x1d0
<4>[469120.919322]  [<ffffffff811cef5e>] ? fsnotify_find_inode_mark+0x3e/0x50
<4>[469120.919326]  [<ffffffff8190eac7>] ? sysret_check+0x1b/0x56
<4>[469120.919330]  [<ffffffff810d9aa5>] ? trace_hardirqs_on_caller+0x105/0x190
<4>[469120.919334]  [<ffffffff8142628e>] ? trace_hardirqs_on_thunk+0x3a/0x3f
<4>[469120.919337]  [<ffffffff811a152d>] sys_unlinkat+0x1d/0x40
<4>[469120.919340]  [<ffffffff8190eaa2>] system_call_fastpath+0x16/0x1b
<4>[469120.919342] ---[ end trace 1dfd6883330a5208 ]---
<3>[469120.919345] btrfs unable to find ref byte nr 1072283648 parent 0 root 5  owner 14760652 offset 819200

-Jan
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik July 12, 2012, 5:05 p.m. UTC | #4
On Wed, Jul 11, 2012 at 07:19:29AM -0600, Jan Schmidt wrote:
> Hi Josef,
> 
> I hit a warning with this patch on top of the current cmason/for-linus
> branch. Takes about 15 minutes to produce when running xfstest 278 in
> a loop and, in another shell, doing fsstress on the same volume to
> force metadata modifications.
> 
> fs/btrfs/extent-tree.c
> ...
> 5032         } else if (ret == -ENOENT) {
> 5033                 btrfs_print_leaf(extent_root, path->nodes[0]);
> 5034                 WARN_ON(1);
> 5035                 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
> 5036                        "parent %llu root %llu  owner %llu offset %llu\n",
> 5037                        (unsigned long long)bytenr,
> 5038                        (unsigned long long)parent,
> 5039                        (unsigned long long)root_objectid,
> 5040                        (unsigned long long)owner_objectid,
> 5041                        (unsigned long long)owner_offset);
> 5042         } else {
> 

Which test, 278 is an xfs specific test.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Schmidt July 12, 2012, 6:41 p.m. UTC | #5
On 12.07.2012 19:05, Josef Bacik wrote:
> Which test, 278 is an xfs specific test.  Thanks,

Oops. The test finally made it into xfstests-dev as 276.

Sorry,
-Jan
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8206b39..a914580 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -846,7 +846,7 @@  static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 		if (new_flags != 0) {
 			ret = btrfs_set_disk_extent_flags(trans, root,
 							  buf->start,
-							  buf->len,
+							  buf->len, owner,
 							  new_flags, 0);
 			if (ret)
 				return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fa5c45b..1b527bc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2574,8 +2574,8 @@  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
-				u64 bytenr, u64 num_bytes, u64 flags,
-				int is_data);
+				u64 bytenr, u64 num_bytes, u64 ref_root,
+				u64 flags, int is_data);
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 13ae7b0..93b7df1 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -85,7 +85,8 @@  static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
  * type of the delayed backrefs and content of delayed backrefs.
  */
 static int comp_entry(struct btrfs_delayed_ref_node *ref2,
-		      struct btrfs_delayed_ref_node *ref1)
+		      struct btrfs_delayed_ref_node *ref1,
+		      bool compare_seq)
 {
 	if (ref1->bytenr < ref2->bytenr)
 		return -1;
@@ -102,10 +103,12 @@  static int comp_entry(struct btrfs_delayed_ref_node *ref2,
 	if (ref1->type > ref2->type)
 		return 1;
 	/* merging of sequenced refs is not allowed */
-	if (ref1->seq < ref2->seq)
-		return -1;
-	if (ref1->seq > ref2->seq)
-		return 1;
+	if (compare_seq) {
+		if (ref1->seq < ref2->seq)
+			return -1;
+		if (ref1->seq > ref2->seq)
+			return 1;
+	}
 	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
 	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
 		return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -139,7 +142,7 @@  static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
 				 rb_node);
 
-		cmp = comp_entry(entry, ins);
+		cmp = comp_entry(entry, ins, 1);
 		if (cmp < 0)
 			p = &(*p)->rb_left;
 		else if (cmp > 0)
@@ -233,6 +236,101 @@  int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static void inline drop_delayed_ref(struct btrfs_trans_handle *trans,
+				    struct btrfs_delayed_ref_root *delayed_refs,
+				    struct btrfs_delayed_ref_node *ref)
+{
+	rb_erase(&ref->rb_node, &delayed_refs->root);
+	ref->in_tree = 0;
+	btrfs_put_delayed_ref(ref);
+	delayed_refs->num_entries--;
+	if (trans->delayed_ref_updates)
+		trans->delayed_ref_updates--;
+}
+
+static int merge_ref(struct btrfs_trans_handle *trans,
+		     struct btrfs_delayed_ref_root *delayed_refs,
+		     struct btrfs_delayed_ref_node *ref,
+		     u64 seq)
+{
+	struct rb_node *node;
+	int merged = 0;
+
+again:
+	node = rb_prev(&ref->rb_node);
+	while (node) {
+		struct btrfs_delayed_ref_node *next;
+
+		next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		node = rb_prev(node);
+		if (next->bytenr != ref->bytenr)
+			break;
+		if (seq && next->seq > seq)
+			break;
+		if (comp_entry(ref, next, 0))
+			continue;
+
+		/*
+		 * We don't care about multiple adds, we just want to cancel out
+		 * a drop+add
+		 */
+		if (ref->action == next->action)
+			continue;
+
+		merged++;
+		drop_delayed_ref(trans, delayed_refs, next);
+		if (--ref->ref_mod == 0) {
+			drop_delayed_ref(trans, delayed_refs, ref);
+			break;
+		}
+		goto again;
+	}
+
+	return merged;
+}
+
+int btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+			     struct btrfs_delayed_ref_root *delayed_refs,
+			     struct btrfs_delayed_ref_head *head)
+{
+	struct rb_node *node;
+	int merged = 0;
+	u64 seq = 0;
+
+	/* We didn't span seq counts for this ref head */
+	if (likely(!head->need_merge))
+		return 0;
+
+	if (!list_empty(&delayed_refs->seq_head)) {
+		struct seq_list *elem;
+
+		elem = list_first_entry(&delayed_refs->seq_head,
+					struct seq_list, list);
+		seq = elem->seq;
+	}
+
+	node = rb_prev(&head->node.rb_node);
+	while (node) {
+		struct btrfs_delayed_ref_node *ref;
+
+		ref = rb_entry(node, struct btrfs_delayed_ref_node,
+			       rb_node);
+		if (ref->bytenr != head->node.bytenr)
+			break;
+		/* We can't merge refs that are outside of our seq count */
+		if (seq && ref->seq > seq)
+			break;
+		if (merge_ref(trans, delayed_refs, ref, seq)) {
+			merged = 1;
+			node = rb_prev(&head->node.rb_node);
+		} else {
+			node = rb_prev(node);
+		}
+	}
+
+	return merged;
+}
+
 int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
 			    u64 seq)
 {
@@ -332,18 +430,11 @@  update_existing_ref(struct btrfs_trans_handle *trans,
 		 * every changing the extent allocation tree.
 		 */
 		existing->ref_mod--;
-		if (existing->ref_mod == 0) {
-			rb_erase(&existing->rb_node,
-				 &delayed_refs->root);
-			existing->in_tree = 0;
-			btrfs_put_delayed_ref(existing);
-			delayed_refs->num_entries--;
-			if (trans->delayed_ref_updates)
-				trans->delayed_ref_updates--;
-		} else {
+		if (existing->ref_mod == 0)
+			drop_delayed_ref(trans, delayed_refs, existing);
+		else
 			WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
 				existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
-		}
 	} else {
 		WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
 			existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
@@ -423,7 +514,7 @@  update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 					struct btrfs_trans_handle *trans,
 					struct btrfs_delayed_ref_node *ref,
-					u64 bytenr, u64 num_bytes,
+					u64 bytenr, u64 num_bytes, u64 ref_root,
 					int action, int is_data)
 {
 	struct btrfs_delayed_ref_node *existing;
@@ -431,6 +522,7 @@  static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	struct btrfs_delayed_ref_root *delayed_refs;
 	int count_mod = 1;
 	int must_insert_reserved = 0;
+	int need_merge = 0;
 
 	/*
 	 * the head node stores the sum of all the mods, so dropping a ref
@@ -469,6 +561,8 @@  static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	ref->is_head = 1;
 	ref->in_tree = 1;
 	ref->seq = 0;
+	if (is_fstree(ref_root) && !list_empty(&delayed_refs->seq_head))
+		need_merge = 1;
 
 	head_ref = btrfs_delayed_node_to_head(ref);
 	head_ref->must_insert_reserved = must_insert_reserved;
@@ -488,12 +582,16 @@  static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		 * allocated ref
 		 */
 		kfree(head_ref);
+		head_ref = btrfs_delayed_node_to_head(existing);
 	} else {
 		delayed_refs->num_heads++;
 		delayed_refs->num_heads_ready++;
 		delayed_refs->num_entries++;
 		trans->delayed_ref_updates++;
 	}
+
+	if (!head_ref->need_merge)
+		head_ref->need_merge = need_merge;
 }
 
 /*
@@ -525,7 +623,7 @@  static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	ref->is_head = 0;
 	ref->in_tree = 1;
 
-	if (is_fstree(ref_root))
+	if (is_fstree(ref_root) && !list_empty(&delayed_refs->seq_head))
 		seq = inc_delayed_seq(delayed_refs);
 	ref->seq = seq;
 
@@ -584,7 +682,7 @@  static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	ref->is_head = 0;
 	ref->in_tree = 1;
 
-	if (is_fstree(ref_root))
+	if (is_fstree(ref_root) && !list_empty(&delayed_refs->seq_head))
 		seq = inc_delayed_seq(delayed_refs);
 	ref->seq = seq;
 
@@ -653,7 +751,7 @@  int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	 * the spin lock
 	 */
 	add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
-				   num_bytes, action, 0);
+				   num_bytes, ref_root, action, 0);
 
 	add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
 				   num_bytes, parent, ref_root, level, action,
@@ -702,7 +800,7 @@  int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	 * the spin lock
 	 */
 	add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
-				   num_bytes, action, 1);
+				   num_bytes, ref_root, action, 1);
 
 	add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
 				   num_bytes, parent, ref_root, owner, offset,
@@ -717,7 +815,7 @@  int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				struct btrfs_trans_handle *trans,
-				u64 bytenr, u64 num_bytes,
+				u64 bytenr, u64 num_bytes, u64 ref_root,
 				struct btrfs_delayed_extent_op *extent_op)
 {
 	struct btrfs_delayed_ref_head *head_ref;
@@ -733,8 +831,8 @@  int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 	spin_lock(&delayed_refs->lock);
 
 	add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
-				   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
-				   extent_op->is_data);
+			     num_bytes, ref_root, BTRFS_UPDATE_DELAYED_HEAD,
+			     extent_op->is_data);
 
 	if (waitqueue_active(&delayed_refs->seq_wait))
 		wake_up(&delayed_refs->seq_wait);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 413927f..f3cad4a 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -97,6 +97,7 @@  struct btrfs_delayed_ref_head {
 	 */
 	unsigned int must_insert_reserved:1;
 	unsigned int is_data:1;
+	unsigned int need_merge:1;
 };
 
 struct btrfs_delayed_tree_ref {
@@ -185,8 +186,11 @@  int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       int for_cow);
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				struct btrfs_trans_handle *trans,
-				u64 bytenr, u64 num_bytes,
+				u64 bytenr, u64 num_bytes, u64 ref_root,
 				struct btrfs_delayed_extent_op *extent_op);
+int btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+			     struct btrfs_delayed_ref_root *delayed_refs,
+			     struct btrfs_delayed_ref_head *head);
 
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5e552f9..fa25ca9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2271,6 +2271,16 @@  static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		}
 
 		/*
+		 * We need to try and merge add/drops of the same ref since we
+		 * can run into issues with relocate dropping the implicit ref
+		 * and then it being added back again before the drop can
+		 * finish.  If we merged anything we need to re-loop so we can
+		 * get a good ref.
+		 */
+		if (btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref))
+			continue;
+
+		/*
 		 * record the must insert reserved flag before we
 		 * drop the spin lock.
 		 */
@@ -2507,8 +2517,8 @@  out:
 
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
-				u64 bytenr, u64 num_bytes, u64 flags,
-				int is_data)
+				u64 bytenr, u64 num_bytes, u64 ref_root,
+				u64 flags, int is_data)
 {
 	struct btrfs_delayed_extent_op *extent_op;
 	int ret;
@@ -2523,7 +2533,7 @@  int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 	extent_op->is_data = is_data ? 1 : 0;
 
 	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
-					  num_bytes, extent_op);
+					  num_bytes, ref_root, extent_op);
 	if (ret)
 		kfree(extent_op);
 	return ret;
@@ -6464,7 +6474,9 @@  static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
 		BUG_ON(ret); /* -ENOMEM */
 		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
-						  eb->len, flag, 0);
+						  eb->len,
+						  btrfs_header_owner(eb),
+						  flag, 0);
 		BUG_ON(ret); /* -ENOMEM */
 		wc->flags[level] |= flag;
 	}