diff mbox

[2/5] Btrfs: fix trans block rsv regression

Message ID 1347613087-3489-2-git-send-email-bo.li.liu@oracle.com (mailing list archive)
State New, archived
Headers show

Commit Message

Liu Bo Sept. 14, 2012, 8:58 a.m. UTC
In some workloads we have nested joining transaction operations,
eg.
  run_delalloc_nocow
      btrfs_join_transaction
      cow_file_range
         btrfs_join_transaction

it can be a serious bug since each trans handler has only two
block_rsv, orig_rsv and block_rsv, which means we may lose our
first block_rsv after two joining transaction operations:

1) btrfs_start_transaction
      trans->block_rsv = A

2) btrfs_join_transaction
      trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now A
      trans->block_rsv = B

3) btrfs_join_transaction
      trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now B
      trans->block_rsv = C
   ...

This uses a list of block_rsv instead so that we can either
a) PUSH the old one into the list and use a new one in joining, or
b) POP the old one in ending this transaction.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/transaction.c |   25 +++++++++++++++++++++----
 fs/btrfs/transaction.h |    7 ++++++-
 2 files changed, 27 insertions(+), 5 deletions(-)

Comments

David Sterba Sept. 14, 2012, 11:15 a.m. UTC | #1
On Fri, Sep 14, 2012 at 04:58:04PM +0800, Liu Bo wrote:
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -306,9 +306,17 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
>  		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK &&
>  			type != TRANS_JOIN_ONLY);
>  		h = current->journal_info;
> -		h->use_count++;
> -		h->orig_rsv = h->block_rsv;
> +		if (h->block_rsv) {
> +			struct btrfs_trans_rsv_item *item;
> +			item = kmalloc(sizeof(*item), GFP_NOFS);

I'd rather avoid the kmalloc here and add a list hook into
btrfs_block_rsv itself (used only for this purpose).

It also does not increase the failure surface and we don't have to
handle error conditions from deep callchains.

> +			if (!item)
> +				return ERR_PTR(-ENOMEM);
> +			item->rsv = h->block_rsv;
> +			INIT_LIST_HEAD(&item->list);
> +			list_add(&item->list, &h->blk_rsv_list);
> +		}
>  		h->block_rsv = NULL;
> +		h->use_count++;
>  		goto got_it;
>  	} else if (type == TRANS_JOIN_ONLY) {
>  		return ERR_PTR(-ENOENT);
> --- a/fs/btrfs/transaction.h
> +++ b/fs/btrfs/transaction.h
> @@ -57,7 +57,6 @@ struct btrfs_trans_handle {
>  	unsigned long delayed_ref_updates;
>  	struct btrfs_transaction *transaction;
>  	struct btrfs_block_rsv *block_rsv;
> -	struct btrfs_block_rsv *orig_rsv;
>  	int aborted;
>  	int adding_csums;
>  	/*
> @@ -68,6 +67,12 @@ struct btrfs_trans_handle {
>  	struct btrfs_root *root;
>  	struct seq_list delayed_ref_elem;
>  	struct list_head qgroup_ref_list;
> +	struct list_head blk_rsv_list;

Does it refer to chain of orig_rsv's ? Ie. naming it orig_blk_rsv_list

> +};
> +
> +struct btrfs_trans_rsv_item {
> +	struct btrfs_block_rsv *rsv;
> +	struct list_head list;

Generally, for such 'list of single pointers' structs I'd evaluate the
possibility of embedding the hook inside the struct, the overhead
(memory, processing) is not desirable.

>  };
>  
>  struct btrfs_pending_snapshot {
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Liu Bo Sept. 14, 2012, 11:25 a.m. UTC | #2
On 09/14/2012 07:15 PM, David Sterba wrote:
> On Fri, Sep 14, 2012 at 04:58:04PM +0800, Liu Bo wrote:
>> --- a/fs/btrfs/transaction.c
>> +++ b/fs/btrfs/transaction.c
>> @@ -306,9 +306,17 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
>>  		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK &&
>>  			type != TRANS_JOIN_ONLY);
>>  		h = current->journal_info;
>> -		h->use_count++;
>> -		h->orig_rsv = h->block_rsv;
>> +		if (h->block_rsv) {
>> +			struct btrfs_trans_rsv_item *item;
>> +			item = kmalloc(sizeof(*item), GFP_NOFS);
> 
> I'd rather avoid the kmalloc here and add a list hook into
> btrfs_block_rsv itself (used only for this purpose).
> 
> It also does not increase the failure surface and we don't have to
> handle error conditions from deep callchains.
> 

Actually I placed a list hook at first, but I found the same block_rsv could be inserted into
the list chain twice, which will cause list_head's terrible warnings.

>> +			if (!item)
>> +				return ERR_PTR(-ENOMEM);
>> +			item->rsv = h->block_rsv;
>> +			INIT_LIST_HEAD(&item->list);
>> +			list_add(&item->list, &h->blk_rsv_list);
>> +		}
>>  		h->block_rsv = NULL;
>> +		h->use_count++;
>>  		goto got_it;
>>  	} else if (type == TRANS_JOIN_ONLY) {
>>  		return ERR_PTR(-ENOENT);
>> --- a/fs/btrfs/transaction.h
>> +++ b/fs/btrfs/transaction.h
>> @@ -57,7 +57,6 @@ struct btrfs_trans_handle {
>>  	unsigned long delayed_ref_updates;
>>  	struct btrfs_transaction *transaction;
>>  	struct btrfs_block_rsv *block_rsv;
>> -	struct btrfs_block_rsv *orig_rsv;
>>  	int aborted;
>>  	int adding_csums;
>>  	/*
>> @@ -68,6 +67,12 @@ struct btrfs_trans_handle {
>>  	struct btrfs_root *root;
>>  	struct seq_list delayed_ref_elem;
>>  	struct list_head qgroup_ref_list;
>> +	struct list_head blk_rsv_list;
> 
> Does it refer to chain of orig_rsv's ? Ie. naming it orig_blk_rsv_list
> 

Make sense.

>> +};
>> +
>> +struct btrfs_trans_rsv_item {
>> +	struct btrfs_block_rsv *rsv;
>> +	struct list_head list;
> 
> Generally, for such 'list of single pointers' structs I'd evaluate the
> possibility of embedding the hook inside the struct, the overhead
> (memory, processing) is not desirable.
> 

See the above, plz.

thanks,
liubo


>>  };
>>  
>>  struct btrfs_pending_snapshot {

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba Sept. 14, 2012, 12:07 p.m. UTC | #3
On Fri, Sep 14, 2012 at 07:25:45PM +0800, Liu Bo wrote:
> On 09/14/2012 07:15 PM, David Sterba wrote:
> > On Fri, Sep 14, 2012 at 04:58:04PM +0800, Liu Bo wrote:
> >> --- a/fs/btrfs/transaction.c
> >> +++ b/fs/btrfs/transaction.c
> >> @@ -306,9 +306,17 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
> >>  		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK &&
> >>  			type != TRANS_JOIN_ONLY);
> >>  		h = current->journal_info;
> >> -		h->use_count++;
> >> -		h->orig_rsv = h->block_rsv;
> >> +		if (h->block_rsv) {
> >> +			struct btrfs_trans_rsv_item *item;
> >> +			item = kmalloc(sizeof(*item), GFP_NOFS);
> > 
> > I'd rather avoid the kmalloc here and add a list hook into
> > btrfs_block_rsv itself (used only for this purpose).
> > 
> > It also does not increase the failure surface and we don't have to
> > handle error conditions from deep callchains.
> > 
> 
> Actually I placed a list hook at first, but I found the same block_rsv could be inserted into
> the list chain twice, which will cause list_head's terrible warnings.

Is it expected and correct to add the rsv twice? I know the transaction
joins and commits are sometimes wildly nested so it does not mean that's
necessarily a bug. Then it is not possible to embed the list hook, we
need to keep the full track of the blk_rsv stack.

I'm counting two other similar structs used inside btrfs
(list_head + 8B value), we could make a shared separate slab for them.

struct delayed_iput {
        struct list_head           list;                 /*     0    16 */
        struct inode *             inode;                /*    16     8 */

        /* size: 24, cachelines: 1, members: 2 */
        /* last cacheline: 24 bytes */
};

truct seq_list {
        struct list_head           list;                 /*     0    16 */
        u64                        seq;                  /*    16     8 */

        /* size: 24, cachelines: 1, members: 2 */
        /* last cacheline: 24 bytes */
};

seq_list is never allocated, only embedded inside tree_mod_log structures.

delayed_iput is allocated on every add_delayed_iput and freed quite
frequently (once per-commit at least), so this is a short-lived object
as well as the proposed btrfs_trans_rsv_item. IMO this justifies using
the slab. Does this sound good to you?


david
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Liu Bo Sept. 14, 2012, 12:16 p.m. UTC | #4
On 09/14/2012 08:07 PM, David Sterba wrote:
> On Fri, Sep 14, 2012 at 07:25:45PM +0800, Liu Bo wrote:
>> On 09/14/2012 07:15 PM, David Sterba wrote:
>>> On Fri, Sep 14, 2012 at 04:58:04PM +0800, Liu Bo wrote:
>>>> --- a/fs/btrfs/transaction.c
>>>> +++ b/fs/btrfs/transaction.c
>>>> @@ -306,9 +306,17 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
>>>>  		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK &&
>>>>  			type != TRANS_JOIN_ONLY);
>>>>  		h = current->journal_info;
>>>> -		h->use_count++;
>>>> -		h->orig_rsv = h->block_rsv;
>>>> +		if (h->block_rsv) {
>>>> +			struct btrfs_trans_rsv_item *item;
>>>> +			item = kmalloc(sizeof(*item), GFP_NOFS);
>>>
>>> I'd rather avoid the kmalloc here and add a list hook into
>>> btrfs_block_rsv itself (used only for this purpose).
>>>
>>> It also does not increase the failure surface and we don't have to
>>> handle error conditions from deep callchains.
>>>
>>
>> Actually I placed a list hook at first, but I found the same block_rsv could be inserted into
>> the list chain twice, which will cause list_head's terrible warnings.
> 
> Is it expected and correct to add the rsv twice? I know the transaction
> joins and commits are sometimes wildly nested so it does not mean that's
> necessarily a bug. Then it is not possible to embed the list hook, we
> need to keep the full track of the blk_rsv stack.
> 
> I'm counting two other similar structs used inside btrfs
> (list_head + 8B value), we could make a shared separate slab for them.
> 
> struct delayed_iput {
>         struct list_head           list;                 /*     0    16 */
>         struct inode *             inode;                /*    16     8 */
> 
>         /* size: 24, cachelines: 1, members: 2 */
>         /* last cacheline: 24 bytes */
> };
> 
> truct seq_list {
>         struct list_head           list;                 /*     0    16 */
>         u64                        seq;                  /*    16     8 */
> 
>         /* size: 24, cachelines: 1, members: 2 */
>         /* last cacheline: 24 bytes */
> };
> 
> seq_list is never allocated, only embedded inside tree_mod_log structures.
> 
> delayed_iput is allocated on every add_delayed_iput and freed quite
> frequently (once per-commit at least), so this is a short-lived object
> as well as the proposed btrfs_trans_rsv_item. IMO this justifies using
> the slab. Does this sound good to you?
> 

Good, a quick thinking says it will work ;)

btw, do you have any test suit to measure this kind of cacheline efficiency so that
we can get some numbers to see if it really helps?

- liubo
> 
> david
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik Sept. 14, 2012, 12:41 p.m. UTC | #5
On Fri, Sep 14, 2012 at 02:58:04AM -0600, Liu Bo wrote:
> In some workloads we have nested joining transaction operations,
> eg.
>   run_delalloc_nocow
>       btrfs_join_transaction
>       cow_file_range
>          btrfs_join_transaction
> 
> it can be a serious bug since each trans handler has only two
> block_rsv, orig_rsv and block_rsv, which means we may lose our
> first block_rsv after two joining transaction operations:
> 
> 1) btrfs_start_transaction
>       trans->block_rsv = A
> 
> 2) btrfs_join_transaction
>       trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now A
>       trans->block_rsv = B
> 
> 3) btrfs_join_transaction
>       trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now B
>       trans->block_rsv = C
>    ...
> 

I'd like to see the actual stack trace where this happens, because I don't think
it can happen.  And if it is we need to look at that specific case and adjust it
as necessary and not add a bunch of kmallocs just to track the block_rsv,
because frankly it's not that big of a deal, it was just put into place in case
somebody wasn't expecting a call they made to start another transaction and
reset the block_rsv, which I don't actually think happens anywhere.  So NAK on
this patch, give me more information so I can figure out the right way to deal
with this.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Liu Bo Sept. 14, 2012, 1:01 p.m. UTC | #6
On 09/14/2012 08:41 PM, Josef Bacik wrote:
> On Fri, Sep 14, 2012 at 02:58:04AM -0600, Liu Bo wrote:
>> In some workloads we have nested joining transaction operations,
>> eg.
>>   run_delalloc_nocow
>>       btrfs_join_transaction
>>       cow_file_range
>>          btrfs_join_transaction
>>
>> it can be a serious bug since each trans handler has only two
>> block_rsv, orig_rsv and block_rsv, which means we may lose our
>> first block_rsv after two joining transaction operations:
>>
>> 1) btrfs_start_transaction
>>       trans->block_rsv = A
>>
>> 2) btrfs_join_transaction
>>       trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now A
>>       trans->block_rsv = B
>>
>> 3) btrfs_join_transaction
>>       trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now B
>>       trans->block_rsv = C
>>    ...
>>
> 
> I'd like to see the actual stack trace where this happens, because I don't think
> it can happen.  And if it is we need to look at that specific case and adjust it
> as necessary and not add a bunch of kmallocs just to track the block_rsv,
> because frankly it's not that big of a deal, it was just put into place in case
> somebody wasn't expecting a call they made to start another transaction and
> reset the block_rsv, which I don't actually think happens anywhere.  So NAK on
> this patch, give me more information so I can figure out the right way to deal
> with this.  Thanks,
> 

Fine, please run xfstests 068 till it hits a BUG_ON inside either btrfs_delete_delayed_dir_index or
btrfs_insert_delayed_dir_index.

What I saw is that the orig_rsv and block_rsv is both delalloc_block_rsv, which is already lack of space.

thanks,
liubo

> Josef
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Liu Bo Sept. 14, 2012, 1:05 p.m. UTC | #7
On 09/14/2012 09:01 PM, Liu Bo wrote:
> On 09/14/2012 08:41 PM, Josef Bacik wrote:
>> On Fri, Sep 14, 2012 at 02:58:04AM -0600, Liu Bo wrote:
>>> In some workloads we have nested joining transaction operations,
>>> eg.
>>>   run_delalloc_nocow
>>>       btrfs_join_transaction
>>>       cow_file_range
>>>          btrfs_join_transaction
>>>
>>> it can be a serious bug since each trans handler has only two
>>> block_rsv, orig_rsv and block_rsv, which means we may lose our
>>> first block_rsv after two joining transaction operations:
>>>
>>> 1) btrfs_start_transaction
>>>       trans->block_rsv = A
>>>
>>> 2) btrfs_join_transaction
>>>       trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now A
>>>       trans->block_rsv = B
>>>
>>> 3) btrfs_join_transaction
>>>       trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now B
>>>       trans->block_rsv = C
>>>    ...
>>>
>>
>> I'd like to see the actual stack trace where this happens, because I don't think
>> it can happen.  And if it is we need to look at that specific case and adjust it
>> as necessary and not add a bunch of kmallocs just to track the block_rsv,
>> because frankly it's not that big of a deal, it was just put into place in case
>> somebody wasn't expecting a call they made to start another transaction and
>> reset the block_rsv, which I don't actually think happens anywhere.  So NAK on
>> this patch, give me more information so I can figure out the right way to deal
>> with this.  Thanks,
>>
> 
> Fine, please run xfstests 068 till it hits a BUG_ON inside either btrfs_delete_delayed_dir_index or
> btrfs_insert_delayed_dir_index.
> 
> What I saw is that the orig_rsv and block_rsv is both delalloc_block_rsv, which is already lack of space.
> 

and trans->use_count has been 3.


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba Sept. 14, 2012, 1:26 p.m. UTC | #8
On Fri, Sep 14, 2012 at 09:01:36PM +0800, Liu Bo wrote:
> Fine, please run xfstests 068 till it hits a BUG_ON inside either btrfs_delete_delayed_dir_index or
> btrfs_insert_delayed_dir_index.

068 is the freezer test, we've seen enough issues in the freezing area
recently, I'm curious if you were able to reproduce with a non-freezing
tests.

It's quite possible that this may be a race condition and the freezer
just supplies the right delays and widens the race window.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0c17d9e..a36ae05 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -306,9 +306,17 @@  static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK &&
 			type != TRANS_JOIN_ONLY);
 		h = current->journal_info;
-		h->use_count++;
-		h->orig_rsv = h->block_rsv;
+		if (h->block_rsv) {
+			struct btrfs_trans_rsv_item *item;
+			item = kmalloc(sizeof(*item), GFP_NOFS);
+			if (!item)
+				return ERR_PTR(-ENOMEM);
+			item->rsv = h->block_rsv;
+			INIT_LIST_HEAD(&item->list);
+			list_add(&item->list, &h->blk_rsv_list);
+		}
 		h->block_rsv = NULL;
+		h->use_count++;
 		goto got_it;
 	} else if (type == TRANS_JOIN_ONLY) {
 		return ERR_PTR(-ENOENT);
@@ -367,11 +375,11 @@  again:
 	h->use_count = 1;
 	h->adding_csums = 0;
 	h->block_rsv = NULL;
-	h->orig_rsv = NULL;
 	h->aborted = 0;
 	h->qgroup_reserved = qgroup_reserved;
 	h->delayed_ref_elem.seq = 0;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
+	INIT_LIST_HEAD(&h->blk_rsv_list);
 
 	smp_mb();
 	if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -523,7 +531,15 @@  static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	int err = 0;
 
 	if (--trans->use_count) {
-		trans->block_rsv = trans->orig_rsv;
+		trans->block_rsv = NULL;
+		if (!list_empty(&trans->blk_rsv_list)) {
+			struct btrfs_trans_rsv_item *item;
+			item = list_entry(trans->blk_rsv_list.next,
+					  struct btrfs_trans_rsv_item, list);
+			list_del_init(&item->list);
+			trans->block_rsv = item->rsv;
+			kfree(item);
+		}
 		return 0;
 	}
 
@@ -558,6 +574,7 @@  static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		count++;
 	}
 	btrfs_trans_release_metadata(trans, root);
+	BUG_ON(!list_empty(&trans->blk_rsv_list));
 	trans->block_rsv = NULL;
 
 	sb_end_intwrite(root->fs_info->sb);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 59adf55..7fa11b7 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -57,7 +57,6 @@  struct btrfs_trans_handle {
 	unsigned long delayed_ref_updates;
 	struct btrfs_transaction *transaction;
 	struct btrfs_block_rsv *block_rsv;
-	struct btrfs_block_rsv *orig_rsv;
 	int aborted;
 	int adding_csums;
 	/*
@@ -68,6 +67,12 @@  struct btrfs_trans_handle {
 	struct btrfs_root *root;
 	struct seq_list delayed_ref_elem;
 	struct list_head qgroup_ref_list;
+	struct list_head blk_rsv_list;
+};
+
+struct btrfs_trans_rsv_item {
+	struct btrfs_block_rsv *rsv;
+	struct list_head list;
 };
 
 struct btrfs_pending_snapshot {