Message ID | 1347613087-3489-2-git-send-email-bo.li.liu@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Sep 14, 2012 at 04:58:04PM +0800, Liu Bo wrote: > --- a/fs/btrfs/transaction.c > +++ b/fs/btrfs/transaction.c > @@ -306,9 +306,17 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, > WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK && > type != TRANS_JOIN_ONLY); > h = current->journal_info; > - h->use_count++; > - h->orig_rsv = h->block_rsv; > + if (h->block_rsv) { > + struct btrfs_trans_rsv_item *item; > + item = kmalloc(sizeof(*item), GFP_NOFS); I'd rather avoid the kmalloc here and add a list hook into btrfs_block_rsv itself (used only for this purpose). It also does not increase the failure surface and we don't have to handle error conditions from deep callchains. > + if (!item) > + return ERR_PTR(-ENOMEM); > + item->rsv = h->block_rsv; > + INIT_LIST_HEAD(&item->list); > + list_add(&item->list, &h->blk_rsv_list); > + } > h->block_rsv = NULL; > + h->use_count++; > goto got_it; > } else if (type == TRANS_JOIN_ONLY) { > return ERR_PTR(-ENOENT); > --- a/fs/btrfs/transaction.h > +++ b/fs/btrfs/transaction.h > @@ -57,7 +57,6 @@ struct btrfs_trans_handle { > unsigned long delayed_ref_updates; > struct btrfs_transaction *transaction; > struct btrfs_block_rsv *block_rsv; > - struct btrfs_block_rsv *orig_rsv; > int aborted; > int adding_csums; > /* > @@ -68,6 +67,12 @@ struct btrfs_trans_handle { > struct btrfs_root *root; > struct seq_list delayed_ref_elem; > struct list_head qgroup_ref_list; > + struct list_head blk_rsv_list; Does it refer to chain of orig_rsv's ? Ie. naming it orig_blk_rsv_list > +}; > + > +struct btrfs_trans_rsv_item { > + struct btrfs_block_rsv *rsv; > + struct list_head list; Generally, for such 'list of single pointers' structs I'd evaluate the possibility of embedding the hook inside the struct, the overhead (memory, processing) is not desirable. > }; > > struct btrfs_pending_snapshot { -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/14/2012 07:15 PM, David Sterba wrote: > On Fri, Sep 14, 2012 at 04:58:04PM +0800, Liu Bo wrote: >> --- a/fs/btrfs/transaction.c >> +++ b/fs/btrfs/transaction.c >> @@ -306,9 +306,17 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, >> WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK && >> type != TRANS_JOIN_ONLY); >> h = current->journal_info; >> - h->use_count++; >> - h->orig_rsv = h->block_rsv; >> + if (h->block_rsv) { >> + struct btrfs_trans_rsv_item *item; >> + item = kmalloc(sizeof(*item), GFP_NOFS); > > I'd rather avoid the kmalloc here and add a list hook into > btrfs_block_rsv itself (used only for this purpose). > > It also does not increase the failure surface and we don't have to > handle error conditions from deep callchains. > Actually I placed a list hook at first, but I found the same block_rsv could be inserted into the list chain twice, which will cause list_head's terrible warnings. >> + if (!item) >> + return ERR_PTR(-ENOMEM); >> + item->rsv = h->block_rsv; >> + INIT_LIST_HEAD(&item->list); >> + list_add(&item->list, &h->blk_rsv_list); >> + } >> h->block_rsv = NULL; >> + h->use_count++; >> goto got_it; >> } else if (type == TRANS_JOIN_ONLY) { >> return ERR_PTR(-ENOENT); >> --- a/fs/btrfs/transaction.h >> +++ b/fs/btrfs/transaction.h >> @@ -57,7 +57,6 @@ struct btrfs_trans_handle { >> unsigned long delayed_ref_updates; >> struct btrfs_transaction *transaction; >> struct btrfs_block_rsv *block_rsv; >> - struct btrfs_block_rsv *orig_rsv; >> int aborted; >> int adding_csums; >> /* >> @@ -68,6 +67,12 @@ struct btrfs_trans_handle { >> struct btrfs_root *root; >> struct seq_list delayed_ref_elem; >> struct list_head qgroup_ref_list; >> + struct list_head blk_rsv_list; > > Does it refer to chain of orig_rsv's ? Ie. naming it orig_blk_rsv_list > Make sense. >> +}; >> + >> +struct btrfs_trans_rsv_item { >> + struct btrfs_block_rsv *rsv; >> + struct list_head list; > > Generally, for such 'list of single pointers' structs I'd evaluate the > possibility of embedding the hook inside the struct, the overhead > (memory, processing) is not desirable. > See the above, plz. thanks, liubo >> }; >> >> struct btrfs_pending_snapshot { -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Sep 14, 2012 at 07:25:45PM +0800, Liu Bo wrote: > On 09/14/2012 07:15 PM, David Sterba wrote: > > On Fri, Sep 14, 2012 at 04:58:04PM +0800, Liu Bo wrote: > >> --- a/fs/btrfs/transaction.c > >> +++ b/fs/btrfs/transaction.c > >> @@ -306,9 +306,17 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, > >> WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK && > >> type != TRANS_JOIN_ONLY); > >> h = current->journal_info; > >> - h->use_count++; > >> - h->orig_rsv = h->block_rsv; > >> + if (h->block_rsv) { > >> + struct btrfs_trans_rsv_item *item; > >> + item = kmalloc(sizeof(*item), GFP_NOFS); > > > > I'd rather avoid the kmalloc here and add a list hook into > > btrfs_block_rsv itself (used only for this purpose). > > > > It also does not increase the failure surface and we don't have to > > handle error conditions from deep callchains. > > > > Actually I placed a list hook at first, but I found the same block_rsv could be inserted into > the list chain twice, which will cause list_head's terrible warnings. Is it expected and correct to add the rsv twice? I know the transaction joins and commits are sometimes wildly nested so it does not mean that's necessarily a bug. Then it is not possible to embed the list hook, we need to keep the full track of the blk_rsv stack. I'm counting two other similar structs used inside btrfs (list_head + 8B value), we could make a shared separate slab for them. struct delayed_iput { struct list_head list; /* 0 16 */ struct inode * inode; /* 16 8 */ /* size: 24, cachelines: 1, members: 2 */ /* last cacheline: 24 bytes */ }; truct seq_list { struct list_head list; /* 0 16 */ u64 seq; /* 16 8 */ /* size: 24, cachelines: 1, members: 2 */ /* last cacheline: 24 bytes */ }; seq_list is never allocated, only embedded inside tree_mod_log structures. delayed_iput is allocated on every add_delayed_iput and freed quite frequently (once per-commit at least), so this is a short-lived object as well as the proposed btrfs_trans_rsv_item. IMO this justifies using the slab. Does this sound good to you? david -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/14/2012 08:07 PM, David Sterba wrote: > On Fri, Sep 14, 2012 at 07:25:45PM +0800, Liu Bo wrote: >> On 09/14/2012 07:15 PM, David Sterba wrote: >>> On Fri, Sep 14, 2012 at 04:58:04PM +0800, Liu Bo wrote: >>>> --- a/fs/btrfs/transaction.c >>>> +++ b/fs/btrfs/transaction.c >>>> @@ -306,9 +306,17 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, >>>> WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK && >>>> type != TRANS_JOIN_ONLY); >>>> h = current->journal_info; >>>> - h->use_count++; >>>> - h->orig_rsv = h->block_rsv; >>>> + if (h->block_rsv) { >>>> + struct btrfs_trans_rsv_item *item; >>>> + item = kmalloc(sizeof(*item), GFP_NOFS); >>> >>> I'd rather avoid the kmalloc here and add a list hook into >>> btrfs_block_rsv itself (used only for this purpose). >>> >>> It also does not increase the failure surface and we don't have to >>> handle error conditions from deep callchains. >>> >> >> Actually I placed a list hook at first, but I found the same block_rsv could be inserted into >> the list chain twice, which will cause list_head's terrible warnings. > > Is it expected and correct to add the rsv twice? I know the transaction > joins and commits are sometimes wildly nested so it does not mean that's > necessarily a bug. Then it is not possible to embed the list hook, we > need to keep the full track of the blk_rsv stack. > > I'm counting two other similar structs used inside btrfs > (list_head + 8B value), we could make a shared separate slab for them. > > struct delayed_iput { > struct list_head list; /* 0 16 */ > struct inode * inode; /* 16 8 */ > > /* size: 24, cachelines: 1, members: 2 */ > /* last cacheline: 24 bytes */ > }; > > truct seq_list { > struct list_head list; /* 0 16 */ > u64 seq; /* 16 8 */ > > /* size: 24, cachelines: 1, members: 2 */ > /* last cacheline: 24 bytes */ > }; > > seq_list is never allocated, only embedded inside tree_mod_log structures. > > delayed_iput is allocated on every add_delayed_iput and freed quite > frequently (once per-commit at least), so this is a short-lived object > as well as the proposed btrfs_trans_rsv_item. IMO this justifies using > the slab. Does this sound good to you? > Good, a quick thinking says it will work ;) btw, do you have any test suit to measure this kind of cacheline efficiency so that we can get some numbers to see if it really helps? - liubo > > david > -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Sep 14, 2012 at 02:58:04AM -0600, Liu Bo wrote: > In some workloads we have nested joining transaction operations, > eg. > run_delalloc_nocow > btrfs_join_transaction > cow_file_range > btrfs_join_transaction > > it can be a serious bug since each trans handler has only two > block_rsv, orig_rsv and block_rsv, which means we may lose our > first block_rsv after two joining transaction operations: > > 1) btrfs_start_transaction > trans->block_rsv = A > > 2) btrfs_join_transaction > trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now A > trans->block_rsv = B > > 3) btrfs_join_transaction > trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now B > trans->block_rsv = C > ... > I'd like to see the actual stack trace where this happens, because I don't think it can happen. And if it is we need to look at that specific case and adjust it as necessary and not add a bunch of kmallocs just to track the block_rsv, because frankly it's not that big of a deal, it was just put into place in case somebody wasn't expecting a call they made to start another transaction and reset the block_rsv, which I don't actually think happens anywhere. So NAK on this patch, give me more information so I can figure out the right way to deal with this. Thanks, Josef -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/14/2012 08:41 PM, Josef Bacik wrote: > On Fri, Sep 14, 2012 at 02:58:04AM -0600, Liu Bo wrote: >> In some workloads we have nested joining transaction operations, >> eg. >> run_delalloc_nocow >> btrfs_join_transaction >> cow_file_range >> btrfs_join_transaction >> >> it can be a serious bug since each trans handler has only two >> block_rsv, orig_rsv and block_rsv, which means we may lose our >> first block_rsv after two joining transaction operations: >> >> 1) btrfs_start_transaction >> trans->block_rsv = A >> >> 2) btrfs_join_transaction >> trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now A >> trans->block_rsv = B >> >> 3) btrfs_join_transaction >> trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now B >> trans->block_rsv = C >> ... >> > > I'd like to see the actual stack trace where this happens, because I don't think > it can happen. And if it is we need to look at that specific case and adjust it > as necessary and not add a bunch of kmallocs just to track the block_rsv, > because frankly it's not that big of a deal, it was just put into place in case > somebody wasn't expecting a call they made to start another transaction and > reset the block_rsv, which I don't actually think happens anywhere. So NAK on > this patch, give me more information so I can figure out the right way to deal > with this. Thanks, > Fine, please run xfstests 068 till it hits a BUG_ON inside either btrfs_delete_delayed_dir_index or btrfs_insert_delayed_dir_index. What I saw is that the orig_rsv and block_rsv is both delalloc_block_rsv, which is already lack of space. thanks, liubo > Josef > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/14/2012 09:01 PM, Liu Bo wrote: > On 09/14/2012 08:41 PM, Josef Bacik wrote: >> On Fri, Sep 14, 2012 at 02:58:04AM -0600, Liu Bo wrote: >>> In some workloads we have nested joining transaction operations, >>> eg. >>> run_delalloc_nocow >>> btrfs_join_transaction >>> cow_file_range >>> btrfs_join_transaction >>> >>> it can be a serious bug since each trans handler has only two >>> block_rsv, orig_rsv and block_rsv, which means we may lose our >>> first block_rsv after two joining transaction operations: >>> >>> 1) btrfs_start_transaction >>> trans->block_rsv = A >>> >>> 2) btrfs_join_transaction >>> trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now A >>> trans->block_rsv = B >>> >>> 3) btrfs_join_transaction >>> trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now B >>> trans->block_rsv = C >>> ... >>> >> >> I'd like to see the actual stack trace where this happens, because I don't think >> it can happen. And if it is we need to look at that specific case and adjust it >> as necessary and not add a bunch of kmallocs just to track the block_rsv, >> because frankly it's not that big of a deal, it was just put into place in case >> somebody wasn't expecting a call they made to start another transaction and >> reset the block_rsv, which I don't actually think happens anywhere. So NAK on >> this patch, give me more information so I can figure out the right way to deal >> with this. Thanks, >> > > Fine, please run xfstests 068 till it hits a BUG_ON inside either btrfs_delete_delayed_dir_index or > btrfs_insert_delayed_dir_index. > > What I saw is that the orig_rsv and block_rsv is both delalloc_block_rsv, which is already lack of space. > and trans->use_count has been 3. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Sep 14, 2012 at 09:01:36PM +0800, Liu Bo wrote: > Fine, please run xfstests 068 till it hits a BUG_ON inside either btrfs_delete_delayed_dir_index or > btrfs_insert_delayed_dir_index. 068 is the freezer test, we've seen enough issues in the freezing area recently, I'm curious if you were able to reproduce with a non-freezing tests. It's quite possible that this may be a race condition and the freezer just supplies the right delays and widens the race window. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0c17d9e..a36ae05 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -306,9 +306,17 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK && type != TRANS_JOIN_ONLY); h = current->journal_info; - h->use_count++; - h->orig_rsv = h->block_rsv; + if (h->block_rsv) { + struct btrfs_trans_rsv_item *item; + item = kmalloc(sizeof(*item), GFP_NOFS); + if (!item) + return ERR_PTR(-ENOMEM); + item->rsv = h->block_rsv; + INIT_LIST_HEAD(&item->list); + list_add(&item->list, &h->blk_rsv_list); + } h->block_rsv = NULL; + h->use_count++; goto got_it; } else if (type == TRANS_JOIN_ONLY) { return ERR_PTR(-ENOENT); @@ -367,11 +375,11 @@ again: h->use_count = 1; h->adding_csums = 0; h->block_rsv = NULL; - h->orig_rsv = NULL; h->aborted = 0; h->qgroup_reserved = qgroup_reserved; h->delayed_ref_elem.seq = 0; INIT_LIST_HEAD(&h->qgroup_ref_list); + INIT_LIST_HEAD(&h->blk_rsv_list); smp_mb(); if (cur_trans->blocked && may_wait_transaction(root, type)) { @@ -523,7 +531,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, int err = 0; if (--trans->use_count) { - trans->block_rsv = trans->orig_rsv; + trans->block_rsv = NULL; + if (!list_empty(&trans->blk_rsv_list)) { + struct btrfs_trans_rsv_item *item; + item = list_entry(trans->blk_rsv_list.next, + struct btrfs_trans_rsv_item, list); + list_del_init(&item->list); + trans->block_rsv = item->rsv; + kfree(item); + } return 0; } @@ -558,6 +574,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, count++; } btrfs_trans_release_metadata(trans, root); + BUG_ON(!list_empty(&trans->blk_rsv_list)); trans->block_rsv = NULL; sb_end_intwrite(root->fs_info->sb); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 59adf55..7fa11b7 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -57,7 +57,6 @@ struct btrfs_trans_handle { unsigned long delayed_ref_updates; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; - struct btrfs_block_rsv *orig_rsv; int aborted; int adding_csums; /* @@ -68,6 +67,12 @@ struct btrfs_trans_handle { struct btrfs_root *root; struct seq_list delayed_ref_elem; struct list_head qgroup_ref_list; + struct list_head blk_rsv_list; +}; + +struct btrfs_trans_rsv_item { + struct btrfs_block_rsv *rsv; + struct list_head list; }; struct btrfs_pending_snapshot {
In some workloads we have nested joining transaction operations, eg. run_delalloc_nocow btrfs_join_transaction cow_file_range btrfs_join_transaction it can be a serious bug since each trans handler has only two block_rsv, orig_rsv and block_rsv, which means we may lose our first block_rsv after two joining transaction operations: 1) btrfs_start_transaction trans->block_rsv = A 2) btrfs_join_transaction trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now A trans->block_rsv = B 3) btrfs_join_transaction trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now B trans->block_rsv = C ... This uses a list of block_rsv instead so that we can either a) PUSH the old one into the list and use a new one in joining, or b) POP the old one in ending this transaction. Signed-off-by: Liu Bo <bo.li.liu@oracle.com> --- fs/btrfs/transaction.c | 25 +++++++++++++++++++++---- fs/btrfs/transaction.h | 7 ++++++- 2 files changed, 27 insertions(+), 5 deletions(-)