Message ID | 1307479449-14765-3-git-send-email-josef@redhat.com (mailing list archive) |
---|---|
State | New, archived |
On Tue, Jun 07, 2011 at 04:44:09PM -0400, Josef Bacik wrote:
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2932,6 +2932,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
>  	found->full = 0;
>  	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
>  	found->chunk_alloc = 0;
> +	found->flush = 0;
> +	init_waitqueue_head(&found->wait);
>  	*space_info = found;
>  	list_add_rcu(&found->list, &info->space_info);
>  	atomic_set(&found->caching_threads, 0);
> @@ -3314,9 +3316,13 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
>  	if (reserved == 0)
>  		return 0;
>
> -	/* nothing to shrink - nothing to reclaim */
> -	if (root->fs_info->delalloc_bytes == 0)
> +	smp_mb();

Can you please explain why you use the barrier here? (And add that
explanation as a comment.)

It's for the delalloc_bytes test, right? That counter is modified in
btrfs_set_bit_hook/btrfs_clear_bit_hook, protected by a spinlock, and is
the sum of delalloc_bytes over all delalloc inodes. If the counter is
nonzero, fs_info->delalloc_inodes is expected to be nonempty, but the
list is not touched in this function; it is only touched indirectly via
writeback_inodes_sb_nr_if_idle and the respective .writepages callback,
ending up in __extent_writepage, which starts messing with delalloc.

I think it's possible to reach a state where the counter is nonzero but
the delalloc_inodes list is empty. Then writeback will just skip the
delalloc work in this round and process it later. Even with a barrier in
place:

btrfs_set_bit_hook:
	# counter increased, a barrier will assure len is obtained from
	# delalloc_bytes in shrink_delalloc
1349	root->fs_info->delalloc_bytes += len;
	# but fs_info->delalloc_inodes may be empty
1350	if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1351		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1352			      &root->fs_info->delalloc_inodes);
1353	}

Although this does not seem to crash or cause corruption, I suggest
using the spinlock as the synchronization mechanism to protect reading
fs_info->delalloc_bytes.

david

> +	if (root->fs_info->delalloc_bytes == 0) {
> +		if (trans)
> +			return 0;
> +		btrfs_wait_ordered_extents(root, 0, 0);
>  		return 0;
> +	}
>
>  	max_reclaim = min(reserved, to_reclaim);
>
> @@ -3360,6 +3366,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
>  		}
>
>  	}
> +	if (reclaimed >= to_reclaim && !trans)
> +		btrfs_wait_ordered_extents(root, 0, 0);
>  	return reclaimed >= to_reclaim;
>  }
>
> @@ -3384,15 +3392,36 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
>  	u64 num_bytes = orig_bytes;
>  	int retries = 0;
>  	int ret = 0;
> -	bool reserved = false;
>  	bool committed = false;
> +	bool flushing = false;
>
>  again:
> -	ret = -ENOSPC;
> -	if (reserved)
> -		num_bytes = 0;
> -
> +	ret = 0;
>  	spin_lock(&space_info->lock);
> +	/*
> +	 * We only want to wait if somebody other than us is flushing and we
> +	 * are actually allowed to flush.
> +	 */
> +	while (flush && !flushing && space_info->flush) {
> +		spin_unlock(&space_info->lock);
> +		/*
> +		 * If we have a trans handle we can't wait because the flusher
> +		 * may have to commit the transaction, which would mean we
> +		 * would deadlock since we are waiting for the flusher to
> +		 * finish, but hold the current transaction open.
> +		 */
> +		if (trans)
> +			return -EAGAIN;
> +		ret = wait_event_interruptible(space_info->wait,
> +					       !space_info->flush);
> +		/* Must have been interrupted, return */
> +		if (ret)
> +			return -EINTR;
> +
> +		spin_lock(&space_info->lock);
> +	}
> +
> +	ret = -ENOSPC;
>  	unused = space_info->bytes_used + space_info->bytes_reserved +
>  		 space_info->bytes_pinned + space_info->bytes_readonly +
>  		 space_info->bytes_may_use;
> @@ -3407,8 +3436,7 @@ again:
>  	if (unused <= space_info->total_bytes) {
>  		unused = space_info->total_bytes - unused;
>  		if (unused >= num_bytes) {
> -			if (!reserved)
> -				space_info->bytes_may_use += orig_bytes;
> +			space_info->bytes_may_use += orig_bytes;
>  			ret = 0;
>  		} else {
>  			/*
> @@ -3433,17 +3461,14 @@ again:
>  	 * to reclaim space we can actually use it instead of somebody else
>  	 * stealing it from us.
>  	 */
> -	if (ret && !reserved) {
> -		space_info->bytes_may_use += orig_bytes;
> -		reserved = true;
> +	if (ret && flush) {
> +		flushing = true;
> +		space_info->flush = 1;
>  	}
>
>  	spin_unlock(&space_info->lock);
>
> -	if (!ret)
> -		return 0;
> -
> -	if (!flush)
> +	if (!ret || !flush)
>  		goto out;
>
>  	/*
> @@ -3451,9 +3476,7 @@ again:
>  	 * metadata until after the IO is completed.
>  	 */
>  	ret = shrink_delalloc(trans, root, num_bytes, 1);
> -	if (ret > 0)
> -		return 0;
> -	else if (ret < 0)
> +	if (ret < 0)
>  		goto out;
>
>  	/*
> @@ -3466,11 +3489,11 @@ again:
>  		goto again;
>  	}
>
> -	spin_lock(&space_info->lock);
>  	/*
>  	 * Not enough space to be reclaimed, don't bother committing the
>  	 * transaction.
>  	 */
> +	spin_lock(&space_info->lock);
>  	if (space_info->bytes_pinned < orig_bytes)
>  		ret = -ENOSPC;
>  	spin_unlock(&space_info->lock);
> @@ -3493,12 +3516,12 @@ again:
>  	}
>
>  out:
> -	if (reserved) {
> +	if (flushing) {
>  		spin_lock(&space_info->lock);
> -		space_info->bytes_may_use -= orig_bytes;
> +		space_info->flush = 0;
> +		wake_up_all(&space_info->wait);
>  		spin_unlock(&space_info->lock);
>  	}
> -
>  	return ret;
>  }
>
> --
> 1.7.2.3
On 06/09/2011 05:45 AM, David Sterba wrote:
> On Tue, Jun 07, 2011 at 04:44:09PM -0400, Josef Bacik wrote:
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -2932,6 +2932,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
>>  	found->full = 0;
>>  	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
>>  	found->chunk_alloc = 0;
>> +	found->flush = 0;
>> +	init_waitqueue_head(&found->wait);
>>  	*space_info = found;
>>  	list_add_rcu(&found->list, &info->space_info);
>>  	atomic_set(&found->caching_threads, 0);
>> @@ -3314,9 +3316,13 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
>>  	if (reserved == 0)
>>  		return 0;
>>
>> -	/* nothing to shrink - nothing to reclaim */
>> -	if (root->fs_info->delalloc_bytes == 0)
>> +	smp_mb();
>
> Can you please explain why you use the barrier here? (And add that
> explanation as a comment.)
>
> It's for the delalloc_bytes test, right? That counter is modified in
> btrfs_set_bit_hook/btrfs_clear_bit_hook, protected by a spinlock, and is
> the sum of delalloc_bytes over all delalloc inodes. If the counter is
> nonzero, fs_info->delalloc_inodes is expected to be nonempty, but the
> list is not touched in this function; it is only touched indirectly via
> writeback_inodes_sb_nr_if_idle and the respective .writepages callback,
> ending up in __extent_writepage, which starts messing with delalloc.
>
> I think it's possible to reach a state where the counter is nonzero but
> the delalloc_inodes list is empty. Then writeback will just skip the
> delalloc work in this round and process it later. Even with a barrier in
> place:
>
> btrfs_set_bit_hook:
> 	# counter increased, a barrier will assure len is obtained from
> 	# delalloc_bytes in shrink_delalloc
> 1349	root->fs_info->delalloc_bytes += len;
> 	# but fs_info->delalloc_inodes may be empty
> 1350	if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
> 1351		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
> 1352			      &root->fs_info->delalloc_inodes);
> 1353	}
>
> Although this does not seem to crash or cause corruption, I suggest
> using the spinlock as the synchronization mechanism to protect reading
> fs_info->delalloc_bytes.

We're just looking to optimize the case where there is no delalloc. If
there is delalloc we want to start the flushing thread. Is there a
possibility that we could have delalloc_bytes set but nothing on the
list yet? Sure, but by the time we actually get to writing out the dirty
inodes it should be on the list. And if not, we loop several times, so
at some point it will be on the list and we will be good to go. We're
not trying to be perfect here, we're trying to be fast :). Thanks,

Josef
On Thu, Jun 09, 2011 at 10:00:42AM -0400, Josef Bacik wrote:
> We're not trying to be perfect here, we're trying to be fast :).
Be even faster with smp_rmb() :)
david
On Thu, Jun 09, 2011 at 08:00:15PM +0200, David Sterba wrote:
> On Thu, Jun 09, 2011 at 10:00:42AM -0400, Josef Bacik wrote:
> > We're not trying to be perfect here, we're trying to be fast :).
>
> Be even faster with smp_rmb() :)

Arne made me think about this again. Let's analyze it in more detail:

The read side, if (delalloc_bytes), uses a full barrier, which forces
the issuing CPU to complete its pending reads and writes before the
test, so the 'if' sees a reasonably fresh value.

However, there is no pairing write barrier; the write (delalloc_bytes
+= len) becomes visible at some point in time, completely unsynchronized
with the read side.

The smp_mb() therefore has none of the desired synchronization effect.
Moreover, it has a performance cost.

Doing it right with barriers would mean adding smp_rmb() before the
if (...) and smp_wmb() after the "delalloc_bytes +=", but only if the
variable were synchronized solely via barriers. That is not our case:
there is the spinlock.

As strict correctness is not needed here, you admit that delalloc_bytes
might not correspond to the state of fs_info->delalloc_inodes and that
this is not a problem. Fine. But then you do not need the smp_mb(); the
value of delalloc_bytes will be "random" (i.e. unsynchronized), with or
without the barrier. Please drop it from the patch.

david
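[For reference, the pairing described above is the classic flag/data
pattern. The following is an illustration only, not btrfs code; the
names data, ready, writer and reader are made up, and it assumes a
kernel context where smp_wmb()/smp_rmb() are available.]

```c
/*
 * Illustration: barriers only help in pairs.  The writer orders its two
 * stores with smp_wmb(); the reader orders its two loads with smp_rmb().
 * A lone full barrier on the read side, with nothing pairing it on the
 * write side, gives no such guarantee -- which is the point above.
 */
static int data;
static int ready;

static void writer(void)
{
	data = 42;
	smp_wmb();		/* publish data before setting the flag */
	ready = 1;
}

static int reader(void)
{
	if (!ready)
		return -1;
	smp_rmb();		/* order the flag load before the data load */
	return data;
}
```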
On 06/10/2011 01:47 PM, David Sterba wrote:
> On Thu, Jun 09, 2011 at 08:00:15PM +0200, David Sterba wrote:
>> On Thu, Jun 09, 2011 at 10:00:42AM -0400, Josef Bacik wrote:
>>> We're not trying to be perfect here, we're trying to be fast :).
>>
>> Be even faster with smp_rmb() :)
>
> Arne made me think about this again. Let's analyze it in more detail:
>
> The read side, if (delalloc_bytes), uses a full barrier, which forces
> the issuing CPU to complete its pending reads and writes before the
> test, so the 'if' sees a reasonably fresh value.
>
> However, there is no pairing write barrier; the write (delalloc_bytes
> += len) becomes visible at some point in time, completely unsynchronized
> with the read side.
>
> The smp_mb() therefore has none of the desired synchronization effect.
> Moreover, it has a performance cost.
>
> Doing it right with barriers would mean adding smp_rmb() before the
> if (...) and smp_wmb() after the "delalloc_bytes +=", but only if the
> variable were synchronized solely via barriers. That is not our case:
> there is the spinlock.
>
> As strict correctness is not needed here, you admit that delalloc_bytes
> might not correspond to the state of fs_info->delalloc_inodes and that
> this is not a problem. Fine. But then you do not need the smp_mb(); the
> value of delalloc_bytes will be "random" (i.e. unsynchronized), with or
> without the barrier. Please drop it from the patch.

I just used the spin lock.

Josef
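["Using the spin lock" presumably means reading the counter under the
same lock that btrfs_set_bit_hook/btrfs_clear_bit_hook take when updating
it. A minimal sketch of that shape, not the actual follow-up patch; the
helper name read_delalloc_bytes is made up, and it assumes the lock in
question is fs_info->delalloc_lock.]

```c
/*
 * Sketch only.  Assumes delalloc_bytes is always updated under
 * fs_info->delalloc_lock, so taking the same lock for the read gives a
 * consistent snapshot without any explicit memory barriers.
 */
static inline u64 read_delalloc_bytes(struct btrfs_fs_info *fs_info)
{
	u64 bytes;

	spin_lock(&fs_info->delalloc_lock);
	bytes = fs_info->delalloc_bytes;
	spin_unlock(&fs_info->delalloc_lock);

	return bytes;
}

/* In shrink_delalloc() the test would then read something like:
 *	if (read_delalloc_bytes(root->fs_info) == 0) { ... }
 */
```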
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6034a23..8857d82 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -756,6 +756,8 @@ struct btrfs_space_info {
 				   chunks for this space */
 	unsigned int chunk_alloc:1;	/* set if we are allocating a chunk */
+	unsigned int flush:1;		/* set if we are trying to make space */
+
 	unsigned int force_alloc;	/* set if we need to force a chunk
 					   alloc for this space */
@@ -766,6 +768,7 @@ struct btrfs_space_info {
 	spinlock_t lock;
 	struct rw_semaphore groups_sem;
 	atomic_t caching_threads;
+	wait_queue_head_t wait;
 };
 
 struct btrfs_block_rsv {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b1c3ff7..d86f7c5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2932,6 +2932,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->full = 0;
 	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
 	found->chunk_alloc = 0;
+	found->flush = 0;
+	init_waitqueue_head(&found->wait);
 	*space_info = found;
 	list_add_rcu(&found->list, &info->space_info);
 	atomic_set(&found->caching_threads, 0);
@@ -3314,9 +3316,13 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	if (reserved == 0)
 		return 0;
 
-	/* nothing to shrink - nothing to reclaim */
-	if (root->fs_info->delalloc_bytes == 0)
+	smp_mb();
+	if (root->fs_info->delalloc_bytes == 0) {
+		if (trans)
+			return 0;
+		btrfs_wait_ordered_extents(root, 0, 0);
 		return 0;
+	}
 
 	max_reclaim = min(reserved, to_reclaim);
 
@@ -3360,6 +3366,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		}
 
 	}
+	if (reclaimed >= to_reclaim && !trans)
+		btrfs_wait_ordered_extents(root, 0, 0);
 	return reclaimed >= to_reclaim;
 }
 
@@ -3384,15 +3392,36 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
 	u64 num_bytes = orig_bytes;
 	int retries = 0;
 	int ret = 0;
-	bool reserved = false;
 	bool committed = false;
+	bool flushing = false;
 
 again:
-	ret = -ENOSPC;
-	if (reserved)
-		num_bytes = 0;
-
+	ret = 0;
 	spin_lock(&space_info->lock);
+	/*
+	 * We only want to wait if somebody other than us is flushing and we
+	 * are actually allowed to flush.
+	 */
+	while (flush && !flushing && space_info->flush) {
+		spin_unlock(&space_info->lock);
+		/*
+		 * If we have a trans handle we can't wait because the flusher
+		 * may have to commit the transaction, which would mean we
+		 * would deadlock since we are waiting for the flusher to
+		 * finish, but hold the current transaction open.
+		 */
+		if (trans)
+			return -EAGAIN;
+		ret = wait_event_interruptible(space_info->wait,
+					       !space_info->flush);
+		/* Must have been interrupted, return */
+		if (ret)
+			return -EINTR;
+
+		spin_lock(&space_info->lock);
+	}
+
+	ret = -ENOSPC;
 	unused = space_info->bytes_used + space_info->bytes_reserved +
 		 space_info->bytes_pinned + space_info->bytes_readonly +
 		 space_info->bytes_may_use;
@@ -3407,8 +3436,7 @@ again:
 	if (unused <= space_info->total_bytes) {
 		unused = space_info->total_bytes - unused;
 		if (unused >= num_bytes) {
-			if (!reserved)
-				space_info->bytes_may_use += orig_bytes;
+			space_info->bytes_may_use += orig_bytes;
 			ret = 0;
 		} else {
 			/*
@@ -3433,17 +3461,14 @@ again:
 	 * to reclaim space we can actually use it instead of somebody else
 	 * stealing it from us.
 	 */
-	if (ret && !reserved) {
-		space_info->bytes_may_use += orig_bytes;
-		reserved = true;
+	if (ret && flush) {
+		flushing = true;
+		space_info->flush = 1;
 	}
 
 	spin_unlock(&space_info->lock);
 
-	if (!ret)
-		return 0;
-
-	if (!flush)
+	if (!ret || !flush)
 		goto out;
 
 	/*
@@ -3451,9 +3476,7 @@ again:
 	 * metadata until after the IO is completed.
 	 */
 	ret = shrink_delalloc(trans, root, num_bytes, 1);
-	if (ret > 0)
-		return 0;
-	else if (ret < 0)
+	if (ret < 0)
 		goto out;
 
 	/*
@@ -3466,11 +3489,11 @@ again:
 		goto again;
 	}
 
-	spin_lock(&space_info->lock);
 	/*
 	 * Not enough space to be reclaimed, don't bother committing the
 	 * transaction.
 	 */
+	spin_lock(&space_info->lock);
 	if (space_info->bytes_pinned < orig_bytes)
 		ret = -ENOSPC;
 	spin_unlock(&space_info->lock);
@@ -3493,12 +3516,12 @@ again:
 	}
 
 out:
-	if (reserved) {
+	if (flushing) {
 		spin_lock(&space_info->lock);
-		space_info->bytes_may_use -= orig_bytes;
+		space_info->flush = 0;
+		wake_up_all(&space_info->wait);
 		spin_unlock(&space_info->lock);
 	}
-
 	return ret;
 }
We keep having problems with early enospc, and that's because our method of
making space is inherently racy. The problem is we can have one guy trying to
make space for himself, and in the meantime people come in and steal his
reservation. In order to stop this we make a waitqueue and put anybody who
comes into reserve_metadata_bytes on that waitqueue if somebody is trying to
make more space. Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       |    3 ++
 fs/btrfs/extent-tree.c |   69 ++++++++++++++++++++++++++++++++----------------
 2 files changed, 49 insertions(+), 23 deletions(-)
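[Conceptually, the serialization the patch adds boils down to a single
"flusher" at a time per space_info, with everyone else parked on the new
waitqueue. The following is a stripped-down sketch of that handshake
only; the function name reserve_sketch is made up, and error handling,
the transaction special cases and the actual reservation math from
reserve_metadata_bytes() are omitted.]

```c
/*
 * Illustration of the flush/wait handshake introduced by the patch,
 * using the new space_info->flush flag and space_info->wait queue.
 */
static int reserve_sketch(struct btrfs_space_info *info, bool flush)
{
	int ret;

	spin_lock(&info->lock);

	/* If someone else is already making space, wait for them to finish. */
	while (flush && info->flush) {
		spin_unlock(&info->lock);
		ret = wait_event_interruptible(info->wait, !info->flush);
		if (ret)
			return -EINTR;
		spin_lock(&info->lock);
	}

	/* ... try the reservation here; on failure, become the flusher ... */
	info->flush = 1;
	spin_unlock(&info->lock);

	/* ... flush delalloc / commit the transaction outside the lock ... */

	/* Done flushing: clear the flag and let the waiters retry. */
	spin_lock(&info->lock);
	info->flush = 0;
	wake_up_all(&info->wait);
	spin_unlock(&info->lock);

	return 0;
}
```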