[2/2] Btrfs: serialize flushers in reserve_metadata_bytes

Message ID 1307479449-14765-3-git-send-email-josef@redhat.com (mailing list archive)
State New, archived

Commit Message

Josef Bacik June 7, 2011, 8:44 p.m. UTC
We keep having problems with early ENOSPC, and that's because our method of
making space is inherently racy.  The problem is that one task can be trying
to make space for itself while, in the meantime, others come in and steal its
reservation.  In order to stop this we add a waitqueue and put anybody who
comes into reserve_metadata_bytes on that waitqueue if somebody else is
already trying to make more space.  Thanks,
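
In outline, the new logic in reserve_metadata_bytes looks like this (a
simplified sketch of the change below, error handling trimmed):

	spin_lock(&space_info->lock);
	/* wait while somebody else is flushing, if we may flush at all */
	while (flush && !flushing && space_info->flush) {
		spin_unlock(&space_info->lock);
		/* can't wait with a trans handle, the flusher may need
		 * to commit the transaction we hold open */
		if (trans)
			return -EAGAIN;
		if (wait_event_interruptible(space_info->wait,
					     !space_info->flush))
			return -EINTR;
		spin_lock(&space_info->lock);
	}
	...
	/* the flusher, once done, releases the waiters */
	space_info->flush = 0;
	wake_up_all(&space_info->wait);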

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       |    3 ++
 fs/btrfs/extent-tree.c |   69 ++++++++++++++++++++++++++++++++----------------
 2 files changed, 49 insertions(+), 23 deletions(-)

Comments

David Sterba June 9, 2011, 9:45 a.m. UTC | #1
On Tue, Jun 07, 2011 at 04:44:09PM -0400, Josef Bacik wrote:
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2932,6 +2932,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
>  	found->full = 0;
>  	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
>  	found->chunk_alloc = 0;
> +	found->flush = 0;
> +	init_waitqueue_head(&found->wait);
>  	*space_info = found;
>  	list_add_rcu(&found->list, &info->space_info);
>  	atomic_set(&found->caching_threads, 0);
> @@ -3314,9 +3316,13 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
>  	if (reserved == 0)
>  		return 0;
>  
> -	/* nothing to shrink - nothing to reclaim */
> -	if (root->fs_info->delalloc_bytes == 0)
> +	smp_mb();

can you please explain why you use the barrier here? (and add that
explanation as a comment)

it's for the delalloc_bytes test, right? this is modified in
btrfs_set_bit_hook/btrfs_clear_bit_hook, protected by a spinlock.
the counter is the sum of the per-inode delalloc_bytes over all
delalloc inodes. if the counter is nonzero, then fs_info->delalloc_inodes
is expected to be nonempty. the list is not touched directly in this
function, only indirectly via writeback_inodes_sb_nr_if_idle and the
respective .writepages callback, ending up in __extent_writepage, which
starts processing delalloc.

I think it's possible to reach a state where the counter is nonzero but
the delalloc_inodes list is empty; writeback will then just skip the
delalloc work in this round and process it later. this can happen even
with a barrier in place:

btrfs_set_bit_hook:
# counter increased; a barrier will ensure the updated value of
# delalloc_bytes is seen in shrink_delalloc
1349                 root->fs_info->delalloc_bytes += len;
# but fs_info->delalloc_list may be empty
1350                 if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1351				list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1352				&root->fs_info->delalloc_inodes);
1353                 }

although this does not seem to crash or cause corruption, I suggest
using the spinlock as the synchronization mechanism to protect reading
fs_info->delalloc_bytes.
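
for example (just a sketch, assuming fs_info->delalloc_lock is the
spinlock that btrfs_set_bit_hook/btrfs_clear_bit_hook already take):

	u64 delalloc_bytes;

	/* read the counter under the same lock the hooks use to update it */
	spin_lock(&root->fs_info->delalloc_lock);
	delalloc_bytes = root->fs_info->delalloc_bytes;
	spin_unlock(&root->fs_info->delalloc_lock);

	/* nothing to shrink - nothing to reclaim */
	if (delalloc_bytes == 0)
		return 0;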


david



> +	if (root->fs_info->delalloc_bytes == 0) {
> +		if (trans)
> +			return 0;
> +		btrfs_wait_ordered_extents(root, 0, 0);
>  		return 0;
> +	}
>  
>  	max_reclaim = min(reserved, to_reclaim);
>  
> @@ -3360,6 +3366,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
>  		}
>  
>  	}
> +	if (reclaimed >= to_reclaim && !trans)
> +		btrfs_wait_ordered_extents(root, 0, 0);
>  	return reclaimed >= to_reclaim;
>  }
>  
> @@ -3384,15 +3392,36 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
>  	u64 num_bytes = orig_bytes;
>  	int retries = 0;
>  	int ret = 0;
> -	bool reserved = false;
>  	bool committed = false;
> +	bool flushing = false;
>  
>  again:
> -	ret = -ENOSPC;
> -	if (reserved)
> -		num_bytes = 0;
> -
> +	ret = 0;
>  	spin_lock(&space_info->lock);
> +	/*
> +	 * We only want to wait if somebody other than us is flushing and we are
> +	 * actually allowed to flush.
> +	 */
> +	while (flush && !flushing && space_info->flush) {
> +		spin_unlock(&space_info->lock);
> +		/*
> +		 * If we have a trans handle we can't wait because the flusher
> +		 * may have to commit the transaction, which would mean we would
> +		 * deadlock since we are waiting for the flusher to finish, but
> +		 * hold the current transaction open.
> +		 */
> +		if (trans)
> +			return -EAGAIN;
> +		ret = wait_event_interruptible(space_info->wait,
> +					       !space_info->flush);
> +		/* Must have been interrupted, return */
> +		if (ret)
> +			return -EINTR;
> +
> +		spin_lock(&space_info->lock);
> +	}
> +
> +	ret = -ENOSPC;
>  	unused = space_info->bytes_used + space_info->bytes_reserved +
>  		 space_info->bytes_pinned + space_info->bytes_readonly +
>  		 space_info->bytes_may_use;
> @@ -3407,8 +3436,7 @@ again:
>  	if (unused <= space_info->total_bytes) {
>  		unused = space_info->total_bytes - unused;
>  		if (unused >= num_bytes) {
> -			if (!reserved)
> -				space_info->bytes_may_use += orig_bytes;
> +			space_info->bytes_may_use += orig_bytes;
>  			ret = 0;
>  		} else {
>  			/*
> @@ -3433,17 +3461,14 @@ again:
>  	 * to reclaim space we can actually use it instead of somebody else
>  	 * stealing it from us.
>  	 */
> -	if (ret && !reserved) {
> -		space_info->bytes_may_use += orig_bytes;
> -		reserved = true;
> +	if (ret && flush) {
> +		flushing = true;
> +		space_info->flush = 1;
>  	}
>  
>  	spin_unlock(&space_info->lock);
>  
> -	if (!ret)
> -		return 0;
> -
> -	if (!flush)
> +	if (!ret || !flush)
>  		goto out;
>  
>  	/*
> @@ -3451,9 +3476,7 @@ again:
>  	 * metadata until after the IO is completed.
>  	 */
>  	ret = shrink_delalloc(trans, root, num_bytes, 1);
> -	if (ret > 0)
> -		return 0;
> -	else if (ret < 0)
> +	if (ret < 0)
>  		goto out;
>  
>  	/*
> @@ -3466,11 +3489,11 @@ again:
>  		goto again;
>  	}
>  
> -	spin_lock(&space_info->lock);
>  	/*
>  	 * Not enough space to be reclaimed, don't bother committing the
>  	 * transaction.
>  	 */
> +	spin_lock(&space_info->lock);
>  	if (space_info->bytes_pinned < orig_bytes)
>  		ret = -ENOSPC;
>  	spin_unlock(&space_info->lock);
> @@ -3493,12 +3516,12 @@ again:
>  	}
>  
>  out:
> -	if (reserved) {
> +	if (flushing) {
>  		spin_lock(&space_info->lock);
> -		space_info->bytes_may_use -= orig_bytes;
> +		space_info->flush = 0;
> +		wake_up_all(&space_info->wait);
>  		spin_unlock(&space_info->lock);
>  	}
> -
>  	return ret;
>  }
>  
> -- 
> 1.7.2.3
> 
Josef Bacik June 9, 2011, 2 p.m. UTC | #2
On 06/09/2011 05:45 AM, David Sterba wrote:
> On Tue, Jun 07, 2011 at 04:44:09PM -0400, Josef Bacik wrote:
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -2932,6 +2932,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
>>  	found->full = 0;
>>  	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
>>  	found->chunk_alloc = 0;
>> +	found->flush = 0;
>> +	init_waitqueue_head(&found->wait);
>>  	*space_info = found;
>>  	list_add_rcu(&found->list, &info->space_info);
>>  	atomic_set(&found->caching_threads, 0);
>> @@ -3314,9 +3316,13 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
>>  	if (reserved == 0)
>>  		return 0;
>>  
>> -	/* nothing to shrink - nothing to reclaim */
>> -	if (root->fs_info->delalloc_bytes == 0)
>> +	smp_mb();
> 
> can you please explain why you use the barrier here? (and add that
> explanation as a comment)
> 
> it's for the delalloc_bytes test, right? this is modified in
> btrfs_set_bit_hook/btrfs_clear_bit_hook, protected by a spinlock.
> the counter is the sum of the per-inode delalloc_bytes over all
> delalloc inodes. if the counter is nonzero, then fs_info->delalloc_inodes
> is expected to be nonempty. the list is not touched directly in this
> function, only indirectly via writeback_inodes_sb_nr_if_idle and the
> respective .writepages callback, ending up in __extent_writepage, which
> starts processing delalloc.
> 
> I think it's possible to reach a state where the counter is nonzero but
> the delalloc_inodes list is empty; writeback will then just skip the
> delalloc work in this round and process it later. this can happen even
> with a barrier in place:
> 
> btrfs_set_bit_hook:
> # counter increased; a barrier will ensure the updated value of
> # delalloc_bytes is seen in shrink_delalloc
> 1349                 root->fs_info->delalloc_bytes += len;
> # but fs_info->delalloc_list may be empty
> 1350                 if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
> 1351				list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
> 1352				&root->fs_info->delalloc_inodes);
> 1353                 }
> 
> although this does not seem to crash or cause corruption, I suggest
> using the spinlock as the synchronization mechanism to protect reading
> fs_info->delalloc_bytes.
> 

We're just looking to optimize the case where there is no delalloc.  If
there is delalloc we want to start the flushing thread.  Is there a
possibility that we could have delalloc_bytes set but nothing on the
list yet?  Sure, but by the time we actually get to writing out
dirty inodes it should be on the list.  And if not, we loop several
times, so at some point it will be on the list and we will be good to
go.  We're not trying to be perfect here, we're trying to be fast :).
Thanks,

Josef
David Sterba June 9, 2011, 6 p.m. UTC | #3
On Thu, Jun 09, 2011 at 10:00:42AM -0400, Josef Bacik wrote:
> We're not trying to be perfect here, we're trying to be fast :).

Be even faster with smp_rmb() :)


david
David Sterba June 10, 2011, 5:47 p.m. UTC | #4
On Thu, Jun 09, 2011 at 08:00:15PM +0200, David Sterba wrote:
> On Thu, Jun 09, 2011 at 10:00:42AM -0400, Josef Bacik wrote:
> > We're not trying to be perfect here, we're trying to be fast :).
> 
> Be even faster with smp_rmb() :)

Arne made me think about this again. Let's analyze it in more detail:

The read side, if (delalloc_bytes), uses a full barrier, which will
force all CPUs to flush pending reads and writes. This will ensure the
'if' sees a fresh value.

However, there is no pairing write barrier, so the write
(delalloc_bytes += len) will become visible at some point in time,
completely unsynchronized with the read side.

The smp_mb barrier therefore has no desired synchronization effect.
Moreover, it incurs a performance hit.


Doing it right with barriers would mean adding an smp_rmb before the
if (...) and an smp_wmb after the "delalloc_bytes =", but only if the
variable were solely synchronized via barriers. That is not our case;
there is the spinlock.
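
ie. if barriers alone were the synchronization, the pairing would look
like this (illustrative sketch only; in our case the spinlock already
provides this ordering):

	/* writer, btrfs_set_bit_hook */
	root->fs_info->delalloc_bytes += len;
	smp_wmb();	/* pairs with the smp_rmb on the read side */

	/* reader, shrink_delalloc */
	smp_rmb();	/* pairs with the smp_wmb above */
	if (root->fs_info->delalloc_bytes == 0)
		return 0;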

As strict correctness is not needed here, you admit that delalloc_bytes
might not correspond to the state of fs_info->delalloc_inodes and that
this is not a problem. Fine. But then you do not need the smp_mb either:
the value of delalloc_bytes will be "random" (ie. unsynchronized) with
or without the barrier. Please drop it from the patch.


david
Josef Bacik June 10, 2011, 5:49 p.m. UTC | #5
On 06/10/2011 01:47 PM, David Sterba wrote:
> On Thu, Jun 09, 2011 at 08:00:15PM +0200, David Sterba wrote:
>> On Thu, Jun 09, 2011 at 10:00:42AM -0400, Josef Bacik wrote:
>>> We're not trying to be perfect here, we're trying to be fast :).
>>
>> Be even faster with smp_rmb() :)
> 
> Arne made me think about this again. Let's analyze it in more detail:
> 
> The read side, if (delalloc_bytes), uses a full barrier, which will
> force all CPUs to flush pending reads and writes. This will ensure the
> 'if' sees a fresh value.
> 
> However, there is no pairing write barrier, so the write
> (delalloc_bytes += len) will become visible at some point in time,
> completely unsynchronized with the read side.
> 
> The smp_mb barrier therefore has no desired synchronization effect.
> Moreover, it incurs a performance hit.
> 
> 
> Doing it right with barriers would mean adding an smp_rmb before the
> if (...) and an smp_wmb after the "delalloc_bytes =", but only if the
> variable were solely synchronized via barriers. That is not our case;
> there is the spinlock.
> 
> As strict correctness is not needed here, you admit that delalloc_bytes
> might not correspond to the state of fs_info->delalloc_inodes and that
> this is not a problem. Fine. But then you do not need the smp_mb either:
> the value of delalloc_bytes will be "random" (ie. unsynchronized) with
> or without the barrier. Please drop it from the patch.

I just used the spin lock.

Josef

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6034a23..8857d82 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -756,6 +756,8 @@  struct btrfs_space_info {
 				   chunks for this space */
 	unsigned int chunk_alloc:1;	/* set if we are allocating a chunk */
 
+	unsigned int flush:1;		/* set if we are trying to make space */
+
 	unsigned int force_alloc;	/* set if we need to force a chunk
 					   alloc for this space */
 
@@ -766,6 +768,7 @@  struct btrfs_space_info {
 	spinlock_t lock;
 	struct rw_semaphore groups_sem;
 	atomic_t caching_threads;
+	wait_queue_head_t wait;
 };
 
 struct btrfs_block_rsv {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b1c3ff7..d86f7c5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2932,6 +2932,8 @@  static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->full = 0;
 	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
 	found->chunk_alloc = 0;
+	found->flush = 0;
+	init_waitqueue_head(&found->wait);
 	*space_info = found;
 	list_add_rcu(&found->list, &info->space_info);
 	atomic_set(&found->caching_threads, 0);
@@ -3314,9 +3316,13 @@  static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	if (reserved == 0)
 		return 0;
 
-	/* nothing to shrink - nothing to reclaim */
-	if (root->fs_info->delalloc_bytes == 0)
+	smp_mb();
+	if (root->fs_info->delalloc_bytes == 0) {
+		if (trans)
+			return 0;
+		btrfs_wait_ordered_extents(root, 0, 0);
 		return 0;
+	}
 
 	max_reclaim = min(reserved, to_reclaim);
 
@@ -3360,6 +3366,8 @@  static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		}
 
 	}
+	if (reclaimed >= to_reclaim && !trans)
+		btrfs_wait_ordered_extents(root, 0, 0);
 	return reclaimed >= to_reclaim;
 }
 
@@ -3384,15 +3392,36 @@  static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
 	u64 num_bytes = orig_bytes;
 	int retries = 0;
 	int ret = 0;
-	bool reserved = false;
 	bool committed = false;
+	bool flushing = false;
 
 again:
-	ret = -ENOSPC;
-	if (reserved)
-		num_bytes = 0;
-
+	ret = 0;
 	spin_lock(&space_info->lock);
+	/*
+	 * We only want to wait if somebody other than us is flushing and we are
+	 * actually allowed to flush.
+	 */
+	while (flush && !flushing && space_info->flush) {
+		spin_unlock(&space_info->lock);
+		/*
+		 * If we have a trans handle we can't wait because the flusher
+		 * may have to commit the transaction, which would mean we would
+		 * deadlock since we are waiting for the flusher to finish, but
+		 * hold the current transaction open.
+		 */
+		if (trans)
+			return -EAGAIN;
+		ret = wait_event_interruptible(space_info->wait,
+					       !space_info->flush);
+		/* Must have been interrupted, return */
+		if (ret)
+			return -EINTR;
+
+		spin_lock(&space_info->lock);
+	}
+
+	ret = -ENOSPC;
 	unused = space_info->bytes_used + space_info->bytes_reserved +
 		 space_info->bytes_pinned + space_info->bytes_readonly +
 		 space_info->bytes_may_use;
@@ -3407,8 +3436,7 @@  again:
 	if (unused <= space_info->total_bytes) {
 		unused = space_info->total_bytes - unused;
 		if (unused >= num_bytes) {
-			if (!reserved)
-				space_info->bytes_may_use += orig_bytes;
+			space_info->bytes_may_use += orig_bytes;
 			ret = 0;
 		} else {
 			/*
@@ -3433,17 +3461,14 @@  again:
 	 * to reclaim space we can actually use it instead of somebody else
 	 * stealing it from us.
 	 */
-	if (ret && !reserved) {
-		space_info->bytes_may_use += orig_bytes;
-		reserved = true;
+	if (ret && flush) {
+		flushing = true;
+		space_info->flush = 1;
 	}
 
 	spin_unlock(&space_info->lock);
 
-	if (!ret)
-		return 0;
-
-	if (!flush)
+	if (!ret || !flush)
 		goto out;
 
 	/*
@@ -3451,9 +3476,7 @@  again:
 	 * metadata until after the IO is completed.
 	 */
 	ret = shrink_delalloc(trans, root, num_bytes, 1);
-	if (ret > 0)
-		return 0;
-	else if (ret < 0)
+	if (ret < 0)
 		goto out;
 
 	/*
@@ -3466,11 +3489,11 @@  again:
 		goto again;
 	}
 
-	spin_lock(&space_info->lock);
 	/*
 	 * Not enough space to be reclaimed, don't bother committing the
 	 * transaction.
 	 */
+	spin_lock(&space_info->lock);
 	if (space_info->bytes_pinned < orig_bytes)
 		ret = -ENOSPC;
 	spin_unlock(&space_info->lock);
@@ -3493,12 +3516,12 @@  again:
 	}
 
 out:
-	if (reserved) {
+	if (flushing) {
 		spin_lock(&space_info->lock);
-		space_info->bytes_may_use -= orig_bytes;
+		space_info->flush = 0;
+		wake_up_all(&space_info->wait);
 		spin_unlock(&space_info->lock);
 	}
-
 	return ret;
 }