From patchwork Thu Mar 24 17:43:08 2011
X-Patchwork-Submitter: Tejun Heo
X-Patchwork-Id: 659581
From: Tejun Heo
To: chris.mason@oracle.com, peterz@infradead.org, mingo@redhat.com,
	linux-btrfs@vger.kernel.org, linux-kernel@vger.kernel.org
Cc: Tejun Heo
Subject: [PATCH 3/3] btrfs: Simplify extent_buffer locking
Date: Thu, 24 Mar 2011 18:43:08 +0100
Message-Id: <1300988588-13986-4-git-send-email-tj@kernel.org>
X-Mailer: git-send-email 1.7.1
In-Reply-To: <1300988588-13986-1-git-send-email-tj@kernel.org>
References: <1300988588-13986-1-git-send-email-tj@kernel.org>
X-Mailing-List: linux-btrfs@vger.kernel.org

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 31610ea..8688f47 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -5,6 +5,6 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
-	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
+	   extent_io.o volumes.o async-thread.o ioctl.o orphan.o \
 	   export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b5baff0..bc1627d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1074,7 +1074,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	left = read_node_slot(root, parent, pslot - 1);
 	if (left) {
-		btrfs_tree_lock(left);
+		btrfs_tree_lock_nested(left, 1);
 		btrfs_set_lock_blocking(left);
 		wret = btrfs_cow_block(trans, root, left,
 				       parent, pslot - 1, &left);
@@ -1085,7 +1085,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	}
 	right = read_node_slot(root, parent, pslot + 1);
 	if (right) {
-		btrfs_tree_lock(right);
+		btrfs_tree_lock_nested(right, 2);
 		btrfs_set_lock_blocking(right);
 		wret = btrfs_cow_block(trans, root, right,
 				       parent, pslot + 1, &right);
@@ -1241,7 +1241,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	if (left) {
 		u32 left_nr;
-		btrfs_tree_lock(left);
+		btrfs_tree_lock_nested(left, 1);
 		btrfs_set_lock_blocking(left);
 
 		left_nr = btrfs_header_nritems(left);
@@ -1291,7 +1291,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	if (right) {
 		u32 right_nr;
-		btrfs_tree_lock(right);
+		btrfs_tree_lock_nested(right, 1);
 		btrfs_set_lock_blocking(right);
 
 		right_nr = btrfs_header_nritems(right);
@@ -2519,7 +2519,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (right == NULL)
 		return 1;
 
-	btrfs_tree_lock(right);
+	btrfs_tree_lock_nested(right, 1);
 	btrfs_set_lock_blocking(right);
 
 	free_space = btrfs_leaf_free_space(root, right);
@@ -2772,7 +2772,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (left == NULL)
 		return 1;
 
-	btrfs_tree_lock(left);
+	btrfs_tree_lock_nested(left, 1);
 	btrfs_set_lock_blocking(left);
 
 	free_space = btrfs_leaf_free_space(root, left);
@@ -4420,7 +4420,7 @@ again:
 		ret = btrfs_try_spin_lock(next);
 		if (!ret) {
 			btrfs_set_path_blocking(path);
-			btrfs_tree_lock(next);
+			btrfs_tree_lock_nested(next, 1);
 			if (!force_blocking)
 				btrfs_clear_path_blocking(path, next);
 		}
@@ -4460,7 +4460,7 @@ again:
 		ret = btrfs_try_spin_lock(next);
 		if (!ret) {
 			btrfs_set_path_blocking(path);
-			btrfs_tree_lock(next);
+			btrfs_tree_lock_nested(next, 1);
 			if (!force_blocking)
 				btrfs_clear_path_blocking(path, next);
 		}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9fcb5ed..0b96a0a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3171,8 +3171,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 		return NULL;
 	eb->start = start;
 	eb->len = len;
-	spin_lock_init(&eb->lock);
-	init_waitqueue_head(&eb->lock_wq);
+	mutex_init(&eb->lock);
 	INIT_RCU_HEAD(&eb->rcu_head);
 #if LEAK_DEBUG
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 9318dfe..c70c8c9 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -2,6 +2,7 @@
 #define __EXTENTIO__
 
 #include <linux/rbtree.h>
+#include <linux/mutex.h>
 
 /* bits for the extent state */
 #define EXTENT_DIRTY 1
@@ -29,7 +30,6 @@
 /* these are bit numbers for test/set bit */
 #define EXTENT_BUFFER_UPTODATE 0
-#define EXTENT_BUFFER_BLOCKING 1
 #define EXTENT_BUFFER_DIRTY 2
 
 /* these are flags for extent_clear_unlock_delalloc */
@@ -129,14 +129,8 @@ struct extent_buffer {
 	struct list_head leak_list;
 	struct rcu_head rcu_head;
 
-	/* the spinlock is used to protect most operations */
-	spinlock_t lock;
-
-	/*
-	 * when we keep the lock held while blocking, waiters go onto
-	 * the wq
-	 */
-	wait_queue_head_t lock_wq;
+	/* used to protect most operations */
+	struct mutex lock;
 };
 
 static inline void extent_set_compress_type(unsigned long *bio_flags,
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
deleted file mode 100644
index 6151f2e..0000000
--- a/fs/btrfs/locking.c
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (C) 2008 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-#include
-#include
-#include
-#include
-#include
-#include "ctree.h"
-#include "extent_io.h"
-#include "locking.h"
-
-static inline void spin_nested(struct extent_buffer *eb)
-{
-	spin_lock(&eb->lock);
-}
-
-/*
- * Setting a lock to blocking will drop the spinlock and set the
- * flag that forces other procs who want the lock to wait.  After
- * this you can safely schedule with the lock held.
- */
-void btrfs_set_lock_blocking(struct extent_buffer *eb)
-{
-	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
-		set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
-		spin_unlock(&eb->lock);
-	}
-	/* exit with the spin lock released and the bit set */
-}
-
-/*
- * clearing the blocking flag will take the spinlock again.
- * After this you can't safely schedule
- */
-void btrfs_clear_lock_blocking(struct extent_buffer *eb)
-{
-	if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
-		spin_nested(eb);
-		clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
-		smp_mb__after_clear_bit();
-	}
-	/* exit with the spin lock held */
-}
-
-/*
- * unfortunately, many of the places that currently set a lock to blocking
- * don't end up blocking for very long, and often they don't block
- * at all.  For a dbench 50 run, if we don't spin on the blocking bit
- * at all, the context switch rate can jump up to 400,000/sec or more.
- *
- * So, we're still stuck with this crummy spin on the blocking bit,
- * at least until the most common causes of the short blocks
- * can be dealt with.
- */
-static int btrfs_spin_on_block(struct extent_buffer *eb)
-{
-	int i;
-
-	for (i = 0; i < 512; i++) {
-		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-			return 1;
-		if (need_resched())
-			break;
-		cpu_relax();
-	}
-	return 0;
-}
-
-/*
- * This is somewhat different from trylock.  It will take the
- * spinlock but if it finds the lock is set to blocking, it will
- * return without the lock held.
- *
- * returns 1 if it was able to take the lock and zero otherwise
- *
- * After this call, scheduling is not safe without first calling
- * btrfs_set_lock_blocking()
- */
-int btrfs_try_spin_lock(struct extent_buffer *eb)
-{
-	int i;
-
-	if (btrfs_spin_on_block(eb)) {
-		spin_nested(eb);
-		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-			return 1;
-		spin_unlock(&eb->lock);
-	}
-	/* spin for a bit on the BLOCKING flag */
-	for (i = 0; i < 2; i++) {
-		cpu_relax();
-		if (!btrfs_spin_on_block(eb))
-			break;
-
-		spin_nested(eb);
-		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-			return 1;
-		spin_unlock(&eb->lock);
-	}
-	return 0;
-}
-
-/*
- * the autoremove wake function will return 0 if it tried to wake up
- * a process that was already awake, which means that process won't
- * count as an exclusive wakeup.  The waitq code will continue waking
- * procs until it finds one that was actually sleeping.
- *
- * For btrfs, this isn't quite what we want.  We want a single proc
- * to be notified that the lock is ready for taking.  If that proc
- * already happens to be awake, great, it will loop around and try for
- * the lock.
- *
- * So, btrfs_wake_function always returns 1, even when the proc that we
- * tried to wake up was already awake.
- */
-static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
-			       int sync, void *key)
-{
-	autoremove_wake_function(wait, mode, sync, key);
-	return 1;
-}
-
-/*
- * returns with the extent buffer spinlocked.
- *
- * This will spin and/or wait as required to take the lock, and then
- * return with the spinlock held.
- *
- * After this call, scheduling is not safe without first calling
- * btrfs_set_lock_blocking()
- */
-int btrfs_tree_lock(struct extent_buffer *eb)
-{
-	DEFINE_WAIT(wait);
-	wait.func = btrfs_wake_function;
-
-	if (!btrfs_spin_on_block(eb))
-		goto sleep;
-
-	while(1) {
-		spin_nested(eb);
-
-		/* nobody is blocking, exit with the spinlock held */
-		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-			return 0;
-
-		/*
-		 * we have the spinlock, but the real owner is blocking.
-		 * wait for them
-		 */
-		spin_unlock(&eb->lock);
-
-		/*
-		 * spin for a bit, and if the blocking flag goes away,
-		 * loop around
-		 */
-		cpu_relax();
-		if (btrfs_spin_on_block(eb))
-			continue;
-sleep:
-		prepare_to_wait_exclusive(&eb->lock_wq, &wait,
-					  TASK_UNINTERRUPTIBLE);
-
-		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-			schedule();
-
-		finish_wait(&eb->lock_wq, &wait);
-	}
-	return 0;
-}
-
-/*
- * Very quick trylock, this does not spin or schedule.  It returns
- * 1 with the spinlock held if it was able to take the lock, or it
- * returns zero if it was unable to take the lock.
- *
- * After this call, scheduling is not safe without first calling
- * btrfs_set_lock_blocking()
- */
-int btrfs_try_tree_lock(struct extent_buffer *eb)
-{
-	if (spin_trylock(&eb->lock)) {
-		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
-			/*
-			 * we've got the spinlock, but the real owner is
-			 * blocking.  Drop the spinlock and return failure
-			 */
-			spin_unlock(&eb->lock);
-			return 0;
-		}
-		return 1;
-	}
-	/* someone else has the spinlock, give up */
-	return 0;
-}
-
-int btrfs_tree_unlock(struct extent_buffer *eb)
-{
-	/*
-	 * if we were a blocking owner, we don't have the spinlock held,
-	 * just clear the bit and look for waiters
-	 */
-	if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-		smp_mb__after_clear_bit();
-	else
-		spin_unlock(&eb->lock);
-
-	if (waitqueue_active(&eb->lock_wq))
-		wake_up(&eb->lock_wq);
-	return 0;
-}
-
-void btrfs_assert_tree_locked(struct extent_buffer *eb)
-{
-	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-		assert_spin_locked(&eb->lock);
-}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 6c4ce45..1bb1fee 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -19,13 +19,62 @@
 #ifndef __BTRFS_LOCKING_
 #define __BTRFS_LOCKING_
 
-int btrfs_tree_lock(struct extent_buffer *eb);
-int btrfs_tree_unlock(struct extent_buffer *eb);
+#include <linux/mutex.h>
 
-int btrfs_try_tree_lock(struct extent_buffer *eb);
-int btrfs_try_spin_lock(struct extent_buffer *eb);
+/*
+ * Lock/unlock wrappers for eb->lock.
+ *
+ * Lock-nesting ebs at the same level should always be done with the
+ * immediate parent locked to avoid deadlocks, using subclasses 1 and 2.
+ */
+static inline void btrfs_tree_lock_nested(struct extent_buffer *eb,
+					   unsigned int subclass)
+{
+	mutex_lock_nested(&eb->lock, subclass);
+}
+
+static inline void btrfs_tree_lock(struct extent_buffer *eb)
+{
+	mutex_lock(&eb->lock);
+}
+
+static inline void btrfs_tree_unlock(struct extent_buffer *eb)
+{
+	mutex_unlock(&eb->lock);
+}
+
+/*
+ * Trylock wrappers for eb->lock.
+ *
+ * In the previous custom locking, btrfs_try_tree_lock() tried to acquire
+ * the lock only once while btrfs_try_spin_lock() performed optimistic
+ * spinning before giving up.  With a standard mutex, the distinction
+ * doesn't exist.
+ */
+static inline int btrfs_try_tree_lock(struct extent_buffer *eb)
+{
+	return mutex_trylock(&eb->lock);
+}
+
+static inline bool btrfs_try_spin_lock(struct extent_buffer *eb)
+{
+	return mutex_trylock(&eb->lock);
+}
+
+/*
+ * In the previous custom locking, the lock owner needed to explicitly
+ * manage the blocking state of the locked ebs.  Unless blocking was
+ * explicitly set, the lock owner wasn't allowed to sleep.  With a
+ * standard mutex, the distinction doesn't exist.
+ *
+ * The blocking management functions are scheduled to be removed.  Don't
+ * use them in new code.
+ */
+static inline void btrfs_set_lock_blocking(struct extent_buffer *eb) { }
+static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb) { }
+
+static inline void btrfs_assert_tree_locked(struct extent_buffer *eb)
+{
+	BUG_ON(!mutex_is_locked(&eb->lock));
+}
 
-void btrfs_set_lock_blocking(struct extent_buffer *eb);
-void btrfs_clear_lock_blocking(struct extent_buffer *eb);
-void btrfs_assert_tree_locked(struct extent_buffer *eb);
-#endif
+#endif /* __BTRFS_LOCKING_ */
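
For reference only (not part of the patch): a minimal sketch of how the new
nested wrappers are meant to be used when two siblings are locked under an
already-locked parent, mirroring the balance_level() hunk above.  The
function name lock_siblings_example() is made up for illustration; error
handling and the surrounding transaction context are omitted.

static void lock_siblings_example(struct extent_buffer *parent,
				  struct extent_buffer *left,
				  struct extent_buffer *right)
{
	/* the parent is taken first, with the default lockdep subclass 0 */
	btrfs_tree_lock(parent);

	/* siblings at the same level get distinct subclasses 1 and 2 */
	if (left)
		btrfs_tree_lock_nested(left, 1);
	if (right)
		btrfs_tree_lock_nested(right, 2);

	/* ... modify/rebalance the three nodes ... */

	if (right)
		btrfs_tree_unlock(right);
	if (left)
		btrfs_tree_unlock(left);
	btrfs_tree_unlock(parent);
}

The distinct subclasses only tell lockdep that taking two eb->lock mutexes
of the same class is intentional here; correctness still relies on the
parent being held while the siblings are acquired, as the locking.h comment
states.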