@@ -87,7 +87,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
else if (held_rw == BTRFS_READ_LOCK)
held_rw = BTRFS_READ_LOCK_BLOCKING;
}
- btrfs_set_path_blocking(p);
for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
if (p->nodes[i] && p->locks[i]) {
@@ -2536,8 +2535,16 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
if (*write_lock_level < level + 1) {
*write_lock_level = level + 1;
- btrfs_release_path(p);
- goto again;
+
+ ASSERT(p->locks[level] == BTRFS_WRITE_LOCK ||
+ p->locks[level] == BTRFS_READ_LOCK);
+
+ /* if it's not the root node or the lock is not WRITE_LOCK */
+ if ((level < BTRFS_MAX_LEVEL - 1 && p->nodes[level + 1]) ||
+ p->locks[level] != BTRFS_WRITE_LOCK) {
+ btrfs_release_path(p);
+ goto again;
+ }
}
btrfs_set_path_blocking(p);
@@ -2555,10 +2562,32 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
int sret;
+ if (btrfs_header_nritems(b) > BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
+ ret = 0;
+ goto done;
+ }
+
+ /*
+ * If this is a root node with more than one item, then don't
+ * balance at all since it's totally unnecessary.
+ */
+ if (level < BTRFS_MAX_LEVEL - 1 && !p->nodes[level + 1] &&
+ btrfs_header_nritems(b) != 1) {
+ ret = 0;
+ goto done;
+ }
+
if (*write_lock_level < level + 1) {
*write_lock_level = level + 1;
- btrfs_release_path(p);
- goto again;
+ ASSERT(p->locks[level] == BTRFS_WRITE_LOCK ||
+ p->locks[level] == BTRFS_READ_LOCK);
+
+ /* if it's not the root node or the lock is not WRITE_LOCK */
+ if ((level < BTRFS_MAX_LEVEL - 1 && p->nodes[level + 1]) ||
+ p->locks[level] != BTRFS_WRITE_LOCK) {
+ btrfs_release_path(p);
+ goto again;
+ }
}
btrfs_set_path_blocking(p);
@@ -2851,8 +2880,15 @@ cow_done:
if (slot == 0 && ins_len &&
write_lock_level < level + 1) {
write_lock_level = level + 1;
- btrfs_release_path(p);
- goto again;
+ ASSERT(p->locks[level] == BTRFS_WRITE_LOCK ||
+ p->locks[level] == BTRFS_READ_LOCK);
+
+ /* if it's not the root node or the lock is not WRITE_LOCK */
+ if ((level < BTRFS_MAX_LEVEL - 1 && p->nodes[level + 1]) ||
+ p->locks[level] != BTRFS_WRITE_LOCK) {
+ btrfs_release_path(p);
+ goto again;
+ }
}
unlock_up(p, level, lowest_unlock,
@@ -5130,6 +5166,8 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
int level;
int ret = 1;
int keep_locks = path->keep_locks;
+ u64 blocknr;
+ u64 blockgen;
path->keep_locks = 1;
again:
@@ -5197,16 +5235,31 @@ find_next_key:
ret = 0;
goto out;
}
- btrfs_set_path_blocking(path);
- cur = read_node_slot(root, cur, slot);
- BUG_ON(!cur); /* -ENOMEM */
- btrfs_tree_read_lock(cur);
+ unlock_up(path, level, 1, 0, NULL);
+
+ blocknr = btrfs_node_blockptr(cur, slot);
+ blockgen = btrfs_node_ptr_generation(cur, slot);
+ cur = btrfs_find_tree_block(root->fs_info, blocknr);
+ if (cur && btrfs_buffer_uptodate(cur, blockgen, 1) > 0) {
+ int tmp;
+ tmp = btrfs_tree_read_lock_atomic(cur);
+ if (!tmp) {
+ btrfs_set_path_blocking(path);
+ btrfs_tree_read_lock(cur);
+ btrfs_clear_path_blocking(path, cur, BTRFS_READ_LOCK);
+ }
+ } else {
+ btrfs_set_path_blocking(path);
+ cur = read_node_slot(root, cur, slot);
+ BUG_ON(!cur); /* -ENOMEM */
+
+ btrfs_tree_read_lock(cur);
+ btrfs_clear_path_blocking(path, cur, BTRFS_READ_LOCK);
+ }
path->locks[level - 1] = BTRFS_READ_LOCK;
path->nodes[level - 1] = cur;
- unlock_up(path, level, 1, 0, NULL);
- btrfs_clear_path_blocking(path, NULL, 0);
}
out:
path->keep_locks = keep_locks;
@@ -228,6 +228,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
key.objectid = dir;
key.type = BTRFS_DIR_ITEM_KEY;
@@ -704,6 +704,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
again:
next_offset = (u64)-1;
found_next = 0;
@@ -834,10 +835,8 @@ insert:
} else {
ins_size = csum_size;
}
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
ins_size);
- path->leave_spinning = 0;
if (ret < 0)
goto fail_unlock;
if (WARN_ON(ret != 0))
@@ -715,12 +715,15 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
int update_refs;
int found = 0;
int leafs_visited = 0;
+ int old_spinning = path->leave_spinning;
if (drop_cache)
btrfs_drop_extent_cache(inode, start, end - 1, 0);
- if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
+ if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) {
modify_tree = 0;
+ path->leave_spinning = 1;
+ }
update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
root == root->fs_info->tree_root);
@@ -809,6 +812,7 @@ next_slot:
search_start = max(key.offset, start);
if (recow || !modify_tree) {
modify_tree = -1;
+ path->leave_spinning = 0;
btrfs_release_path(path);
continue;
}
@@ -1011,6 +1015,7 @@ delete_extent_item:
btrfs_release_path(path);
if (drop_end)
*drop_end = found ? min(end, extent_end) : end;
+ path->leave_spinning = old_spinning;
return ret;
}
@@ -528,6 +528,8 @@ static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
+
search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
search_key.type = -1;
search_key.offset = (u64)-1;
@@ -5347,6 +5347,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
namelen, 0);
if (IS_ERR(di))
@@ -6027,6 +6028,8 @@ static int btrfs_set_inode_index_count(struct inode *inode)
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
+
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
@@ -33,6 +33,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
@@ -54,6 +55,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
@@ -147,6 +147,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
if (ret < 0) {
@@ -46,6 +46,7 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
/* lookup the xattr by name */
di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name,
strlen(name), 0);
@@ -105,6 +106,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
path->skip_release_on_error = 1;
+ path->leave_spinning = 1;
if (!value) {
di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
Kent Overstreet posted some dbench test numbers in the announcement of
bcachefs[1], in which btrfs's performance is much worse than that of ext4
and xfs, especially in the case of multiple threads.

This difference can be observed on fast storage. I ran 'dbench -t10 64'
with a 1.6T NVMe disk:

Processor: Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
Memory: 504G

I took some time to dig into it. perf shows that in the multi-threaded
case we spend most of the cpu cycles on the spin_lock_irqsave() and
spin_unlock_irqrestore() pair, which is called by wait_event() in btree
locking.

72.84%  72.84%  dbench  [kernel.vmlinux]  [k] native_queued_spin_lock_slowpath
         |
         ---native_queued_spin_lock_slowpath
            |
            |--71.64%-- _raw_spin_lock_irqsave
            |          |
            |          |--52.17%-- prepare_to_wait_event
            |          |          |
            |          |          |--94.33%-- btrfs_tree_lock
            |          |          |          |
            |          |          |          |--99.10%-- btrfs_lock_root_node
            |          |          |          |          btrfs_search_slot
            |          |          |          |          |
            |          |          |          |          |--26.44%-- btrfs_lookup_dir_item
            |          |          |          |          |          |
            |          |          |          |          |          |--99.31%-- __btrfs_unlink_inode

Serious contention on the btree lock can also be confirmed by another
fact: if you use a subvolume instead of a directory for each dbench
client, the multi-threaded numbers become considerably better. For 64
clients,

Throughput 5904.71 MB/sec  64 clients  64 procs  max_latency=816.715 ms

I did a few things to avoid waiting for blocking writers and readers:

1) Use path->leave_spinning=1 as much as possible; this leaves us
   holding the spinning lock after searching the btree (a minimal
   caller-side sketch appears after the diffstat below).

2) Find the cases where we don't have to take a blocking lock, for
   example, we don't need a blocking lock when the parent node has more
   than 1/4 of the items it can hold.

3) Avoid unnecessary "goto again", e.g. on the btree root level, just
   update write_lock_level if we already hold BTRFS_WRITE_LOCK.

4) Remove btrfs_set_path_blocking() in btrfs_clear_path_blocking();
   this contributes a large part of the improved numbers. The function
   was introduced to avoid a lockdep warning, but after I turned lockdep
   on, xfstests didn't report such a warning.

5) Make btrfs_search_forward() use a non-sleeping function to find the
   eb; this fixes a deadlock that shows up with the previous changes.

Here are the end results for 64 clients. Compared to vanilla 4.2, btrfs
runs 15x faster but with higher latency (w/o = without this patch,
w = with this patch):

                     tput(MB/sec)   max_latency(ms)
 xfs                    2742.93          21.855
 ext4                   7182.92          19.053
 btrfs+subvol w/o       5904.71         816.715
 btrfs+dir    w/o        122.778        718.674
*btrfs+dir    w         1715.77        1366.981

I've marked it as RFC since I'm not confident about the lockdep part.

Any comments are welcome!

[1]: https://lkml.org/lkml/2015/8/21/22

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/ctree.c     | 79 +++++++++++++++++++++++++++++++++++++++++++---------
 fs/btrfs/dir-item.c  |  1 +
 fs/btrfs/file-item.c |  3 +-
 fs/btrfs/file.c      |  7 ++++-
 fs/btrfs/inode-map.c |  2 ++
 fs/btrfs/inode.c     |  3 ++
 fs/btrfs/orphan.c    |  2 ++
 fs/btrfs/root-tree.c |  1 +
 fs/btrfs/xattr.c     |  2 ++
 9 files changed, 84 insertions(+), 16 deletions(-)
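
For reference, change 1) boils down to the caller-side pattern below. This is
a minimal sketch of a hypothetical read-only lookup (the function name and the
key handling are illustrative, not part of the patch); the real call sites are
the hunks above that set path->leave_spinning, e.g. btrfs_insert_orphan_item
and __btrfs_getxattr.

/*
 * Minimal sketch (hypothetical caller, not from the patch) of the
 * leave_spinning pattern: keep the spinning locks after the search
 * instead of letting btrfs_search_slot() convert them to blocking
 * locks, so short, non-sleeping lookups stay off the blocking-lock
 * wait queues.
 */
static int lookup_item_spinning(struct btrfs_root *root,
				struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* keep the spinning locks held when the search returns */
	path->leave_spinning = 1;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);

	/* ... examine path->nodes[0] / path->slots[0] without sleeping ... */

	btrfs_free_path(path);
	return ret;
}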