diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -910,6 +910,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&cluster->refill_lock);
+ if (btrfs_fs_incompat(fs_info, HMZONED)) {
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg == block_group->start)
+ fs_info->treelog_bg = 0;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ }
+
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -907,6 +907,8 @@ struct btrfs_fs_info {
#endif
struct mutex hmzoned_meta_io_lock;
+ spinlock_t treelog_bg_lock;
+ u64 treelog_bg;
};
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1360,16 +1360,10 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
struct btrfs_root *log_root;
- int ret;
log_root = alloc_log_tree(trans, fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
- ret = btrfs_alloc_log_tree_node(trans, log_root);
- if (ret) {
- kfree(log_root);
- return ret;
- }
WARN_ON(fs_info->log_root_tree);
fs_info->log_root_tree = log_root;
return 0;
@@ -2841,6 +2835,8 @@ int __cold open_ctree(struct super_block *sb,
fs_info->send_in_progress = 0;
+ spin_lock_init(&fs_info->treelog_bg_lock);
+
ret = btrfs_alloc_stripe_hash_table(fs_info);
if (ret) {
err = ret;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3704,8 +3704,10 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg,
*/
static int find_free_extent_zoned(struct btrfs_block_group *cache,
- struct find_free_extent_ctl *ffe_ctl)
+ struct find_free_extent_ctl *ffe_ctl,
+ bool for_treelog)
{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_space_info *space_info = cache->space_info;
struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
u64 start = cache->start;
@@ -3718,12 +3720,26 @@ static int find_free_extent_zoned(struct btrfs_block_group *cache,
btrfs_hmzoned_data_io_lock(cache);
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
+ spin_lock(&fs_info->treelog_bg_lock);
+
+ ASSERT(!for_treelog || cache->start == fs_info->treelog_bg ||
+ fs_info->treelog_bg == 0);
if (cache->ro) {
ret = -EAGAIN;
goto out;
}
+ /*
+ * Do not allow a block group that is currently in use to become
+ * the dedicated tree-log block group.
+ */
+ if (for_treelog && !fs_info->treelog_bg &&
+ (cache->used || cache->reserved)) {
+ ret = 1;
+ goto out;
+ }
+
avail = cache->length - cache->alloc_offset;
if (avail < num_bytes) {
ffe_ctl->max_extent_size = avail;
@@ -3731,6 +3747,9 @@ static int find_free_extent_zoned(struct btrfs_block_group *cache,
goto out;
}
+ if (for_treelog && !fs_info->treelog_bg)
+ fs_info->treelog_bg = cache->start;
+
ffe_ctl->found_offset = start + cache->alloc_offset;
cache->alloc_offset += num_bytes;
spin_lock(&ctl->tree_lock);
@@ -3738,12 +3757,15 @@ static int find_free_extent_zoned(struct btrfs_block_group *cache,
spin_unlock(&ctl->tree_lock);
ASSERT(IS_ALIGNED(ffe_ctl->found_offset,
- cache->fs_info->stripesize));
+ fs_info->stripesize));
ffe_ctl->search_start = ffe_ctl->found_offset;
__btrfs_add_reserved_bytes(cache, ffe_ctl->ram_bytes, num_bytes,
ffe_ctl->delalloc);
out:
+ if (ret && for_treelog)
+ fs_info->treelog_bg = 0;
+ spin_unlock(&fs_info->treelog_bg_lock);
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
/* if succeeds, unlock after submit_bio */
@@ -3891,7 +3913,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
u64 ram_bytes, u64 num_bytes, u64 empty_size,
u64 hint_byte, struct btrfs_key *ins,
- u64 flags, int delalloc)
+ u64 flags, int delalloc, bool for_treelog)
{
int ret = 0;
struct btrfs_free_cluster *last_ptr = NULL;
@@ -3970,6 +3992,13 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
spin_unlock(&last_ptr->lock);
}
+ if (hmzoned && for_treelog) {
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg)
+ hint_byte = fs_info->treelog_bg;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ }
+
ffe_ctl.search_start = max(ffe_ctl.search_start,
first_logical_byte(fs_info, 0));
ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
@@ -4015,8 +4044,15 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
list_for_each_entry(block_group,
&space_info->block_groups[ffe_ctl.index], list) {
/* If the block group is read-only, we can skip it entirely. */
- if (unlikely(block_group->ro))
+ if (unlikely(block_group->ro)) {
+ if (hmzoned && for_treelog) {
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (block_group->start == fs_info->treelog_bg)
+ fs_info->treelog_bg = 0;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ }
continue;
+ }
btrfs_grab_block_group(block_group, delalloc);
ffe_ctl.search_start = block_group->start;
@@ -4062,7 +4098,25 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
goto loop;
if (hmzoned) {
- ret = find_free_extent_zoned(block_group, &ffe_ctl);
+ u64 bytenr = block_group->start;
+ u64 log_bytenr;
+ bool skip;
+
+ /*
+ * Do not allow non-tree-log blocks in the
+ * dedicated tree-log block group, and vice versa.
+ */
+ spin_lock(&fs_info->treelog_bg_lock);
+ log_bytenr = fs_info->treelog_bg;
+ skip = log_bytenr &&
+ ((for_treelog && bytenr != log_bytenr) ||
+ (!for_treelog && bytenr == log_bytenr));
+ spin_unlock(&fs_info->treelog_bg_lock);
+ if (skip)
+ goto loop;
+
+ ret = find_free_extent_zoned(block_group, &ffe_ctl,
+ for_treelog);
if (ret)
goto loop;
/*
@@ -4222,12 +4276,13 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
+ bool for_treelog = root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID;
flags = get_alloc_profile_by_root(root, is_data);
again:
WARN_ON(num_bytes < fs_info->sectorsize);
ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
- hint_byte, ins, flags, delalloc);
+ hint_byte, ins, flags, delalloc, for_treelog);
if (!ret && !is_data) {
btrfs_dec_block_group_reservations(fs_info, ins->objectid);
} else if (ret == -ENOSPC) {
@@ -4245,8 +4300,8 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
sinfo = btrfs_find_space_info(fs_info, flags);
btrfs_err(fs_info,
- "allocation failed flags %llu, wanted %llu",
- flags, num_bytes);
+ "allocation failed flags %llu, wanted %llu treelog %d",
+ flags, num_bytes, for_treelog);
if (sinfo)
btrfs_dump_space_info(fs_info, sinfo,
num_bytes, 1);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,7 @@
#include "compression.h"
#include "qgroup.h"
#include "inode-map.h"
+#include "hmzoned.h"
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -105,6 +106,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
u64 dirid, int del_all);
+static void wait_log_commit(struct btrfs_root *root, int transid);
/*
* tree logging is a special write ahead log used to make sure that
@@ -139,16 +141,25 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ bool hmzoned = btrfs_fs_incompat(fs_info, HMZONED);
int ret = 0;
mutex_lock(&root->log_mutex);
+again:
if (root->log_root) {
+ int index = (root->log_transid + 1) % 2;
+
if (btrfs_need_log_full_commit(trans)) {
ret = -EAGAIN;
goto out;
}
+ if (hmzoned && atomic_read(&root->log_commit[index])) {
+ wait_log_commit(root, root->log_transid - 1);
+ goto again;
+ }
+
if (!root->log_start_pid) {
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
@@ -157,8 +168,13 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
}
} else {
mutex_lock(&fs_info->tree_log_mutex);
- if (!fs_info->log_root_tree)
+ if (hmzoned && fs_info->log_root_tree) {
+ ret = -EAGAIN;
+ mutex_unlock(&fs_info->tree_log_mutex);
+ goto out;
+ } else if (!fs_info->log_root_tree) {
ret = btrfs_init_log_root_tree(trans, fs_info);
+ }
mutex_unlock(&fs_info->tree_log_mutex);
if (ret)
goto out;
@@ -191,11 +207,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
*/
static int join_running_log_trans(struct btrfs_root *root)
{
+ bool hmzoned = btrfs_fs_incompat(root->fs_info, HMZONED);
int ret = -ENOENT;
mutex_lock(&root->log_mutex);
+again:
if (root->log_root) {
+ int index = (root->log_transid + 1) % 2;
+
ret = 0;
+ if (hmzoned && atomic_read(&root->log_commit[index])) {
+ wait_log_commit(root, root->log_transid - 1);
+ goto again;
+ }
atomic_inc(&root->log_writers);
}
mutex_unlock(&root->log_mutex);
@@ -2724,6 +2748,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
+ btrfs_redirty_list_add(
+ trans->transaction, next);
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
@@ -3128,6 +3154,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&log_root_tree->log_mutex);
+ mutex_lock(&fs_info->tree_log_mutex);
+ if (!log_root_tree->node)
+ btrfs_alloc_log_tree_node(trans, log_root_tree);
+ mutex_unlock(&fs_info->tree_log_mutex);
+
/*
* Now we are safe to update the log_root_tree because we're under the
* log_mutex, and we're a current writer so we're holding the commit
@@ -3285,16 +3316,20 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
.process_func = process_one_buffer
};
- ret = walk_log_tree(trans, log, &wc);
- if (ret) {
- if (trans)
- btrfs_abort_transaction(trans, ret);
- else
- btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ if (log->node) {
+ ret = walk_log_tree(trans, log, &wc);
+ if (ret) {
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ }
}
clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
+ if (trans && log->node)
+ btrfs_redirty_list_add(trans->transaction, log->node);
free_extent_buffer(log->node);
kfree(log);
}

The tree-log feature does not work in HMZONED mode as is. Blocks for a
tree-log tree are allocated mixed with other metadata blocks, and btrfs
writes and syncs the tree-log blocks to devices at fsync() time, which
happens at a different time than a global transaction commit. As a
result, both writing tree-log blocks and writing other metadata blocks
become non-sequential writes, which HMZONED mode must avoid.

Also, since more than one log transaction can be started per subvolume
at the same time, nodes from multiple transactions can be allocated
interleaved. Such mixed allocation results in non-sequential writes at
log transaction commit time. The nodes of the global log root tree
(fs_info->log_root_tree) have the same mixed allocation problem.

This patch assigns a dedicated block group for tree-log blocks to
separate the two metadata writing streams (tree-log blocks and other
metadata blocks). As a result, each write stream can be written to
devices separately. "fs_info->treelog_bg" tracks the dedicated block
group, and btrfs assigns "treelog_bg" on demand at tree-log block
allocation time.

This patch then serializes log transactions by waiting for a committing
transaction when someone tries to start a new one, to avoid the mixed
allocation problem. We must also wait for running log transactions of
other subvolumes, but there is no easy way to detect which subvolume
root is running a log transaction. So, this patch forbids starting a
new log transaction when the global log root tree is already allocated
by another subvolume.

Furthermore, this patch aligns the allocation order of the nodes of
"fs_info->log_root_tree" and the nodes of "root->log_root" with the
order in which the nodes are written, by delaying the allocation of the
root node of "fs_info->log_root_tree" until btrfs_sync_log(), so that
the node buffers can go out to devices sequentially.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/block-group.c |  7 +++++
 fs/btrfs/ctree.h       |  2 ++
 fs/btrfs/disk-io.c     |  8 ++---
 fs/btrfs/extent-tree.c | 71 +++++++++++++++++++++++++++++++++++++-----
 fs/btrfs/tree-log.c    | 49 ++++++++++++++++++++++++-----
 5 files changed, 116 insertions(+), 21 deletions(-)
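
As a reviewer aid, the block-group skip condition that find_free_extent()
gains above reduces to a single predicate. Below is a stand-alone
user-space sketch of that predicate; the treelog_bg_lock and the ffe_ctl
plumbing are omitted, and may_use_block_group() is an illustrative name,
not a btrfs function:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* 0 means no dedicated tree-log block group has been picked yet. */
static uint64_t treelog_bg;

/*
 * Mirrors the skip logic: once a dedicated block group exists, tree-log
 * allocations must land in it, and every other allocation must stay out.
 */
static bool may_use_block_group(uint64_t bytenr, bool for_treelog)
{
	if (treelog_bg &&
	    ((for_treelog && bytenr != treelog_bg) ||
	     (!for_treelog && bytenr == treelog_bg)))
		return false;
	return true;
}

int main(void)
{
	treelog_bg = 0x10000000;
	printf("%d\n", may_use_block_group(0x10000000, true));  /* 1: tree-log in its block group */
	printf("%d\n", may_use_block_group(0x20000000, true));  /* 0: tree-log elsewhere */
	printf("%d\n", may_use_block_group(0x10000000, false)); /* 0: other metadata in the log block group */
	printf("%d\n", may_use_block_group(0x20000000, false)); /* 1: other metadata elsewhere */
	return 0;
}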
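
The serialization added to start_log_trans() and join_running_log_trans()
is a wait-and-recheck loop: a writer that finds the previous log
transaction still committing waits for it and then re-evaluates under the
mutex. Here is a user-space analog using a condition variable, with
purely illustrative names (the kernel code uses root->log_mutex and
wait_log_commit() instead):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t log_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t log_commit_wait = PTHREAD_COND_INITIALIZER;
static bool log_committing = true; /* a previous log commit is in flight */

/* Analog of the HMZONED path: do not join until the previous log
 * transaction has fully committed. */
static void start_log_trans_sketch(void)
{
	pthread_mutex_lock(&log_mutex);
	while (log_committing) /* the "goto again" retry in the patch */
		pthread_cond_wait(&log_commit_wait, &log_mutex);
	printf("joined a fresh log transaction\n");
	pthread_mutex_unlock(&log_mutex);
}

/* Analog of the waking side of wait_log_commit(). */
static void *commit_thread(void *arg)
{
	(void)arg;
	sleep(1); /* pretend the log commit takes a while */
	pthread_mutex_lock(&log_mutex);
	log_committing = false;
	pthread_cond_broadcast(&log_commit_wait);
	pthread_mutex_unlock(&log_mutex);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, commit_thread, NULL);
	start_log_trans_sketch(); /* blocks until commit_thread wakes us */
	pthread_join(t, NULL);
	return 0;
}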
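
Finally, the reason for delaying the allocation of the log root tree's
node until btrfs_sync_log() is that a sequential-required zone accepts
writes only at its write pointer, so allocation order must match write
order. A toy model of that constraint, for illustration only (not btrfs
code):

#include <stdbool.h>
#include <stdio.h>

struct zone {
	unsigned long wp; /* write pointer: the only block writable next */
};

static bool zone_write(struct zone *z, unsigned long block)
{
	if (block != z->wp)
		return false; /* out-of-order write: rejected on a sequential zone */
	z->wp++;
	return true;
}

int main(void)
{
	struct zone z = { .wp = 0 };

	/* Old ordering: the log root tree node is allocated first (block 0)
	 * but written last, after a subvolume log node (block 1), so the
	 * log node's write arrives ahead of the write pointer. */
	printf("old: write block 1 -> %s\n", zone_write(&z, 1) ? "ok" : "REJECTED");

	/* New ordering: the root node is allocated right before it is
	 * written, so writes arrive in allocation order. */
	z.wp = 0;
	printf("new: write block 0 -> %s\n", zone_write(&z, 0) ? "ok" : "REJECTED");
	printf("new: write block 1 -> %s\n", zone_write(&z, 1) ? "ok" : "REJECTED");
	return 0;
}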
The tree-log feature does not work on HMZONED mode as is. Blocks for a tree-log tree are allocated mixed with other metadata blocks, and btrfs writes and syncs the tree-log blocks to devices at the time of fsync(), which is different timing than a global transaction commit. As a result, both writing tree-log blocks and writing other metadata blocks become non-sequential writes which HMZONED mode must avoid. Also, since we can start more than one log transactions per subvolume at the same time, nodes from multiple transactions can be allocated interleaved. Such mixed allocation results in non-sequential writes at the time of log transaction commit. The nodes of the global log root tree (fs_info->log_root_tree), also have the same mixed allocation problem. This patch assigns a dedicated block group for tree-log blocks to separate two metadata writing streams (for tree-log blocks and other metadata blocks). As a result, each write stream can now be written to devices separately. "fs_info->treelog_bg" tracks the dedicated block group and btrfs assign "treelog_bg" on-demand on tree-log block allocation time. Then, this patch serializes log transactions by waiting for a committing transaction when someone tries to start a new transaction, to avoid the mixed allocation problem. We must also wait for running log transactions from another subvolume, but there is no easy way to detect which subvolume root is running a log transaction. So, this patch forbids starting a new log transaction when the global log root tree is already allocated by other subvolumes. Furthermore, this patch aligns the allocation order of nodes of "fs_info->log_root_tree" and nodes of "root->log_root" with the writing order of the nodes, by delaying allocation of the root node of "fs_info->log_root_tree," so that, the node buffers can go out sequentially to devices. Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> --- fs/btrfs/block-group.c | 7 +++++ fs/btrfs/ctree.h | 2 ++ fs/btrfs/disk-io.c | 8 ++--- fs/btrfs/extent-tree.c | 71 +++++++++++++++++++++++++++++++++++++----- fs/btrfs/tree-log.c | 49 ++++++++++++++++++++++++----- 5 files changed, 116 insertions(+), 21 deletions(-)