@@ -2695,24 +2695,52 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
+ struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_block_rsv *global_rsv;
- u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
+ u64 num_heads;
+ u64 num_entries;
u64 num_bytes;
int ret = 0;
- num_bytes = btrfs_calc_trans_metadata_size(root, 1);
- num_heads = heads_to_leaves(root, num_heads);
- if (num_heads > 1)
- num_bytes += (num_heads - 1) * root->leafsize;
- num_bytes <<= 1;
global_rsv = &root->fs_info->global_block_rsv;
- /*
- * If we can't allocate any more chunks lets make sure we have _lots_ of
- * wiggle room since running delayed refs can create more delayed refs.
- */
- if (global_rsv->space_info->full)
+ if (trans) {
+ num_heads = trans->transaction->delayed_refs.num_heads_ready;
+ num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ num_heads = heads_to_leaves(root, num_heads);
+ if (num_heads > 1)
+ num_bytes += (num_heads - 1) * root->leafsize;
num_bytes <<= 1;
+ /*
+ * If we can't allocate any more chunks lets make sure we have
+ * _lots_ of wiggle room since running delayed refs can create
+ * more delayed refs.
+ */
+ if (global_rsv->space_info->full)
+ num_bytes <<= 1;
+ } else {
+ if (root->fs_info->dedup_bs == 0)
+ return 0;
+
+ /* dedup enabled */
+ spin_lock(&root->fs_info->trans_lock);
+ if (!root->fs_info->running_transaction) {
+ spin_unlock(&root->fs_info->trans_lock);
+ return 0;
+ }
+
+ delayed_refs =
+ &root->fs_info->running_transaction->delayed_refs;
+
+ num_entries = atomic_read(&delayed_refs->num_entries);
+ num_heads = delayed_refs->num_heads;
+
+ spin_unlock(&root->fs_info->trans_lock);
+
+ /* The worst case */
+ num_bytes = (num_entries - num_heads) *
+ btrfs_calc_trans_metadata_size(root, 1);
+ }
spin_lock(&global_rsv->lock);
if (global_rsv->reserved <= num_bytes)
@@ -747,6 +747,12 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
&cur_trans->ordered_operations);
spin_unlock(&root->fs_info->ordered_root_lock);
+ if (cur_trans->blocked) {
+ cur_trans->blocked = 0;
+ if (waitqueue_active(&cur_trans->commit_wait))
+ wake_up(&cur_trans->commit_wait);
+ }
+
work = btrfs_alloc_delalloc_work(inode, wait, 1);
if (!work) {
spin_lock(&root->fs_info->ordered_root_lock);
@@ -215,6 +215,7 @@ loop:
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
cur_trans->aborted = 0;
+ cur_trans->blocked = 1;
spin_unlock(&fs_info->trans_lock);
return 0;
@@ -329,6 +330,27 @@ static void wait_current_trans(struct btrfs_root *root)
wait_event(root->fs_info->transaction_wait,
cur_trans->state >= TRANS_STATE_UNBLOCKED ||
cur_trans->aborted);
+
+ btrfs_put_transaction(cur_trans);
+ } else {
+ spin_unlock(&root->fs_info->trans_lock);
+ }
+}
+
+static noinline void wait_current_trans_for_commit(struct btrfs_root *root)
+{
+ struct btrfs_transaction *cur_trans;
+
+ spin_lock(&root->fs_info->trans_lock);
+ cur_trans = root->fs_info->running_transaction;
+ if (cur_trans && is_transaction_blocked(cur_trans)) {
+ atomic_inc(&cur_trans->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
+
+ wait_event(cur_trans->commit_wait,
+ cur_trans->state >= TRANS_STATE_COMPLETED ||
+ cur_trans->aborted || cur_trans->blocked == 0);
+
btrfs_put_transaction(cur_trans);
} else {
spin_unlock(&root->fs_info->trans_lock);
@@ -436,6 +458,25 @@ again:
if (may_wait_transaction(root, type))
wait_current_trans(root);
+ /*
+ * In the case of dedupe, we need to throttle delayed refs at the
+ * very start stage, otherwise we'd run into ENOSPC because more
+ * delayed refs are added while processing delayed refs.
+ */
+ if (root->fs_info->dedup_bs > 0 && type == TRANS_JOIN &&
+ btrfs_check_space_for_delayed_refs(NULL, root)) {
+ struct btrfs_transaction *cur_trans;
+
+ spin_lock(&root->fs_info->trans_lock);
+ cur_trans = root->fs_info->running_transaction;
+ if (cur_trans && cur_trans->state == TRANS_STATE_RUNNING)
+ cur_trans->state = TRANS_STATE_BLOCKED;
+ spin_unlock(&root->fs_info->trans_lock);
+
+ wake_up_process(root->fs_info->transaction_kthread);
+ wait_current_trans_for_commit(root);
+ }
+
do {
ret = join_transaction(root, type);
if (ret == -EBUSY) {
@@ -59,6 +59,7 @@ struct btrfs_transaction {
struct list_head pending_chunks;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
+ int blocked;
};
#define __TRANS_FREEZABLE (1U << 0)
In the case of dedupe, btrfs will produce a large number of delayed refs, and
processing them can very likely eat up all of the space reserved in
global_block_rsv, so we end up with transaction abortion due to ENOSPC.

I tried several different ways to reserve more space for global_block_rsv in
the hope that it would be enough for flushing the delayed refs, but I failed
and the code became very messy.

I found that under high delayed-ref pressure the throttling done in
end_transaction is of little use, since it does not block the insertion of
new delayed refs, so this patch moves the throttling to the very start stage,
i.e. start_transaction.

The throttle code accounts for the worst case, namely that every delayed ref
updates the btree.  When we reach the limit where they may use up all of the
reserved space of global_block_rsv, we kick transaction_kthread to commit the
transaction, which processes these delayed refs, refreshes global_block_rsv's
space and gets the pinned space back as well.  That way we get rid of the
annoying ENOSPC problem.

However, this leads to a new restriction: it cannot be used together with the
"flushoncommit" mount option, otherwise it can cause an ABBA deadlock between
commit_transaction and the flushing of ordered extents.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/extent-tree.c  | 50 ++++++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/ordered-data.c |  6 ++++++
 fs/btrfs/transaction.c  | 41 ++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/transaction.h  |  1 +
 4 files changed, 87 insertions(+), 11 deletions(-)
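
As an aside, the worst-case accounting used by the throttle can be illustrated
with a small userspace sketch (this is not kernel code; METADATA_UNIT_BYTES is
an assumed stand-in for btrfs_calc_trans_metadata_size(root, 1), and the
numbers in main() are made up): every delayed ref entry that is not a ref head
is charged one metadata reservation, and the throttle fires once that total
reaches what the global block reserve currently holds, matching the
"global_rsv->reserved <= num_bytes" check in the hunk above.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Stand-in for btrfs_calc_trans_metadata_size(root, 1); the value here is an
 * assumption chosen for illustration, not taken from the kernel.
 */
#define METADATA_UNIT_BYTES (3ULL * 2 * 16384)

/*
 * Worst-case estimate mirroring the "else" branch of
 * btrfs_check_space_for_delayed_refs(): assume every non-head delayed ref
 * entry updates the btree and costs one metadata reservation, and report
 * whether that would exhaust what the global block reserve holds.
 */
static bool delayed_refs_over_limit(uint64_t num_entries, uint64_t num_heads,
				    uint64_t global_rsv_reserved)
{
	uint64_t worst_case = (num_entries - num_heads) * METADATA_UNIT_BYTES;

	return global_rsv_reserved <= worst_case;
}

int main(void)
{
	/* e.g. 20000 delayed ref entries across 4000 heads vs. a 512MiB reserve */
	printf("throttle needed: %s\n",
	       delayed_refs_over_limit(20000, 4000, 512ULL << 20) ? "yes" : "no");
	return 0;
}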