@@ -596,6 +596,8 @@ enum btrfs_alloc_type {
BTRFS_ALLOC_SEQ = 1,
};
+struct expire_work;
+
struct btrfs_block_group_cache {
struct btrfs_key key;
struct btrfs_block_group_item item;
@@ -721,6 +723,14 @@ struct btrfs_block_group_cache {
struct mutex submit_lock;
u64 submit_offset;
struct bio_list submit_buffer;
+ struct expire_work *expire_work;
+ int expired:1;
+};
+
+struct expire_work { /* timeout bookkeeping for one block group's submit buffer */
+ struct list_head list; /* entry in fs_info->expire_work_list */
+ struct delayed_work work; /* delayed work item; handler is expire_bios_fn() */
+ struct btrfs_block_group_cache *block_group; /* block group whose buffered bios this work expires */
};
/* delayed seq elem */
@@ -1194,6 +1204,9 @@ struct btrfs_fs_info {
spinlock_t ref_verify_lock;
struct rb_root block_tree;
#endif
+
+ struct list_head expire_work_list;
+ struct mutex expire_work_lock;
};
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
@@ -2717,6 +2717,8 @@ int open_ctree(struct super_block *sb,
INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
spin_lock_init(&fs_info->reada_lock);
btrfs_init_ref_verify(fs_info);
+ INIT_LIST_HEAD(&fs_info->expire_work_list);
+ mutex_init(&fs_info->expire_work_lock);
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
@@ -125,6 +125,7 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
WARN_ON(cache->pinned > 0);
WARN_ON(cache->reserved > 0);
WARN_ON(!bio_list_empty(&cache->submit_buffer));
+ WARN_ON(cache->expire_work);
/*
* If not empty, someone is still holding mutex of
@@ -10180,6 +10181,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
block_group->cached == BTRFS_CACHE_ERROR)
free_excluded_extents(block_group);
+ if (block_group->alloc_type == BTRFS_ALLOC_SEQ) {
+ mutex_lock(&block_group->submit_lock);
+ WARN_ON(!bio_list_empty(&block_group->submit_buffer));
+ WARN_ON(block_group->expire_work != NULL);
+ mutex_unlock(&block_group->submit_lock);
+ }
+
btrfs_remove_free_space_cache(block_group);
ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
ASSERT(list_empty(&block_group->dirty_list));
@@ -10513,6 +10521,7 @@ btrfs_get_block_group_alloc_offset(struct btrfs_block_group_cache *cache)
}
cache->submit_offset = logical + cache->alloc_offset;
+ cache->expired = 0;
out:
cache->alloc_type = alloc_type;
@@ -10565,6 +10574,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
cache->alloc_type = BTRFS_ALLOC_FIT;
cache->alloc_offset = 0;
+ cache->expire_work = NULL;
if (btrfs_fs_incompat(fs_info, HMZONED)) {
ret = btrfs_get_block_group_alloc_offset(cache);
@@ -11329,11 +11339,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
+ mutex_lock(&block_group->submit_lock);
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->pinned ||
btrfs_block_group_used(&block_group->item) ||
block_group->ro ||
- list_is_singular(&block_group->list)) {
+ list_is_singular(&block_group->list) ||
+ !bio_list_empty(&block_group->submit_buffer)) {
/*
* We want to bail if we made new allocations or have
* outstanding allocations in this block group. We do
@@ -11342,10 +11354,12 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/
trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
+ mutex_unlock(&block_group->submit_lock);
up_write(&space_info->groups_sem);
goto next;
}
spin_unlock(&block_group->lock);
+ mutex_unlock(&block_group->submit_lock);
/* We don't want to force the issue, only flip if it's ok. */
ret = inc_block_group_ro(block_group, 0);
@@ -154,6 +154,24 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
* completes. The next time when the filesystem is mounted writable
* again, the device replace operation continues.
*/
+
+ /* expire pending bios in submit buffer */
+ if (btrfs_fs_incompat(fs_info, HMZONED)) {
+ struct expire_work *work;
+ struct btrfs_block_group_cache *block_group;
+
+ mutex_lock(&fs_info->expire_work_lock);
+ list_for_each_entry(work, &fs_info->expire_work_list, list) {
+ block_group = work->block_group;
+ mutex_lock(&block_group->submit_lock);
+ if (block_group->expire_work)
+ mod_delayed_work(
+ system_unbound_wq,
+ &block_group->expire_work->work, 0);
+ mutex_unlock(&block_group->submit_lock);
+ };
+ mutex_unlock(&fs_info->expire_work_lock);
+ }
}
#ifdef CONFIG_PRINTK
@@ -6840,6 +6840,124 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
}
}
+/*
+ * Delayed-work handler that fails the bios still parked in a block group's
+ * submit buffer once the expire timeout fires.
+ *
+ * @work: the work_struct embedded in the delayed_work of a struct expire_work.
+ *
+ * The handler always unlinks and frees its own expire_work and drops the
+ * block group reference taken when the work was scheduled.  It only expires
+ * bios when it is still the block group's current expire_work, i.e. when
+ * cancel_expire_work() has not detached it in the meantime.
+ *
+ * Locking: fs_info->expire_work_lock and cache->submit_lock are taken in
+ * that order and both are released before btrfs_handle_fs_error() is
+ * called, because the error handler acquires expire_work_lock itself.
+ */
+static void expire_bios_fn(struct work_struct *work)
+{
+	struct expire_work *ework;
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_fs_info *fs_info;
+	struct bio *bio = NULL, *next;
+
+	ework = container_of(work, struct expire_work, work.work);
+	cache = ework->block_group;
+	fs_info = cache->fs_info;
+
+	mutex_lock(&fs_info->expire_work_lock);
+	mutex_lock(&cache->submit_lock);
+
+	/*
+	 * Operate on our own ework, never on cache->expire_work: a failed
+	 * cancel_expire_work() leaves that pointer NULL, and a later
+	 * schedule_expire_work() may have installed a different object.
+	 */
+	list_del(&ework->list);
+
+	if (cache->expire_work == ework) {
+		cache->expire_work = NULL;
+
+		if (btrfs_fs_closing(fs_info)) {
+			WARN_ON(!bio_list_empty(&cache->submit_buffer));
+		} else if (!bio_list_empty(&cache->submit_buffer)) {
+			/* Detach the whole buffer; it is failed below. */
+			bio = bio_list_get(&cache->submit_buffer);
+			cache->expired = 1;
+		}
+	}
+
+	/* Each lock is released exactly once, on every path. */
+	mutex_unlock(&cache->submit_lock);
+	mutex_unlock(&fs_info->expire_work_lock);
+	kfree(ework);
+
+	if (bio) {
+		btrfs_handle_fs_error(fs_info, -EIO,
+				      "bio submit buffer expired");
+		btrfs_err(fs_info, "block group %llu submit pos %llu",
+			  cache->key.objectid, cache->submit_offset);
+	}
+
+	while (bio) {
+		struct map_bio_data *map_private = bio->bi_private;
+
+		next = bio->bi_next;
+		bio->bi_next = NULL;
+		/* Restore the submitter's private data before completing. */
+		bio->bi_private = map_private->orig_bi_private;
+		kfree(map_private);
+
+		trace_btrfs_expire_bio(cache, bio);
+		bio->bi_status = BLK_STS_IOERR;
+		bio_endio(bio);
+
+		bio = next;
+	}
+
+	/* Pairs with btrfs_get_block_group() in schedule_expire_work(). */
+	btrfs_put_block_group(cache);
+}
+
+/*
+ * Arm, or re-arm, the expire timer of a block group's submit buffer.
+ *
+ * @cache: block group whose buffered bios should time out.
+ *
+ * If the block group already has an expire_work its delay is restarted.
+ * Otherwise a new one is allocated, linked into fs_info->expire_work_list,
+ * given a block group reference and queued on system_unbound_wq.
+ *
+ * Returns 0 on success or -ENOMEM when the expire_work cannot be allocated.
+ */
+static int schedule_expire_work(struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	const unsigned long timeout = 90 * HZ;
+	struct expire_work *ework;
+	int err = 0;
+
+	mutex_lock(&fs_info->expire_work_lock);
+	mutex_lock(&cache->submit_lock);
+
+	ework = cache->expire_work;
+	if (!ework) {
+		ework = kmalloc(sizeof(*ework), GFP_NOFS);
+		if (!ework) {
+			err = -ENOMEM;
+			goto unlock;
+		}
+
+		ework->block_group = cache;
+		INIT_LIST_HEAD(&ework->list);
+		INIT_DELAYED_WORK(&ework->work, expire_bios_fn);
+		cache->expire_work = ework;
+
+		list_add(&ework->list, &fs_info->expire_work_list);
+		/* Reference is dropped when the work runs or is cancelled. */
+		btrfs_get_block_group(cache);
+	}
+
+	/* Queue a fresh work or push back the deadline of an existing one. */
+	mod_delayed_work(system_unbound_wq, &ework->work, timeout);
+
+unlock:
+	mutex_unlock(&cache->submit_lock);
+	mutex_unlock(&fs_info->expire_work_lock);
+	return err;
+}
+
+/*
+ * Detach and cancel the pending expire timer of a block group, if any.
+ *
+ * @cache: block group whose expire_work should be cancelled.
+ *
+ * Returns true when there was nothing to cancel or the delayed work was
+ * cancelled (and its expire_work freed here).  Returns false when
+ * cancel_delayed_work() failed; in that case the expire_work stays on the
+ * list and is left for the expire worker to free.
+ */
+static bool cancel_expire_work(struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	struct expire_work *ework;
+	bool cancelled = true;
+
+	mutex_lock(&fs_info->expire_work_lock);
+	mutex_lock(&cache->submit_lock);
+
+	ework = cache->expire_work;
+	if (ework) {
+		cache->expire_work = NULL;
+
+		cancelled = cancel_delayed_work(&ework->work);
+		if (cancelled) {
+			/* We own the work now: unlink, free, drop its ref. */
+			list_del(&ework->list);
+			kfree(ework);
+			btrfs_put_block_group(cache);
+		}
+	}
+
+	mutex_unlock(&cache->submit_lock);
+	mutex_unlock(&fs_info->expire_work_lock);
+	return cancelled;
+}
static blk_status_t __btrfs_map_bio(struct btrfs_fs_info *fs_info,
struct bio *bio, int mirror_num,
@@ -6931,7 +7049,9 @@ static blk_status_t __btrfs_map_bio_zoned(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *cache = NULL;
struct map_bio_data *map_private;
int sent;
+ bool should_queue;
blk_status_t ret;
+ int ret2;
WARN_ON(bio_op(cur_bio) != REQ_OP_WRITE);
@@ -6944,8 +7064,20 @@ static blk_status_t __btrfs_map_bio_zoned(struct btrfs_fs_info *fs_info,
}
mutex_lock(&cache->submit_lock);
- if (cache->submit_offset == logical)
+
+ if (cache->expired) {
+ trace_btrfs_bio_in_expired_block_group(cache, cur_bio);
+ mutex_unlock(&cache->submit_lock);
+ btrfs_put_block_group(cache);
+ WARN_ON_ONCE(1);
+ return BLK_STS_IOERR;
+ }
+
+ if (cache->submit_offset == logical) {
+ mutex_unlock(&cache->submit_lock);
+ cancel_expire_work(cache);
goto send_bios;
+ }
if (cache->submit_offset > logical) {
trace_btrfs_bio_before_write_pointer(cache, cur_bio);
@@ -6968,13 +7100,18 @@ static blk_status_t __btrfs_map_bio_zoned(struct btrfs_fs_info *fs_info,
bio_list_add(&cache->submit_buffer, cur_bio);
mutex_unlock(&cache->submit_lock);
+
+ ret2 = schedule_expire_work(cache);
+ if (ret2) {
+ btrfs_put_block_group(cache);
+ return errno_to_blk_status(ret2);
+ }
btrfs_put_block_group(cache);
/* mimic a good result ... */
return BLK_STS_OK;
send_bios:
- mutex_unlock(&cache->submit_lock);
/* send this bio */
ret = __btrfs_map_bio(fs_info, cur_bio, mirror_num, 1, 1);
if (ret != BLK_STS_OK) {
@@ -7013,6 +7150,7 @@ static blk_status_t __btrfs_map_bio_zoned(struct btrfs_fs_info *fs_info,
bio = next;
}
} while (sent);
+ should_queue = !bio_list_empty(&cache->submit_buffer);
mutex_unlock(&cache->submit_lock);
/* send the collected bios */
@@ -7031,8 +7169,10 @@ static blk_status_t __btrfs_map_bio_zoned(struct btrfs_fs_info *fs_info,
if (length)
goto loop;
- btrfs_put_block_group(cache);
+ if (should_queue)
+ WARN_ON(schedule_expire_work(cache));
+ btrfs_put_block_group(cache);
return BLK_STS_OK;
}
@@ -2131,6 +2131,8 @@ DEFINE_EVENT(btrfs_hmzoned_bio_buffer_events, name, \
)
DEFINE_BTRFS_HMZONED_BIO_BUF_EVENT(btrfs_bio_before_write_pointer);
+DEFINE_BTRFS_HMZONED_BIO_BUF_EVENT(btrfs_expire_bio);
+DEFINE_BTRFS_HMZONED_BIO_BUF_EVENT(btrfs_bio_in_expired_block_group);
#endif /* _TRACE_BTRFS_H */
It is possible for bios to stall in the submit buffer because of a bug or a device problem. In such a situation, btrfs stops working while it waits for the buffered bios to complete. To avoid such a hang, add a worker that cancels the stalled bios after a timeout.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/ctree.h             |  13 ++++
 fs/btrfs/disk-io.c           |   2 +
 fs/btrfs/extent-tree.c       |  16 +++-
 fs/btrfs/super.c             |  18 +++++
 fs/btrfs/volumes.c           | 146 ++++++++++++++++++++++++++++++++++-
 include/trace/events/btrfs.h |   2 +
 6 files changed, 193 insertions(+), 4 deletions(-)