diff mbox series

btrfs: zoned: use dedicated lock for data relocation

Message ID 1ad4d3f6ed32ab2d3352adb6da7ba4ff049e79a0.1650257630.git.naohiro.aota@wdc.com (mailing list archive)
State New, archived
Headers show
Series btrfs: zoned: use dedicated lock for data relocation | expand

Commit Message

Naohiro Aota April 18, 2022, 5:50 a.m. UTC
Currently, we use btrfs_inode_{lock,unlock}() to grant an exclusive
writeback of the relocation data inode in
btrfs_zoned_data_reloc_{lock,unlock}(). However, that can cause a deadlock
in the following path.

Thread A takes btrfs_inode_lock() and waits for metadata reservation by
e.g, waiting for writeback:

prealloc_file_extent_cluster()
  - btrfs_inode_lock(&inode->vfs_inode, 0);
  - btrfs_prealloc_file_range()
  ...
    - btrfs_replace_file_extents()
      - btrfs_start_transaction
      ...
        - btrfs_reserve_metadata_bytes()

Thread B (e.g, doing a writeback work) needs to wait for the inode lock to
continue writeback process:

do_writepages
  - btrfs_writepages
    - extent_writpages
      - btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
        - btrfs_inode_lock()

The deadlock is caused by relying on the vfs_inode's lock. By using it, we
introduced unnecessary exclusion of writeback and
btrfs_prealloc_file_range(). Also, the lock at this point is useless as we
don't have any dirty pages in the inode yet.

Introduce fs_info->zoned_data_reloc_io_lock and use it for the exclusive
writeback.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/ctree.h   | 1 +
 fs/btrfs/disk-io.c | 1 +
 fs/btrfs/zoned.h   | 4 ++--
 3 files changed, 4 insertions(+), 2 deletions(-)

Comments

Damien Le Moal April 18, 2022, 7:03 a.m. UTC | #1
On 4/18/22 14:50, Naohiro Aota wrote:
> Currently, we use btrfs_inode_{lock,unlock}() to grant an exclusive
> writeback of the relocation data inode in
> btrfs_zoned_data_reloc_{lock,unlock}(). However, that can cause a deadlock
> in the following path.
> 
> Thread A takes btrfs_inode_lock() and waits for metadata reservation by
> e.g, waiting for writeback:
> 
> prealloc_file_extent_cluster()
>   - btrfs_inode_lock(&inode->vfs_inode, 0);
>   - btrfs_prealloc_file_range()
>   ...
>     - btrfs_replace_file_extents()
>       - btrfs_start_transaction
>       ...
>         - btrfs_reserve_metadata_bytes()
> 
> Thread B (e.g, doing a writeback work) needs to wait for the inode lock to
> continue writeback process:
> 
> do_writepages
>   - btrfs_writepages
>     - extent_writpages
>       - btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
>         - btrfs_inode_lock()
> 
> The deadlock is caused by relying on the vfs_inode's lock. By using it, we
> introduced unnecessary exclusion of writeback and
> btrfs_prealloc_file_range(). Also, the lock at this point is useless as we
> don't have any dirty pages in the inode yet.
> 
> Introduce fs_info->zoned_data_reloc_io_lock and use it for the exclusive
> writeback.
> 
> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>

Fixes tag ?
Cc stable ?

Since this is fixing a deadlock, these tags are necessary, no ?

> ---
>  fs/btrfs/ctree.h   | 1 +
>  fs/btrfs/disk-io.c | 1 +
>  fs/btrfs/zoned.h   | 4 ++--
>  3 files changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 55dee124ee44..580a392d7c37 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1056,6 +1056,7 @@ struct btrfs_fs_info {
>  	 */
>  	spinlock_t relocation_bg_lock;
>  	u64 data_reloc_bg;
> +	struct mutex zoned_data_reloc_io_lock;
>  
>  	u64 nr_global_roots;
>  
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 2689e8589627..2a0284c2430e 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3179,6 +3179,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
>  	mutex_init(&fs_info->reloc_mutex);
>  	mutex_init(&fs_info->delalloc_root_mutex);
>  	mutex_init(&fs_info->zoned_meta_io_lock);
> +	mutex_init(&fs_info->zoned_data_reloc_io_lock);
>  	seqlock_init(&fs_info->profiles_lock);
>  
>  	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
> diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
> index fc2034e66ce3..de923fc8449d 100644
> --- a/fs/btrfs/zoned.h
> +++ b/fs/btrfs/zoned.h
> @@ -361,7 +361,7 @@ static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
>  	struct btrfs_root *root = inode->root;
>  
>  	if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
> -		btrfs_inode_lock(&inode->vfs_inode, 0);
> +		mutex_lock(&root->fs_info->zoned_data_reloc_io_lock);
>  }
>  
>  static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
> @@ -369,7 +369,7 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
>  	struct btrfs_root *root = inode->root;
>  
>  	if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
> -		btrfs_inode_unlock(&inode->vfs_inode, 0);
> +		mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
>  }
>  
>  #endif
Naohiro Aota April 18, 2022, 7:13 a.m. UTC | #2
On Mon, Apr 18, 2022 at 04:03:41PM +0900, Damien Le Moal wrote:
> On 4/18/22 14:50, Naohiro Aota wrote:
> > Currently, we use btrfs_inode_{lock,unlock}() to grant an exclusive
> > writeback of the relocation data inode in
> > btrfs_zoned_data_reloc_{lock,unlock}(). However, that can cause a deadlock
> > in the following path.
> > 
> > Thread A takes btrfs_inode_lock() and waits for metadata reservation by
> > e.g, waiting for writeback:
> > 
> > prealloc_file_extent_cluster()
> >   - btrfs_inode_lock(&inode->vfs_inode, 0);
> >   - btrfs_prealloc_file_range()
> >   ...
> >     - btrfs_replace_file_extents()
> >       - btrfs_start_transaction
> >       ...
> >         - btrfs_reserve_metadata_bytes()
> > 
> > Thread B (e.g, doing a writeback work) needs to wait for the inode lock to
> > continue writeback process:
> > 
> > do_writepages
> >   - btrfs_writepages
> >     - extent_writpages
> >       - btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
> >         - btrfs_inode_lock()
> > 
> > The deadlock is caused by relying on the vfs_inode's lock. By using it, we
> > introduced unnecessary exclusion of writeback and
> > btrfs_prealloc_file_range(). Also, the lock at this point is useless as we
> > don't have any dirty pages in the inode yet.
> > 
> > Introduce fs_info->zoned_data_reloc_io_lock and use it for the exclusive
> > writeback.
> > 
> > Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> 
> Fixes tag ?
> Cc stable ?

Oops, I added them in the commit, but forgot to renew the patch.
I'll resend the updated one.

> Since this is fixing a deadlock, these tags are necessary, no ?
> 
> > ---
> >  fs/btrfs/ctree.h   | 1 +
> >  fs/btrfs/disk-io.c | 1 +
> >  fs/btrfs/zoned.h   | 4 ++--
> >  3 files changed, 4 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> > index 55dee124ee44..580a392d7c37 100644
> > --- a/fs/btrfs/ctree.h
> > +++ b/fs/btrfs/ctree.h
> > @@ -1056,6 +1056,7 @@ struct btrfs_fs_info {
> >  	 */
> >  	spinlock_t relocation_bg_lock;
> >  	u64 data_reloc_bg;
> > +	struct mutex zoned_data_reloc_io_lock;
> >  
> >  	u64 nr_global_roots;
> >  
> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> > index 2689e8589627..2a0284c2430e 100644
> > --- a/fs/btrfs/disk-io.c
> > +++ b/fs/btrfs/disk-io.c
> > @@ -3179,6 +3179,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
> >  	mutex_init(&fs_info->reloc_mutex);
> >  	mutex_init(&fs_info->delalloc_root_mutex);
> >  	mutex_init(&fs_info->zoned_meta_io_lock);
> > +	mutex_init(&fs_info->zoned_data_reloc_io_lock);
> >  	seqlock_init(&fs_info->profiles_lock);
> >  
> >  	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
> > diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
> > index fc2034e66ce3..de923fc8449d 100644
> > --- a/fs/btrfs/zoned.h
> > +++ b/fs/btrfs/zoned.h
> > @@ -361,7 +361,7 @@ static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
> >  	struct btrfs_root *root = inode->root;
> >  
> >  	if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
> > -		btrfs_inode_lock(&inode->vfs_inode, 0);
> > +		mutex_lock(&root->fs_info->zoned_data_reloc_io_lock);
> >  }
> >  
> >  static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
> > @@ -369,7 +369,7 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
> >  	struct btrfs_root *root = inode->root;
> >  
> >  	if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
> > -		btrfs_inode_unlock(&inode->vfs_inode, 0);
> > +		mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
> >  }
> >  
> >  #endif
> 
> 
> -- 
> Damien Le Moal
> Western Digital Research
diff mbox series

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 55dee124ee44..580a392d7c37 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1056,6 +1056,7 @@  struct btrfs_fs_info {
 	 */
 	spinlock_t relocation_bg_lock;
 	u64 data_reloc_bg;
+	struct mutex zoned_data_reloc_io_lock;
 
 	u64 nr_global_roots;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2689e8589627..2a0284c2430e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3179,6 +3179,7 @@  void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	mutex_init(&fs_info->reloc_mutex);
 	mutex_init(&fs_info->delalloc_root_mutex);
 	mutex_init(&fs_info->zoned_meta_io_lock);
+	mutex_init(&fs_info->zoned_data_reloc_io_lock);
 	seqlock_init(&fs_info->profiles_lock);
 
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index fc2034e66ce3..de923fc8449d 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -361,7 +361,7 @@  static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
 	struct btrfs_root *root = inode->root;
 
 	if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
-		btrfs_inode_lock(&inode->vfs_inode, 0);
+		mutex_lock(&root->fs_info->zoned_data_reloc_io_lock);
 }
 
 static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
@@ -369,7 +369,7 @@  static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
 	struct btrfs_root *root = inode->root;
 
 	if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
-		btrfs_inode_unlock(&inode->vfs_inode, 0);
+		mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
 }
 
 #endif