diff mbox series

[v2,6/7] btrfs: scrub: ensure we output at least one error message for unrepaired corruption

Message ID 295591e961dc5b1f14b8ffcbccfa1e3530e2ba91.1710906371.git.wqu@suse.com (mailing list archive)
State New, archived
Headers show
Series btrfs: scrub: refine the error messages output frequency | expand

Commit Message

Qu Wenruo March 20, 2024, 3:54 a.m. UTC
For btrfs scrub error messages, I have hit a lot of support cases on
older kernels where no filename is outputted at all, with only error
messages like:

 BTRFS error (device dm-0): bdev /dev/mapper/sys-rootlv errs: wr 0, rd 0, flush 0, corrupt 2823, gen 0
 BTRFS error (device dm-0): unable to fixup (regular) error at logical 1563504640 on dev /dev/mapper/sys-rootlv
 BTRFS error (device dm-0): bdev /dev/mapper/sys-rootlv errs: wr 0, rd 0, flush 0, corrupt 2824, gen 0
 BTRFS error (device dm-0): unable to fixup (regular) error at logical 1579016192 on dev /dev/mapper/sys-rootlv
 BTRFS error (device dm-0): bdev /dev/mapper/sys-rootlv errs: wr 0, rd 0, flush 0, corrupt 2825, gen 0

The "unable to fixup (regular) error" line shows we hit an unrepairable
error, then normally we would do data/metadata backref walk to grab the
correct info.

But we can hit cases like the inode is already orphan (unlinked from any
parent inode), or even the subvolume is orphan (unlinked and waiting to
be deleted).

In that case we're not sure what's the proper way to continue (Is it
some critical data/metadata? Would it prevent the system from booting?)

To improve the situation, this patch would:

- Ensure we at least output one message for each unrepairable error
  If by somehow we didn't output any error message for the error, we
  always fallback to the basic logical/physical error output, with its
  type (data or metadata).

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/scrub.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

Comments

Filipe Manana March 20, 2024, 1:30 p.m. UTC | #1
On Wed, Mar 20, 2024 at 3:55 AM Qu Wenruo <wqu@suse.com> wrote:
>
> For btrfs scrub error messages, I have hit a lot of support cases on
> older kernels where no filename is outputted at all, with only error
> messages like:
>
>  BTRFS error (device dm-0): bdev /dev/mapper/sys-rootlv errs: wr 0, rd 0, flush 0, corrupt 2823, gen 0
>  BTRFS error (device dm-0): unable to fixup (regular) error at logical 1563504640 on dev /dev/mapper/sys-rootlv
>  BTRFS error (device dm-0): bdev /dev/mapper/sys-rootlv errs: wr 0, rd 0, flush 0, corrupt 2824, gen 0
>  BTRFS error (device dm-0): unable to fixup (regular) error at logical 1579016192 on dev /dev/mapper/sys-rootlv
>  BTRFS error (device dm-0): bdev /dev/mapper/sys-rootlv errs: wr 0, rd 0, flush 0, corrupt 2825, gen 0
>
> The "unable to fixup (regular) error" line shows we hit an unrepairable
> error, then normally we would do data/metadata backref walk to grab the
> correct info.
>
> But we can hit cases like the inode is already orphan (unlinked from any
> parent inode), or even the subvolume is orphan (unlinked and waiting to
> be deleted).
>
> In that case we're not sure what's the proper way to continue (Is it
> some critical data/metadata? Would it prevent the system from booting?)
>
> To improve the situation, this patch would:
>
> - Ensure we at least output one message for each unrepairable error
>   If by somehow we didn't output any error message for the error, we
>   always fallback to the basic logical/physical error output, with its
>   type (data or metadata).
>
> Signed-off-by: Qu Wenruo <wqu@suse.com>

Reviewed-by: Filipe Manana <fdmanana@suse.com>

Looks good, just a minor comment below.

> ---
>  fs/btrfs/scrub.c | 25 ++++++++++++++++++-------
>  1 file changed, 18 insertions(+), 7 deletions(-)
>
> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
> index 85f321e3e37c..0d2b042d75c2 100644
> --- a/fs/btrfs/scrub.c
> +++ b/fs/btrfs/scrub.c
> @@ -226,6 +226,7 @@ struct scrub_warning {
>         u64                     physical;
>         u64                     logical;
>         struct btrfs_device     *dev;
> +       bool                    message_printed;
>  };
>
>  static void release_scrub_stripe(struct scrub_stripe *stripe)
> @@ -425,7 +426,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
>          * we deliberately ignore the bit ipath might have been too small to
>          * hold all of the paths here
>          */
> -       for (i = 0; i < ipath->fspath->elem_cnt; ++i)
> +       for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
>                 btrfs_warn_in_rcu(fs_info,
>  "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, path: %s",
>                                   swarn->errstr, swarn->logical,
> @@ -433,6 +434,8 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
>                                   swarn->physical,
>                                   root, inum, offset,
>                                   (char *)(unsigned long)ipath->fspath->val[i]);
> +               swarn->message_printed = true;
> +       }
>
>         btrfs_put_root(local_root);
>         free_ipath(ipath);
> @@ -445,7 +448,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
>                           btrfs_dev_name(swarn->dev),
>                           swarn->physical,
>                           root, inum, offset, ret);
> -
> +       swarn->message_printed = true;
>         free_ipath(ipath);
>         return 0;
>  }
> @@ -471,6 +474,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
>         swarn.logical = logical;
>         swarn.errstr = errstr;
>         swarn.dev = NULL;
> +       swarn.message_printed = false;
>
>         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
>                                   &flags);
> @@ -492,12 +496,8 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
>                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
>                                                       item_size, &ref_root,
>                                                       &ref_level);
> -                       if (ret < 0) {
> -                               btrfs_warn(fs_info,
> -                               "failed to resolve tree backref for logical %llu: %d",
> -                                                 swarn.logical, ret);
> +                       if (ret < 0)
>                                 break;
> -                       }
>                         if (ret > 0)
>                                 break;
>                         btrfs_warn_in_rcu(fs_info,
> @@ -505,7 +505,13 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
>                                 errstr, swarn.logical, btrfs_dev_name(dev),
>                                 swarn.physical, (ref_level ? "node" : "leaf"),
>                                 ref_level, ref_root);
> +                       swarn.message_printed = true;
>                 }
> +               if (!swarn.message_printed)
> +                       btrfs_warn_in_rcu(fs_info,
> +                       "%s at metadata, logical %llu on dev %s physical %llu",
> +                                         errstr, swarn.logical,
> +                                         btrfs_dev_name(dev), swarn.physical);

This could be moved below the btrfs_release_path() call, as there's no
need to be holding a leaf locked while printing the message. It's an
error path and should not happen normally, but it's a good practice to
release any locks as soon as they aren't needed anymore.

Anyway, it's optional, and feel free to skip or do that adjustment
when committing the patch.

Thanks.

>                 btrfs_release_path(path);
>         } else {
>                 struct btrfs_backref_walk_ctx ctx = { 0 };
> @@ -520,6 +526,11 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
>                 swarn.dev = dev;
>
>                 iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
> +               if (!swarn.message_printed)
> +                       btrfs_warn_in_rcu(fs_info,
> +       "%s at data, filename unresolved, logical %llu on dev %s physical %llu",
> +                                         errstr, swarn.logical,
> +                                         btrfs_dev_name(dev), swarn.physical);
>         }
>
>  out:
> --
> 2.44.0
>
>
diff mbox series

Patch

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 85f321e3e37c..0d2b042d75c2 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -226,6 +226,7 @@  struct scrub_warning {
 	u64			physical;
 	u64			logical;
 	struct btrfs_device	*dev;
+	bool			message_printed;
 };
 
 static void release_scrub_stripe(struct scrub_stripe *stripe)
@@ -425,7 +426,7 @@  static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
 	 * we deliberately ignore the bit ipath might have been too small to
 	 * hold all of the paths here
 	 */
-	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
+	for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
 		btrfs_warn_in_rcu(fs_info,
 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, path: %s",
 				  swarn->errstr, swarn->logical,
@@ -433,6 +434,8 @@  static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
 				  swarn->physical,
 				  root, inum, offset,
 				  (char *)(unsigned long)ipath->fspath->val[i]);
+		swarn->message_printed = true;
+	}
 
 	btrfs_put_root(local_root);
 	free_ipath(ipath);
@@ -445,7 +448,7 @@  static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
 			  btrfs_dev_name(swarn->dev),
 			  swarn->physical,
 			  root, inum, offset, ret);
-
+	swarn->message_printed = true;
 	free_ipath(ipath);
 	return 0;
 }
@@ -471,6 +474,7 @@  static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
 	swarn.logical = logical;
 	swarn.errstr = errstr;
 	swarn.dev = NULL;
+	swarn.message_printed = false;
 
 	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
 				  &flags);
@@ -492,12 +496,8 @@  static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
 			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
 						      item_size, &ref_root,
 						      &ref_level);
-			if (ret < 0) {
-				btrfs_warn(fs_info,
-				"failed to resolve tree backref for logical %llu: %d",
-						  swarn.logical, ret);
+			if (ret < 0)
 				break;
-			}
 			if (ret > 0)
 				break;
 			btrfs_warn_in_rcu(fs_info,
@@ -505,7 +505,13 @@  static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
 				errstr, swarn.logical, btrfs_dev_name(dev),
 				swarn.physical, (ref_level ? "node" : "leaf"),
 				ref_level, ref_root);
+			swarn.message_printed = true;
 		}
+		if (!swarn.message_printed)
+			btrfs_warn_in_rcu(fs_info,
+			"%s at metadata, logical %llu on dev %s physical %llu",
+					  errstr, swarn.logical,
+					  btrfs_dev_name(dev), swarn.physical);
 		btrfs_release_path(path);
 	} else {
 		struct btrfs_backref_walk_ctx ctx = { 0 };
@@ -520,6 +526,11 @@  static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
 		swarn.dev = dev;
 
 		iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
+		if (!swarn.message_printed)
+			btrfs_warn_in_rcu(fs_info,
+	"%s at data, filename unresolved, logical %llu on dev %s physical %llu",
+					  errstr, swarn.logical,
+					  btrfs_dev_name(dev), swarn.physical);
 	}
 
 out: