@@ -51,8 +51,8 @@ static u32 max_nritems(u8 level, u32 nodesize)
sizeof(struct btrfs_key_ptr));
}
-static int check_tree_block(struct btrfs_fs_info *fs_info,
- struct extent_buffer *buf)
+int check_tree_block(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *buf)
{
struct btrfs_fs_devices *fs_devices;
@@ -126,6 +126,8 @@ static inline struct extent_buffer* read_tree_block(
parent_transid);
}
+int check_tree_block(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *buf);
int read_extent_data(struct btrfs_root *root, char *data, u64 logical,
u64 *len, int mirror);
void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -124,3 +124,148 @@ static struct scrub_full_stripe *alloc_full_stripe(int nr_stripes,
}
return ret;
}
+
+/*
+ * Tell whether @stripe is a data stripe.
+ *
+ * Return 0 if the stripe's logical address is one of the two special values
+ * BTRFS_RAID5_P_STRIPE or BTRFS_RAID6_Q_STRIPE.
+ * Return 1 for any other logical address.
+ */
+static inline int is_data_stripe(struct scrub_stripe *stripe)
+{
+	const u64 logical = stripe->logical;
+
+	return logical != BTRFS_RAID5_P_STRIPE &&
+	       logical != BTRFS_RAID6_Q_STRIPE;
+}
+
+/*
+ * Verify one copy of a tree block.
+ *
+ * The content to verify comes from one of two places:
+ * - @data is non-NULL: nodesize bytes of @data are copied into the extent
+ *   buffer for @bytenr and nothing is read from disk.
+ * - @data is NULL: the block is read with read_whole_eb() using @bytenr and
+ *   @mirror.
+ *
+ * The @data parameter lets RAID5/6 recovery code verify a block it has
+ * rebuilt in memory.
+ *
+ * Verification steps, in order: @bytenr must be sectorsize aligned, the
+ * checksum must pass csum_tree_block(), and the block must pass
+ * check_tree_block().
+ *
+ * Return 0 if everything is OK; @scrub_ctx->tree_extents_scrubbed is bumped.
+ * Return -ENOMEM if no extent buffer could be obtained.
+ * Return <0 for any other failure, with @scrub_ctx updated accordingly:
+ * verify_errors (unaligned @bytenr or check_tree_block() failure),
+ * read_errors (read failure) or csum_errors (checksum mismatch).
+ */
+static int check_tree_mirror(struct btrfs_fs_info *fs_info,
+			     struct btrfs_scrub_progress *scrub_ctx,
+			     char *data, u64 bytenr, int mirror)
+{
+	u32 blocksize = fs_info->tree_root->nodesize;
+	struct extent_buffer *block;
+	int ret;
+
+	/* Unaligned bytenr: only the counter is bumped, no message here */
+	if (!IS_ALIGNED(bytenr, fs_info->tree_root->sectorsize)) {
+		scrub_ctx->verify_errors++;
+		return -EIO;
+	}
+
+	block = btrfs_find_create_tree_block(fs_info, bytenr, blocksize);
+	if (!block)
+		return -ENOMEM;
+
+	if (!data) {
+		/* No in-memory copy supplied, fetch the requested mirror */
+		ret = read_whole_eb(fs_info, block, mirror);
+		if (ret) {
+			scrub_ctx->read_errors++;
+			error("failed to read tree block %llu mirror %d",
+			      bytenr, mirror);
+			goto out;
+		}
+	} else {
+		memcpy(block->data, data, blocksize);
+	}
+
+	/* Bytes count as scrubbed as soon as the content is available */
+	scrub_ctx->tree_bytes_scrubbed += blocksize;
+
+	if (csum_tree_block(fs_info->tree_root, block, 1) != 0) {
+		error("tree block %llu mirror %d checksum mismatch", bytenr,
+		      mirror);
+		scrub_ctx->csum_errors++;
+		ret = -EIO;
+		goto out;
+	}
+
+	ret = check_tree_block(fs_info, block);
+	if (ret < 0) {
+		error("tree block %llu mirror %d is invalid", bytenr, mirror);
+		scrub_ctx->verify_errors++;
+	} else {
+		scrub_ctx->tree_extents_scrubbed++;
+	}
+out:
+	free_extent_buffer(block);
+	return ret;
+}
+
+/*
+ * read_extent_data() helper
+ *
+ * Read @len bytes starting at logical address @start from @mirror into @buf,
+ * calling read_extent_data() repeatedly until the whole range is covered.
+ * Each call is handed the remaining length and the loop advances by the
+ * length left in that variable afterwards (presumably the number of bytes
+ * actually read -- confirm against read_extent_data()).
+ *
+ * Return the value of the last read_extent_data() call (0 if @len is 0).
+ * Return <0 on read error; an error is printed and @scrub_ctx->read_errors
+ * is bumped. A call that reports success without making any progress is
+ * treated as a read error (-EIO) instead of being retried forever.
+ */
+static int read_extent_data_loop(struct btrfs_fs_info *fs_info,
+				 struct btrfs_scrub_progress *scrub_ctx,
+				 char *buf, u64 start, u64 len, int mirror)
+{
+	int ret = 0;
+	u64 cur = 0;
+
+	while (cur < len) {
+		u64 read_len = len - cur;
+
+		ret = read_extent_data(fs_info->tree_root, buf + cur,
+				       start + cur, &read_len, mirror);
+		/*
+		 * A zero-length "successful" read would leave @cur unchanged
+		 * and spin this loop forever, so report it as an I/O error.
+		 */
+		if (ret >= 0 && read_len == 0)
+			ret = -EIO;
+		if (ret < 0) {
+			error("failed to read out data at bytenr %llu mirror %d",
+			      start + cur, mirror);
+			scrub_ctx->read_errors++;
+			break;
+		}
+		cur += read_len;
+	}
+	return ret;
+}
+
+/*
+ * Rewrite every other copy of a tree block from a known-good one.
+ *
+ * The nodesize-sized block at logical address @start is read from
+ * @good_mirror and then written with write_data_to_disk() for every mirror
+ * index except @good_mirror.
+ *
+ * Return 0 on success.
+ * Return -ENOMEM if the temporary buffer can't be allocated.
+ * Return <0 if reading the good copy or writing any other copy fails; the
+ * first write failure stops the recovery.
+ */
+static int recover_tree_mirror(struct btrfs_fs_info *fs_info,
+			       struct btrfs_scrub_progress *scrub_ctx,
+			       u64 start, int good_mirror)
+{
+	u32 nodesize = fs_info->tree_root->nodesize;
+	char *good_copy;
+	int nr_mirrors;
+	int mirror;
+	int ret;
+
+	good_copy = malloc(nodesize);
+	if (!good_copy)
+		return -ENOMEM;
+
+	ret = read_extent_data_loop(fs_info, scrub_ctx, good_copy, start,
+				    nodesize, good_mirror);
+	if (ret < 0) {
+		error("failed to read tree block at bytenr %llu mirror %d",
+		      start, good_mirror);
+		goto out;
+	}
+
+	nr_mirrors = btrfs_num_copies(&fs_info->mapping_tree, start, nodesize);
+	/*
+	 * NOTE(review): the index runs from 0 to nr_mirrors inclusive, i.e.
+	 * one more iteration than the number of copies. Confirm how
+	 * write_data_to_disk() interprets mirror 0 versus 1..nr_mirrors.
+	 */
+	for (mirror = 0; mirror <= nr_mirrors; mirror++) {
+		if (mirror != good_mirror) {
+			ret = write_data_to_disk(fs_info, good_copy, start,
+						 nodesize, mirror);
+			if (ret < 0) {
+				error("failed to write tree block at bytenr %llu mirror %d",
+				      start, mirror);
+				goto out;
+			}
+		}
+	}
+	/* All writes went through, normalize the result to 0 */
+	ret = 0;
+out:
+	free(good_copy);
+	return ret;
+}
Introduce new functions, check_tree_mirror() and recover_tree_mirror(), to check and recover mirror-based tree blocks (Single/DUP/RAID0/1/10). check_tree_mirror() can also be used on in-memory tree blocks via the @data parameter. This is very handy for the RAID5/6 case: a data stripe tree block can be checked either by @bytenr with 0 as @mirror, or, for recovered in-memory data, through the @data parameter. recover_tree_mirror(), on the other hand, is only used for mirror-based profiles, because RAID5/6 recovery is done per stripe, not per mirror. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> --- disk-io.c | 4 +- disk-io.h | 2 + scrub.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 149 insertions(+), 2 deletions(-)