@@ -912,7 +912,7 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
struct btrfs_io_context *bioc)
{
- const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
+ const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
const unsigned int num_pages = stripe_npages * real_stripes;
const unsigned int stripe_nsectors =
@@ -1282,10 +1282,16 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
goto error;
}
- if (likely(!rbio->bioc->num_tgtdevs))
+ if (likely(!rbio->bioc->replace_nr_stripes))
return 0;
- /* Make a copy for the replace target device. */
+ /*
+ * Make a copy for the replace target device.
+ *
+ * Thus the source stripe number (in replace_stripe_src) should be valid.
+ */
+ ASSERT(rbio->bioc->replace_stripe_src >= 0);
+
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
struct sector_ptr *sector;
@@ -1293,7 +1299,12 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
stripe = total_sector_nr / rbio->stripe_nsectors;
sectornr = total_sector_nr % rbio->stripe_nsectors;
- if (!rbio->bioc->tgtdev_map[stripe]) {
+ /*
+ * For RAID56, there is only one device that can be replaced,
+ * and replace_stripe_src[0] indicates the stripe number we
+ * need to copy from.
+ */
+ if (stripe != rbio->bioc->replace_stripe_src) {
/*
* We can skip the whole stripe completely, note
* total_sector_nr will be increased by one anyway.
@@ -1316,7 +1327,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
}
ret = rbio_add_io_sector(rbio, bio_list, sector,
- rbio->bioc->tgtdev_map[stripe],
+ rbio->real_stripes,
sectornr, REQ_OP_WRITE);
if (ret)
goto error;
@@ -2442,7 +2453,12 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
else
BUG();
- if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
+ /*
+ * Replace is running and our P/Q stripe is being replaced, then we
+ * need to duplicate the final write to replace target.
+ */
+ if (bioc->replace_nr_stripes &&
+ bioc->replace_stripe_src == rbio->scrubp) {
is_replace = 1;
bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
}
@@ -2544,13 +2560,18 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
if (!is_replace)
goto submit_write;
+ /*
+ * Replace is running and our parity stripe needs to be duplicated to
+ * the target device. Check we have a valid source stripe number.
+ */
+ ASSERT(rbio->bioc->replace_stripe_src >= 0);
for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
struct sector_ptr *sector;
sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
ret = rbio_add_io_sector(rbio, &bio_list, sector,
- bioc->tgtdev_map[rbio->scrubp],
- sectornr, REQ_OP_WRITE);
+ rbio->real_stripes,
+ sectornr, REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -1230,7 +1230,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sblock_other = sblocks_for_recheck[mirror_index];
} else {
struct scrub_recover *r = sblock_bad->sectors[0]->recover;
- int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
+ int max_allowed = r->bioc->num_stripes - r->bioc->replace_nr_stripes;
if (mirror_index >= max_allowed)
break;
@@ -1540,7 +1540,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
bioc->map_type,
bioc->raid_map,
bioc->num_stripes -
- bioc->num_tgtdevs,
+ bioc->replace_nr_stripes,
mirror_index,
&stripe_index,
&stripe_offset);
@@ -5898,8 +5898,7 @@ static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
}
static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
- u16 total_stripes,
- u16 real_stripes)
+ u16 total_stripes)
{
struct btrfs_io_context *bioc;
@@ -5908,8 +5907,6 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
sizeof(struct btrfs_io_context) +
/* Plus the variable array for the stripes */
sizeof(struct btrfs_io_stripe) * (total_stripes) +
- /* Plus the variable array for the tgt dev */
- sizeof(u16) * (real_stripes) +
/*
* Plus the raid_map, which includes both the tgt dev
* and the stripes.
@@ -5923,8 +5920,8 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
refcount_set(&bioc->refs, 1);
bioc->fs_info = fs_info;
- bioc->tgtdev_map = (u16 *)(bioc->stripes + total_stripes);
- bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
+ bioc->raid_map = (u64 *)(bioc->stripes + total_stripes);
+ bioc->replace_stripe_src = -1;
return bioc;
}
@@ -6188,93 +6185,75 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
int *num_stripes_ret, int *max_errors_ret)
{
u64 srcdev_devid = dev_replace->srcdev->devid;
- int tgtdev_indexes = 0;
+ /*
+ * At this stage, num_stripes is still the real number of stripes,
+ * excluding the duplicated stripes.
+ */
int num_stripes = *num_stripes_ret;
+ int nr_extra_stripes = 0;
int max_errors = *max_errors_ret;
int i;
- if (op == BTRFS_MAP_WRITE) {
- int index_where_to_add;
+ /*
+ * A block group which has "to_copy" set will eventually be copied by
+ * the dev-replace process. We can avoid cloning IO here.
+ */
+ if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
+ return;
+
+ /*
+ * Duplicate the write operations while the dev-replace procedure is
+ * running. Since the copying of the old disk to the new disk takes
+ * place at run time while the filesystem is mounted writable, the
+ * regular write operations to the old disk have to be duplicated to go
+ * to the new disk as well.
+ *
+ * Note that device->missing is handled by the caller, and that the
+ * write to the old disk is already set up in the stripes array.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ struct btrfs_io_stripe *old = &bioc->stripes[i];
+ struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes];
+
+ if (old->dev->devid != srcdev_devid)
+ continue;
+
+ new->physical = old->physical;
+ new->dev = dev_replace->tgtdev;
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ bioc->replace_stripe_src = i;
+ nr_extra_stripes++;
+ }
+
+ /* We can only have at most 2 extra nr_stripes (for DUP). */
+ ASSERT(nr_extra_stripes <= 2);
+ /*
+ * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for
+ * replace.
+ * If we have 2 extra stripes, only choose the one with smaller physical.
+ */
+ if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
+ struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
+ struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];
+
+ /* Only DUP can have two extra stripes. */
+ ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP);
/*
- * A block group which have "to_copy" set will eventually
- * copied by dev-replace process. We can avoid cloning IO here.
+ * Swap the last stripe stripes and just reduce
+ * @nr_extra_stripes. The extra stripe would still be there,
+ * but won't be accessed.
*/
- if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
- return;
-
- /*
- * duplicate the write operations while the dev replace
- * procedure is running. Since the copying of the old disk to
- * the new disk takes place at run time while the filesystem is
- * mounted writable, the regular write operations to the old
- * disk have to be duplicated to go to the new disk as well.
- *
- * Note that device->missing is handled by the caller, and that
- * the write to the old disk is already set up in the stripes
- * array.
- */
- index_where_to_add = num_stripes;
- for (i = 0; i < num_stripes; i++) {
- if (bioc->stripes[i].dev->devid == srcdev_devid) {
- /* write to new disk, too */
- struct btrfs_io_stripe *new =
- bioc->stripes + index_where_to_add;
- struct btrfs_io_stripe *old =
- bioc->stripes + i;
-
- new->physical = old->physical;
- new->dev = dev_replace->tgtdev;
- bioc->tgtdev_map[i] = index_where_to_add;
- index_where_to_add++;
- max_errors++;
- tgtdev_indexes++;
- }
- }
- num_stripes = index_where_to_add;
- } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
- int index_srcdev = 0;
- int found = 0;
- u64 physical_of_found = 0;
-
- /*
- * During the dev-replace procedure, the target drive can also
- * be used to read data in case it is needed to repair a corrupt
- * block elsewhere. This is possible if the requested area is
- * left of the left cursor. In this area, the target drive is a
- * full copy of the source drive.
- */
- for (i = 0; i < num_stripes; i++) {
- if (bioc->stripes[i].dev->devid == srcdev_devid) {
- /*
- * In case of DUP, in order to keep it simple,
- * only add the mirror with the lowest physical
- * address
- */
- if (found &&
- physical_of_found <= bioc->stripes[i].physical)
- continue;
- index_srcdev = i;
- found = 1;
- physical_of_found = bioc->stripes[i].physical;
- }
- }
- if (found) {
- struct btrfs_io_stripe *tgtdev_stripe =
- bioc->stripes + num_stripes;
-
- tgtdev_stripe->physical = physical_of_found;
- tgtdev_stripe->dev = dev_replace->tgtdev;
- bioc->tgtdev_map[index_srcdev] = num_stripes;
-
- tgtdev_indexes++;
- num_stripes++;
+ if (first->physical > second->physical) {
+ swap(second->physical, first->physical);
+ swap(second->dev, first->dev);
+ nr_extra_stripes--;
}
}
- *num_stripes_ret = num_stripes;
- *max_errors_ret = max_errors;
- bioc->num_tgtdevs = tgtdev_indexes;
+ *num_stripes_ret = num_stripes + nr_extra_stripes;
+ *max_errors_ret = max_errors + nr_extra_stripes;
+ bioc->replace_nr_stripes = nr_extra_stripes;
}
static bool need_full_stripe(enum btrfs_map_op op)
@@ -6361,7 +6340,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int dev_replace_is_ongoing = 0;
int patch_the_first_stripe_for_dev_replace = 0;
u16 num_alloc_stripes;
- u16 tgtdev_indexes = 0;
u64 physical_to_patch_in_first_stripe = 0;
u64 raid56_full_stripe_start = (u64)-1;
u64 max_len;
@@ -6506,13 +6484,16 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
}
num_alloc_stripes = num_stripes;
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
- if (op == BTRFS_MAP_WRITE)
- num_alloc_stripes <<= 1;
- if (op == BTRFS_MAP_GET_READ_MIRRORS)
- num_alloc_stripes++;
- tgtdev_indexes = num_stripes;
- }
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
+ op != BTRFS_MAP_READ)
+ /*
+ * For replace case, we need to add extra stripes for extra
+ * duplicated stripes.
+ *
+ * For both WRITE and GET_READ_MIRRORS, we may have at most
+ * 2 more stripes (DUP types, otherwise 1).
+ */
+ num_alloc_stripes += 2;
/*
* If this I/O maps to a single device, try to return the device and
@@ -6537,11 +6518,12 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
goto out;
}
- bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
+ bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
if (!bioc) {
ret = -ENOMEM;
goto out;
}
+ bioc->map_type = map->type;
for (i = 0; i < num_stripes; i++) {
set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset,
@@ -6582,7 +6564,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
}
*bioc_ret = bioc;
- bioc->map_type = map->type;
bioc->num_stripes = num_stripes;
bioc->max_errors = max_errors;
bioc->mirror_num = mirror_num;
@@ -426,14 +426,13 @@ struct btrfs_io_context {
/*
* The following two members are for dev-replace case only.
*
- * @num_tgtdevs: Number of duplicated stripes which need to be
+ * @replace_nr_stripes: Number of duplicated stripes which need to be
* written to replace target.
* Should be <= 2 (2 for DUP, otherwise <= 1).
- * @tgtdev_map: The array indicates where the duplicated stripes
- * are from. The size is the number of original
- * stripes (num_stripes - num_tgtdevs).
+ * @replace_stripe_src: The array indicates where the duplicated stripes
+ * are from.
*
- * The @tgtdev_map[] array is mostly for RAID56 cases.
+ * The @replace_stripe_src[] array is mostly for RAID56 cases.
* As non-RAID56 stripes share the same contents of the mapped range,
* thus no need to bother where the duplicated ones are from.
*
@@ -448,14 +447,17 @@ struct btrfs_io_context {
* stripes[2]: dev = devid 3, physical = Z
* stripes[3]: dev = devid 0, physical = Y
*
- * num_tgtdevs = 1
- * tgtdev_map[0] = 0 <- Means stripes[0] is not involved in replace.
- * tgtdev_map[1] = 3 <- Means stripes[1] is involved in replace,
- * and it's duplicated to stripes[3].
- * tgtdev_map[2] = 0 <- Means stripes[2] is not involved in replace.
+ * replace_nr_stripes = 1
+ * replace_stripe_src = 1 <- Means stripes[1] is involved in replace.
+ * The duplicated stripe index would be
+ * (@num_stripes - 1).
+ *
+ * Note, that we can still have cases replace_nr_stripes = 2 for DUP.
+ * In that case, all stripes share the same content, thus we don't
+ * need to bother @replace_stripe_src value at all.
*/
- u16 num_tgtdevs;
- u16 *tgtdev_map;
+ u16 replace_nr_stripes;
+ s16 replace_stripe_src;
/*
* logical block numbers for the start of each stripe
* The last one or two are p/q. These are sorted,