@@ -133,6 +133,16 @@ struct btrfs_raid_bio {
/* second bad stripe (for raid6 use) */
int failb;
+ /*
+ * For steal_rbio(), we can steal the recovered correct pages,
+ * but finish_parity_scrub() would still use the bad on-disk
+ * pages to calculate parity.
+ * Use these members to tell finish_parity_scrub() which stripes
+ * need the correct (stolen) pages.
+ */
+ int bad_ondisk_a;
+ int bad_ondisk_b;
+
int scrubp;
/*
* number of pages needed to represent the full
@@ -310,6 +320,12 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
return;
+ /* Record the recovered stripe numbers */
+ if (src->faila != -1)
+ dest->bad_ondisk_a = src->faila;
+ if (src->failb != -1)
+ dest->bad_ondisk_b = src->failb;
+
for (i = 0; i < dest->nr_pages; i++) {
s = src->stripe_pages[i];
if (!s || !PageUptodate(s)) {
@@ -998,6 +1014,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
rbio->stripe_npages = stripe_npages;
rbio->faila = -1;
rbio->failb = -1;
+ rbio->bad_ondisk_a = -1;
+ rbio->bad_ondisk_b = -1;
atomic_set(&rbio->refs, 1);
atomic_set(&rbio->error, 0);
atomic_set(&rbio->stripes_pending, 0);
@@ -2271,6 +2289,9 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
int bit;
int index;
struct page *page;
+ struct page *bio_page;
+ void *ptr;
+ void *bio_ptr;
for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
for (i = 0; i < rbio->real_stripes; i++) {
@@ -2281,6 +2302,23 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (!page)
return -ENOMEM;
+
+ /*
+ * It's possible that only a few pages need to be
+ * recovered, while the rest are all good.
+ * In that case we need to copy the good bio pages into
+ * stripe_pages[], as we will use stripe_pages[] rather
+ * than bio_pages[] in finish_parity_scrub().
+ */
+ bio_page = page_in_rbio(rbio, i, bit, 0);
+ if ((i == rbio->bad_ondisk_a ||
+ i == rbio->bad_ondisk_b) && bio_page) {
+ ptr = kmap(page);
+ bio_ptr = kmap(bio_page);
+ memcpy(ptr, bio_ptr, PAGE_SIZE);
+ kunmap(bio_page);
+ kunmap(page);
+ }
rbio->stripe_pages[index] = page;
}
}
@@ -2292,6 +2330,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
{
struct btrfs_bio *bbio = rbio->bbio;
void *pointers[rbio->real_stripes];
+ struct page *mapped_pages[rbio->real_stripes];
DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
int nr_data = rbio->nr_data;
int stripe;
@@ -2352,12 +2391,24 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
void *parity;
/* first collect one page from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
- p = page_in_rbio(rbio, stripe, pagenr, 0);
+
+ /*
+ * Use the stolen recovered pages rather than the
+ * bad on-disk pages
+ */
+ if (stripe == rbio->bad_ondisk_a ||
+ stripe == rbio->bad_ondisk_b)
+ p = rbio_stripe_page(rbio, stripe, pagenr);
+ else
+ p = page_in_rbio(rbio, stripe, pagenr, 0);
pointers[stripe] = kmap(p);
+ mapped_pages[stripe] = p;
}
/* then add the parity stripe */
- pointers[stripe++] = kmap(p_page);
+ pointers[stripe] = kmap(p_page);
+ mapped_pages[stripe] = p_page;
+ stripe++;
if (q_stripe != -1) {
@@ -2365,7 +2416,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
* raid6, add the qstripe and call the
* library function to fill in our p/q
*/
- pointers[stripe++] = kmap(q_page);
+ pointers[stripe] = kmap(q_page);
+ mapped_pages[stripe] = q_page;
+ stripe++;
raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
pointers);
@@ -2385,8 +2438,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
bitmap_clear(rbio->dbitmap, pagenr, 1);
kunmap(p);
+ /* Unmap the pages mapped above */
for (stripe = 0; stripe < rbio->real_stripes; stripe++)
- kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+ kunmap(mapped_pages[stripe]);
}
__free_page(p_page);
In the following situation, scrub will calculate a wrong parity and
overwrite the correct one:

RAID5 full stripe:

Before:
|     Dev 1      |     Dev 2      |     Dev 3     |
| Data stripe 1  | Data stripe 2  | Parity Stripe |
--------------------------------------------------- 0
|  0x0000 (Bad)  |     0xcdcd     |     0x0000    |
--------------------------------------------------- 4K
|     0xcdcd     |     0xcdcd     |     0x0000    |
...
|     0xcdcd     |     0xcdcd     |     0x0000    |
--------------------------------------------------- 64K

After scrubbing dev3 only:
|     Dev 1      |     Dev 2      |     Dev 3     |
| Data stripe 1  | Data stripe 2  | Parity Stripe |
--------------------------------------------------- 0
| 0xcdcd (Good)  |     0xcdcd     |  0xcdcd (Bad) |
--------------------------------------------------- 4K
|     0xcdcd     |     0xcdcd     |     0x0000    |
...
|     0xcdcd     |     0xcdcd     |     0x0000    |
--------------------------------------------------- 64K

The call trace of such corruption is as follows:

scrub_bio_end_io_worker() gets called for each extent read out
|- scrub_block_complete()
   |- Data extent csum mismatch
      |- scrub_handle_errored_block()
         |- scrub_recheck_block()
            |- scrub_submit_raid56_bio_wait()
               |- raid56_parity_recover()

Now we have an rbio with the correct data stripe 1 recovered.
Let's call it "good_rbio".

scrub_parity_check_and_repair()
|- raid56_parity_submit_scrub_rbio()
   |- lock_stripe_add()
   |  |- steal_rbio()
   |     |- Recovered data is stolen from "good_rbio" and stored into
   |        rbio->stripe_pages[].
   |        Now rbio->bio_pages[] still holds the bad data read from disk.
   |- async_scrub_parity()
      |- scrub_parity_work() (delayed call to scrub_parity_work)

scrub_parity_work()
|- raid56_parity_scrub_stripe()
   |- validate_rbio_for_parity_scrub()
      |- finish_parity_scrub()
         |- Recalculates parity using the *BAD* pages in
            rbio->bio_pages[], so the good parity is overwritten
            with a *BAD* one.

The fix is to introduce 2 new members, bad_ondisk_a/b, in struct
btrfs_raid_bio, to tell the scrub code to use the correct data pages
when re-calculating parity.  (A minimal sketch of the parity math
follows after the diffstat.)

Reported-by: Goffredo Baroncelli <kreijack@inwind.it>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
v2:
  Fix a bug that some pages are not unmapped, reported by Chris.
v3:
  Fix a bug using un-initialized pages, which leads to parity
  corruption again.
---
 fs/btrfs/raid56.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 58 insertions(+), 4 deletions(-)
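
For illustration only, not part of the patch: a minimal user-space C
sketch of the RAID5 XOR parity math from the tables above. The byte
values and variable names are hypothetical; it only shows why
recomputing parity from the still-bad bio_pages[] data yields 0xcdcd
instead of the correct 0x0000.

	/* Illustration only -- plain user-space C, not kernel code. */
	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint8_t good_d1 = 0xcd;	/* recovered data stripe 1 byte (stolen page) */
		uint8_t bad_d1  = 0x00;	/* corrupted on-disk copy (bio_pages[]) */
		uint8_t d2      = 0xcd;	/* data stripe 2 byte */

		/* Parity recomputed from the recovered page: 0x00, correct. */
		printf("parity from good data: 0x%02x\n", good_d1 ^ d2);

		/* Parity recomputed from the bad on-disk page: 0xcd, which
		 * overwrites the good parity -- the corruption this patch fixes. */
		printf("parity from bad data:  0x%02x\n", bad_d1 ^ d2);

		return 0;
	}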