Message ID | ZiFbWx6o-hQ38QyZ@casper.infradead.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Removing PG_error use from btrfs | expand |
On Thu, Apr 18, 2024 at 06:41:47PM +0100, Matthew Wilcox wrote: > We're down to just JFS and btrfs using the PG_error flag. I sent a > patch earlier to remove PG_error from JFS, so now it's your turn ... > > btrfs currently uses it to indicate superblock writeback errors. > This proposal moves that information to a counter in the btrfs_device. > Maybe this isn't the best approach. What do you think? Tracking the number of errors in the device is a good approach. The superblock write is asynchronous but it's not necessary to track the error in the page, we have the device structure in the end io callback. Also it's guaranteed that this is running only from one place so not even the atomics are needed. > I'm currently running fstests against it and it hasn't blown up yet. > > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > index 3d512b041977..5f6f8472ecec 100644 > --- a/fs/btrfs/disk-io.c > +++ b/fs/btrfs/disk-io.c > @@ -3627,28 +3627,24 @@ ALLOW_ERROR_INJECTION(open_ctree, ERRNO); > static void btrfs_end_super_write(struct bio *bio) > { > struct btrfs_device *device = bio->bi_private; > - struct bio_vec *bvec; > - struct bvec_iter_all iter_all; > - struct page *page; > - > - bio_for_each_segment_all(bvec, bio, iter_all) { > - page = bvec->bv_page; > + struct folio_iter fi; I'd rather make the conversion from pages to folios a separate patch from the error counting change. I haven't seen anything obviously wrong but the superblock write is a critical action so it's a matter of precaution. > + bio_for_each_folio_all(fi, bio) { > if (bio->bi_status) { > btrfs_warn_rl_in_rcu(device->fs_info, > - "lost page write due to IO error on %s (%d)", > + "lost sb write due to IO error on %s (%d)", > btrfs_dev_name(device), > blk_status_to_errno(bio->bi_status)); > - ClearPageUptodate(page); > - SetPageError(page); > btrfs_dev_stat_inc_and_print(device, > BTRFS_DEV_STAT_WRITE_ERRS); > - } else { > - SetPageUptodate(page); > + /* Ensure failure if a primary sb fails */ > + if (bio->bi_opf & REQ_FUA) > + atomic_set(&device->sb_wb_errors, INT_MAX / 2); This is using some magic constant so it would be better defined separately and documented what it means. > + else > + atomic_inc(&device->sb_wb_errors); > } > - > - put_page(page); > - unlock_page(page); > + folio_unlock(fi.folio); > + folio_put(fi.folio); > } > > bio_put(bio); > @@ -3750,19 +3746,21 @@ static int write_dev_supers(struct btrfs_device *device, > struct address_space *mapping = device->bdev->bd_mapping; > SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); > int i; > - int errors = 0; > int ret; > u64 bytenr, bytenr_orig; > > + atomic_set(&device->sb_wb_errors, 0); > + > if (max_mirrors == 0) > max_mirrors = BTRFS_SUPER_MIRROR_MAX; > > shash->tfm = fs_info->csum_shash; > > for (i = 0; i < max_mirrors; i++) { > - struct page *page; > + struct folio *folio; > struct bio *bio; > struct btrfs_super_block *disk_super; > + size_t offset; > > bytenr_orig = btrfs_sb_offset(i); > ret = btrfs_sb_log_location(device, i, WRITE, &bytenr); > @@ -3772,7 +3770,7 @@ static int write_dev_supers(struct btrfs_device *device, > btrfs_err(device->fs_info, > "couldn't get super block location for mirror %d", > i); > - errors++; > + atomic_inc(&device->sb_wb_errors); > continue; > } > if (bytenr + BTRFS_SUPER_INFO_SIZE >= > @@ -3785,20 +3783,18 @@ static int write_dev_supers(struct btrfs_device *device, > BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, > sb->csum); > > - page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, > - GFP_NOFS); > - if (!page) { > + folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT, > + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); > + if (IS_ERR(folio)) { > btrfs_err(device->fs_info, > "couldn't get super block page for bytenr %llu", > bytenr); > - errors++; > + atomic_inc(&device->sb_wb_errors); > continue; > } > > - /* Bump the refcount for wait_dev_supers() */ > - get_page(page); > - > - disk_super = page_address(page); > + offset = offset_in_folio(folio, bytenr); > + disk_super = folio_address(folio) + offset; > memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE); > > /* > @@ -3812,8 +3808,7 @@ static int write_dev_supers(struct btrfs_device *device, > bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; > bio->bi_private = device; > bio->bi_end_io = btrfs_end_super_write; > - __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE, > - offset_in_page(bytenr)); > + bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset); > > /* > * We FUA only the first super block. The others we allow to > @@ -3825,9 +3820,9 @@ static int write_dev_supers(struct btrfs_device *device, > submit_bio(bio); > > if (btrfs_advance_sb_log(device, i)) > - errors++; > + atomic_inc(&device->sb_wb_errors); > } > - return errors < i ? 0 : -1; > + return atomic_read(&device->sb_wb_errors) < i ? 0 : -1; > } > > /* > @@ -3849,7 +3844,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) > max_mirrors = BTRFS_SUPER_MIRROR_MAX; > > for (i = 0; i < max_mirrors; i++) { > - struct page *page; > + struct folio *folio; > > ret = btrfs_sb_log_location(device, i, READ, &bytenr); > if (ret == -ENOENT) { > @@ -3864,29 +3859,19 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) > device->commit_total_bytes) > break; > > - page = find_get_page(device->bdev->bd_mapping, > + folio = filemap_get_folio(device->bdev->bd_mapping, > bytenr >> PAGE_SHIFT); > - if (!page) { > - errors++; > - if (i == 0) > - primary_failed = true; > + /* If the folio has been removed, then we know it completed */ > + if (IS_ERR(folio)) > continue; > - } > - /* Page is submitted locked and unlocked once the IO completes */ > - wait_on_page_locked(page); > - if (PageError(page)) { > - errors++; > - if (i == 0) > - primary_failed = true; > - } > - > - /* Drop our reference */ > - put_page(page); > - > - /* Drop the reference from the writing run */ > - put_page(page); > + /* Folio is unlocked once the IO completes */ > + folio_wait_locked(folio); > + folio_put(folio); > } > > + errors += atomic_read(&device->sb_wb_errors); > + if (errors >= INT_MAX / 2) > + primary_failed = true; Alternatively a flag can be set in the device if the primary superblock write fails but I think encoding that in the error count also works, as long as it's a named constant.
On Thu, Apr 18, 2024 at 08:00:51PM +0200, David Sterba wrote: > On Thu, Apr 18, 2024 at 06:41:47PM +0100, Matthew Wilcox wrote: > > btrfs currently uses it to indicate superblock writeback errors. > > This proposal moves that information to a counter in the btrfs_device. > > Maybe this isn't the best approach. What do you think? > > Tracking the number of errors in the device is a good approach. The > superblock write is asynchronous but it's not necessary to track the > error in the page, we have the device structure in the end io callback. > Also it's guaranteed that this is running only from one place so not > even the atomics are needed. Ah, but the completions might happen on different CPUs at the same time. And the completion of the first write might happen while the submission for a subsequent write are still running, and that might also adjust the sb_wb_error value, so I think it is needed. > I'd rather make the conversion from pages to folios a separate patch > from the error counting change. I haven't seen anything obviously wrong > but the superblock write is a critical action so it's a matter of > precaution. Understood. I've split off three patches for folio conversion; one for each function. > > - } else { > > - SetPageUptodate(page); > > + /* Ensure failure if a primary sb fails */ > > + if (bio->bi_opf & REQ_FUA) > > + atomic_set(&device->sb_wb_errors, INT_MAX / 2); > > This is using some magic constant so it would be better defined > separately and documented what it means. I was hoping that comment would be enough. Name for the constant? BTRFS_PRIMARY_SB_FAILED? > Alternatively a flag can be set in the device if the primary superblock > write fails but I think encoding that in the error count also works, as > long as it's a named constant. Yeah, I thought about that option too. This seemed like the easiest way to go.
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3d512b041977..5f6f8472ecec 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3627,28 +3627,24 @@ ALLOW_ERROR_INJECTION(open_ctree, ERRNO); static void btrfs_end_super_write(struct bio *bio) { struct btrfs_device *device = bio->bi_private; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - struct page *page; - - bio_for_each_segment_all(bvec, bio, iter_all) { - page = bvec->bv_page; + struct folio_iter fi; + bio_for_each_folio_all(fi, bio) { if (bio->bi_status) { btrfs_warn_rl_in_rcu(device->fs_info, - "lost page write due to IO error on %s (%d)", + "lost sb write due to IO error on %s (%d)", btrfs_dev_name(device), blk_status_to_errno(bio->bi_status)); - ClearPageUptodate(page); - SetPageError(page); btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); - } else { - SetPageUptodate(page); + /* Ensure failure if a primary sb fails */ + if (bio->bi_opf & REQ_FUA) + atomic_set(&device->sb_wb_errors, INT_MAX / 2); + else + atomic_inc(&device->sb_wb_errors); } - - put_page(page); - unlock_page(page); + folio_unlock(fi.folio); + folio_put(fi.folio); } bio_put(bio); @@ -3750,19 +3746,21 @@ static int write_dev_supers(struct btrfs_device *device, struct address_space *mapping = device->bdev->bd_mapping; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); int i; - int errors = 0; int ret; u64 bytenr, bytenr_orig; + atomic_set(&device->sb_wb_errors, 0); + if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; shash->tfm = fs_info->csum_shash; for (i = 0; i < max_mirrors; i++) { - struct page *page; + struct folio *folio; struct bio *bio; struct btrfs_super_block *disk_super; + size_t offset; bytenr_orig = btrfs_sb_offset(i); ret = btrfs_sb_log_location(device, i, WRITE, &bytenr); @@ -3772,7 +3770,7 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_err(device->fs_info, "couldn't get super block location for mirror %d", i); - errors++; + atomic_inc(&device->sb_wb_errors); continue; } if (bytenr + BTRFS_SUPER_INFO_SIZE >= @@ -3785,20 +3783,18 @@ static int write_dev_supers(struct btrfs_device *device, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, sb->csum); - page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, - GFP_NOFS); - if (!page) { + folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folio)) { btrfs_err(device->fs_info, "couldn't get super block page for bytenr %llu", bytenr); - errors++; + atomic_inc(&device->sb_wb_errors); continue; } - /* Bump the refcount for wait_dev_supers() */ - get_page(page); - - disk_super = page_address(page); + offset = offset_in_folio(folio, bytenr); + disk_super = folio_address(folio) + offset; memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE); /* @@ -3812,8 +3808,7 @@ static int write_dev_supers(struct btrfs_device *device, bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; bio->bi_private = device; bio->bi_end_io = btrfs_end_super_write; - __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE, - offset_in_page(bytenr)); + bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset); /* * We FUA only the first super block. The others we allow to @@ -3825,9 +3820,9 @@ static int write_dev_supers(struct btrfs_device *device, submit_bio(bio); if (btrfs_advance_sb_log(device, i)) - errors++; + atomic_inc(&device->sb_wb_errors); } - return errors < i ? 0 : -1; + return atomic_read(&device->sb_wb_errors) < i ? 0 : -1; } /* @@ -3849,7 +3844,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) max_mirrors = BTRFS_SUPER_MIRROR_MAX; for (i = 0; i < max_mirrors; i++) { - struct page *page; + struct folio *folio; ret = btrfs_sb_log_location(device, i, READ, &bytenr); if (ret == -ENOENT) { @@ -3864,29 +3859,19 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) device->commit_total_bytes) break; - page = find_get_page(device->bdev->bd_mapping, + folio = filemap_get_folio(device->bdev->bd_mapping, bytenr >> PAGE_SHIFT); - if (!page) { - errors++; - if (i == 0) - primary_failed = true; + /* If the folio has been removed, then we know it completed */ + if (IS_ERR(folio)) continue; - } - /* Page is submitted locked and unlocked once the IO completes */ - wait_on_page_locked(page); - if (PageError(page)) { - errors++; - if (i == 0) - primary_failed = true; - } - - /* Drop our reference */ - put_page(page); - - /* Drop the reference from the writing run */ - put_page(page); + /* Folio is unlocked once the IO completes */ + folio_wait_locked(folio); + folio_put(folio); } + errors += atomic_read(&device->sb_wb_errors); + if (errors >= INT_MAX / 2) + primary_failed = true; /* log error, force error return */ if (primary_failed) { btrfs_err(device->fs_info, "error writing primary super block to device %llu", diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index cf555f5b47ce..44c639720426 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -142,6 +142,8 @@ struct btrfs_device { /* type and info about this device */ u64 type; + atomic_t sb_wb_errors; + /* minimal io size for this device */ u32 sector_size;