@@ -1145,11 +1145,13 @@ static bool find_next_delalloc_bitmap(struct folio *folio,
* helper for extent_writepage(), doing all of the delayed allocation setup.
*
* This returns 1 if btrfs_run_delalloc_range function did all the work required
- * to write the page (copy into inline extent). In this case the IO has
- * been started and the page is already unlocked.
+ * to write the page (e.g. copied into an inline extent or compressed). In
+ * this case
+ * the IO has been started and we should no longer touch the page (may have
+ * already been unlocked).
*
* This returns 0 if all went well (page still locked)
- * This returns < 0 if there were errors (page still locked)
+ * This returns < 0 if there were errors (page still locked), in this case any
+ * newly created delalloc range will be marked as error and finished.
*/
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct folio *folio,
@@ -1167,6 +1169,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
* last delalloc end.
*/
u64 last_delalloc_end = 0;
+ /*
+ * Save the last successfully run delalloc range end (exclusive).
+ * This is used during error handling, to avoid ranges with ordered
+ * extents created but for which no IO will be submitted due to the error.
+ */
+ u64 last_finished = page_start;
u64 delalloc_start = page_start;
u64 delalloc_end = page_end;
u64 delalloc_to_write = 0;
@@ -1235,11 +1243,19 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
found_len = last_delalloc_end + 1 - found_start;
if (ret >= 0) {
+ /*
+			 * Some delalloc ranges may have been created by previous
+			 * folios. Thus we still need to clean those ranges up
+			 * during error handling.
+ */
+ last_finished = found_start;
/* No errors hit so far, run the current delalloc range. */
ret = btrfs_run_delalloc_range(inode, folio,
found_start,
found_start + found_len - 1,
wbc);
+ if (ret >= 0)
+ last_finished = found_start + found_len;
} else {
/*
* We've hit an error during previous delalloc range,
@@ -1274,8 +1290,21 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
delalloc_start = found_start + found_len;
}
- if (ret < 0)
+ /*
+ * It's possible we have some ordered extents created before we hit
+	 * an error; clean up the non-async successfully created delalloc ranges.
+ */
+ if (unlikely(ret < 0)) {
+ unsigned int bitmap_size = min(
+ (last_finished - page_start) >> fs_info->sectorsize_bits,
+ fs_info->sectors_per_page);
+
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size)
+ btrfs_mark_ordered_io_finished(inode, folio,
+ page_start + (bit << fs_info->sectorsize_bits),
+ fs_info->sectorsize, false);
return ret;
+ }
out:
if (last_delalloc_end)
delalloc_end = last_delalloc_end;
@@ -1509,13 +1538,13 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
bio_ctrl->wbc->nr_to_write--;
-done:
- if (ret) {
+ if (ret)
btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio,
page_start, PAGE_SIZE, !ret);
- mapping_set_error(folio->mapping, ret);
- }
+done:
+ if (ret < 0)
+ mapping_set_error(folio->mapping, ret);
/*
* Only unlock ranges that are submitted. As there can be some async
* submitted ranges inside the folio.
[BUG] There are several crashes or hangs during fstests runs with a sectorsize < page size setup. It turns out that most of those hangs happen after a btrfs_run_delalloc_range() failure (caused by -ENOSPC). The most common one is generic/750. The symptoms are all related to ordered extent finishing, where we double account the target ordered extent. [CAUSE] Inside writepage_delalloc(), if we hit an error from btrfs_run_delalloc_range(), we still need to unlock all the locked ranges, but that's the only error handling. If we have the following page layout with a 64K page size and 4K sector size: 0 4K 32K 40K 60K 64K |////| |////| |/////| Where |//| represents the dirtied blocks inside the folio. Then we hit the following sequence: - Enter writepage_delalloc() for folio 0 - btrfs_run_delalloc_range() returned 0 for [0, 4K) And created a regular COW ordered extent for range [0, 4K) - btrfs_run_delalloc_range() returned 0 for [32K, 40K) And created an async extent for range [32K, 40K). This means the error handling will be done in another thread; we should not touch the range anymore. - btrfs_run_delalloc_range() failed with -ENOSPC for range [60K, 64K) In theory we should not fail, since we should have reserved enough space at buffered write time, but let's ignore that rabbit hole and focus on the error handling. - Error handling in extent_writepage() Now we go to the done: tag, calling btrfs_mark_ordered_io_finished() for the whole folio range. This will find ranges [0, 4K) and [32K, 40K) to clean up; for [0, 4K) it should indeed be cleaned up, but range [32K, 40K) is handled asynchronously and its OE may have already been submitted. This will lead to double accounting for range [32K, 40K) and crash the kernel. Unfortunately this bad error handling has existed from the very beginning of sector size < page size support. [FIX] Instead of relying on the btrfs_mark_ordered_io_finished() call to clean up the whole folio range, record the last successfully run delalloc range. 
Combine that with bio_ctrl->submit_bitmap to properly clean up any newly created ordered extents. Since we have cleaned up the ordered extents in that range, we should not rely on the btrfs_mark_ordered_io_finished() call inside extent_writepage() anymore. With this, we avoid double accounting during error handling. Cc: stable@vger.kernel.org # 5.15+ Signed-off-by: Qu Wenruo <wqu@suse.com> --- fs/btrfs/extent_io.c | 45 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 8 deletions(-)