
[v4,2/6] btrfs: lock subpage ranges in one go for writepage_delalloc()

Message ID 91da2c388f3152f40af0e3e8c7ca7c7f8f6687db.1715386434.git.wqu@suse.com (mailing list archive)
State New
Series btrfs: subpage + zoned fixes

Commit Message

Qu Wenruo May 11, 2024, 12:15 a.m. UTC
If we have a subpage range like this for a 16K page with 4K sectorsize:

    0     4K     8K     12K     16K
    |/////|      |//////|       |

    |/////| = dirty range

Currently writepage_delalloc() goes through the following steps (see the
simplified sketch below):

- lock range [0, 4K)
- run delalloc range for [0, 4K)
- lock range [8K, 12K)
- run delalloc range for [8K 12K)
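
For reference, a simplified sketch of that current single loop (taken from
writepage_delalloc(), with the page accounting trimmed):

  while (delalloc_start < page_end) {
          delalloc_end = page_end;

          /* Find the next delalloc range and lock its extent and pages. */
          if (!find_lock_delalloc_range(&inode->vfs_inode, page,
                                        &delalloc_start, &delalloc_end)) {
                  delalloc_start = delalloc_end + 1;
                  continue;
          }

          /* Run delalloc for this range right away, before finding the next one. */
          ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
                                         delalloc_end, wbc);
          if (ret < 0)
                  return ret;

          delalloc_start = delalloc_end + 1;
  }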

So far this is fine for regular subpage, as btrfs_run_delalloc_range() can
only go into one of run_delalloc_nocow(), cow_file_range() or
run_delalloc_compressed().

But there is a special pitfall for zoned subpage, where we go through
run_delalloc_cow(), which creates an ordered extent for the range and
immediately submits it.
This unlocks the whole page range, triggering all kinds of ASSERT()s
related to the locked page.

This patch does not yet address the run_delalloc_cow() problem, but
prepares for the proper fix by changing the workflow to the following
one (sketched after the list):

- lock range [0, 4K)
- lock range [8K, 12K)
- run delalloc range for [0, 4K)
- run delalloc range for [8K, 12K)
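
Roughly, the single loop is split into a lock phase and a run phase, as in
this simplified sketch (error handling and the cached single-range fast
path omitted, see the full diff for details):

  /* Phase 1: find and lock every delalloc range inside the page. */
  while (delalloc_start < page_end) {
          delalloc_end = page_end;
          if (!find_lock_delalloc_range(&inode->vfs_inode, page,
                                        &delalloc_start, &delalloc_end)) {
                  delalloc_start = delalloc_end + 1;
                  continue;
          }
          /* Only record the locked range here, do not run delalloc yet. */
          add_locked_delalloc_range(fs_info, &locked_list, delalloc_start,
                                    delalloc_end + 1 - delalloc_start);
          delalloc_start = delalloc_end + 1;
  }

  /* Phase 2: run delalloc for every range recorded in phase 1. */
  list_for_each_entry(entry, &locked_list.head, list)
          ret = btrfs_run_delalloc_range(inode, page, entry->delalloc_start,
                                         entry->delalloc_start +
                                         entry->delalloc_len - 1, wbc);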

So later, for subpage cases, we can do extra work to ensure
run_delalloc_cow() only unlocks the full page once the last lock user is
finished with it.

To save the previously locked ranges, we utilize a structure called
locked_delalloc_list, which caches the last hit delalloc range, so for
non-subpage cases only that cached value is used and no extra memory is
allocated.

For subpage cases, since we can hit multiple delalloc ranges inside a
page, a list is utilized and we allocate memory for its entries.
For the 16K example above, [0, 4K) stays in the cached slot until
[8K, 12K) is found, at which point [0, 4K) is moved onto the list.
This introduces a new memory allocation and thus extra error paths, but
the method is only temporary; we will later use the subpage bitmap to
avoid the memory allocation.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/extent_io.c | 140 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 136 insertions(+), 4 deletions(-)

Comments

Johannes Thumshirn May 12, 2024, 5:08 p.m. UTC | #1
Why do we even need this patch in the first place, when we're basically 
reverting it again in Patch 4/6?
Qu Wenruo May 12, 2024, 10:09 p.m. UTC | #2
On 2024/5/13 02:38, Johannes Thumshirn wrote:
> Why do we even need this patch in the first place, when we're basically
> reverting it again in Patch 4/6?

Indeed, I'll merge this patch into 4/6.

Thanks,
Qu

Patch

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8a4a7d00795f..6f237c77699a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1213,6 +1213,101 @@  static inline void contiguous_readpages(struct page *pages[], int nr_pages,
 	}
 }
 
+struct locked_delalloc_range {
+	struct list_head list;
+	u64 delalloc_start;
+	u32 delalloc_len;
+};
+
+/*
+ * Save the locked delalloc range.
+ *
+ * This is for subpage only, as for regular sectorsize, there will be at most
+ * one locked delalloc range for a page.
+ */
+struct locked_delalloc_list {
+	u64 last_delalloc_start;
+	u32 last_delalloc_len;
+	struct list_head head;
+};
+
+static void init_locked_delalloc_list(struct locked_delalloc_list *locked_list)
+{
+	INIT_LIST_HEAD(&locked_list->head);
+	locked_list->last_delalloc_start = 0;
+	locked_list->last_delalloc_len = 0;
+}
+
+static void release_locked_delalloc_list(struct locked_delalloc_list *locked_list)
+{
+	while (!list_empty(&locked_list->head)) {
+		struct locked_delalloc_range *entry;
+
+		entry = list_entry(locked_list->head.next,
+				   struct locked_delalloc_range, list);
+
+		list_del_init(&entry->list);
+		kfree(entry);
+	}
+}
+
+static int add_locked_delalloc_range(struct btrfs_fs_info *fs_info,
+				     struct locked_delalloc_list *locked_list,
+				     u64 start, u32 len)
+{
+	struct locked_delalloc_range *entry;
+
+	/* The first range is only cached, no allocation needed. */
+	if (locked_list->last_delalloc_len == 0) {
+		locked_list->last_delalloc_start = start;
+		locked_list->last_delalloc_len = len;
+		return 0;
+	}
+	/* The new entry must be beyond the current one. */
+	ASSERT(start >= locked_list->last_delalloc_start +
+			locked_list->last_delalloc_len);
+
+	/* Only subpage cases can have more than one delalloc range inside a page. */
+	ASSERT(fs_info->sectorsize < PAGE_SIZE);
+
+	entry = kmalloc(sizeof(*entry), GFP_NOFS);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->delalloc_start = locked_list->last_delalloc_start;
+	entry->delalloc_len = locked_list->last_delalloc_len;
+	locked_list->last_delalloc_start = start;
+	locked_list->last_delalloc_len = len;
+	list_add_tail(&entry->list, &locked_list->head);
+	return 0;
+}
+
+static void __cold unlock_one_locked_delalloc_range(struct btrfs_inode *binode,
+						    struct page *locked_page,
+						    u64 start, u32 len)
+{
+	u64 delalloc_end = start + len - 1;
+
+	unlock_extent(&binode->io_tree, start, delalloc_end, NULL);
+	__unlock_for_delalloc(&binode->vfs_inode, locked_page, start,
+			      delalloc_end);
+}
+
+static void unlock_locked_delalloc_list(struct btrfs_inode *binode,
+					struct page *locked_page,
+					struct locked_delalloc_list *locked_list)
+{
+	struct locked_delalloc_range *entry;
+
+	list_for_each_entry(entry, &locked_list->head, list)
+		unlock_one_locked_delalloc_range(binode, locked_page,
+				entry->delalloc_start, entry->delalloc_len);
+	if (locked_list->last_delalloc_len) {
+		unlock_one_locked_delalloc_range(binode, locked_page,
+				locked_list->last_delalloc_start,
+				locked_list->last_delalloc_len);
+	}
+}
+
 /*
  * helper for __extent_writepage, doing all of the delayed allocation setup.
  *
@@ -1226,13 +1321,17 @@  static inline void contiguous_readpages(struct page *pages[], int nr_pages,
 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 		struct page *page, struct writeback_control *wbc)
 {
+	struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode);
 	const u64 page_start = page_offset(page);
 	const u64 page_end = page_start + PAGE_SIZE - 1;
+	struct locked_delalloc_list locked_list;
+	struct locked_delalloc_range *entry;
 	u64 delalloc_start = page_start;
 	u64 delalloc_end = page_end;
 	u64 delalloc_to_write = 0;
 	int ret = 0;
 
+	init_locked_delalloc_list(&locked_list);
 	while (delalloc_start < page_end) {
 		delalloc_end = page_end;
 		if (!find_lock_delalloc_range(&inode->vfs_inode, page,
@@ -1240,14 +1339,47 @@  static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 			delalloc_start = delalloc_end + 1;
 			continue;
 		}
-
-		ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
-					       delalloc_end, wbc);
-		if (ret < 0)
+		ret = add_locked_delalloc_range(fs_info, &locked_list,
+				delalloc_start, delalloc_end + 1 - delalloc_start);
+		if (ret < 0) {
+			unlock_locked_delalloc_list(inode, page, &locked_list);
+			release_locked_delalloc_list(&locked_list);
 			return ret;
+		}
 
 		delalloc_start = delalloc_end + 1;
 	}
+	list_for_each_entry(entry, &locked_list.head, list) {
+		delalloc_end = entry->delalloc_start + entry->delalloc_len - 1;
+
+		/*
+		 * Hit error in the previous run, cleanup the locked
+		 * extents/pages.
+		 */
+		if (ret < 0) {
+			unlock_one_locked_delalloc_range(inode, page,
+					entry->delalloc_start, entry->delalloc_len);
+			continue;
+		}
+		ret = btrfs_run_delalloc_range(inode, page, entry->delalloc_start,
+					       delalloc_end, wbc);
+	}
+	if (locked_list.last_delalloc_len) {
+		delalloc_end = locked_list.last_delalloc_start +
+			       locked_list.last_delalloc_len - 1;
+
+		if (ret < 0)
+			unlock_one_locked_delalloc_range(inode, page,
+					locked_list.last_delalloc_start,
+					locked_list.last_delalloc_len);
+		else
+			ret = btrfs_run_delalloc_range(inode, page,
+					locked_list.last_delalloc_start,
+					delalloc_end, wbc);
+	}
+	release_locked_delalloc_list(&locked_list);
+	if (ret < 0)
+		return ret;
 
 	/*
 	 * delalloc_end is already one less than the total length, so