Message ID | 1301669700-11078-1-git-send-email-josef@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Apr 1, 2011 at 9:55 AM, Josef Bacik <josef@redhat.com> wrote: > I noticed a huge problem with the free space cache that was presenting as an > early ENOSPC. Turns out when writing the free space cache out I forgot to take > into account pinned extents and more importantly clusters. This would result in > us leaking free space every time we unmounted the filesystem and remounted it. I > fix this by making sure to check and see if the current block group has a > cluster and writing out any entries that are in the cluster to the cache, as > well as writing any pinned extents we currently have to the cache since those > will be available for us to use the next time the fs mounts. This patch also > adds a check to the end of load_free_space_cache to make sure we got the right > amount of free space cache, and if not make sure to clear the cache and re-cache > the old-fashioned way. Thanks, > > Signed-off-by: Josef Bacik <josef@redhat.com> > --- > V1->V2: > - use block_group->free_space instead of > btrfs_block_group_free_space(block_group) > > fs/btrfs/free-space-cache.c | 82 ++++++++++++++++++++++++++++++++++++++++-- > 1 files changed, 78 insertions(+), 4 deletions(-) > > diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c > index f03ef97..74bc432 100644 > --- a/fs/btrfs/free-space-cache.c > +++ b/fs/btrfs/free-space-cache.c > @@ -24,6 +24,7 @@ > #include "free-space-cache.h" > #include "transaction.h" > #include "disk-io.h" > +#include "extent_io.h" > > #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) > #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) > @@ -222,6 +223,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, > u64 num_entries; > u64 num_bitmaps; > u64 generation; > + u64 used = btrfs_block_group_used(&block_group->item); > u32 cur_crc = ~(u32)0; > pgoff_t index = 0; > unsigned long first_page_offset; > @@ -467,6 +469,17 @@ next: > index++; > } > > + spin_lock(&block_group->tree_lock); > + if (block_group->free_space != 
(block_group->key.offset - used - > + block_group->bytes_super)) { > + spin_unlock(&block_group->tree_lock); > + printk(KERN_ERR "block group %llu has an wrong amount of free " > + "space\n", block_group->key.objectid); > + ret = 0; > + goto free_cache; > + } > + spin_unlock(&block_group->tree_lock); > + > ret = 1; > out: > kfree(checksums); > @@ -495,8 +508,11 @@ int btrfs_write_out_cache(struct btrfs_root *root, > struct list_head *pos, *n; > struct page *page; > struct extent_state *cached_state = NULL; > + struct btrfs_free_cluster *cluster = NULL; > + struct extent_io_tree *unpin = NULL; > struct list_head bitmap_list; > struct btrfs_key key; > + u64 start, end, len; > u64 bytes = 0; > u32 *crc, *checksums; > pgoff_t index = 0, last_index = 0; > @@ -505,6 +521,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, > int entries = 0; > int bitmaps = 0; > int ret = 0; > + bool next_page = false; > > root = root->fs_info->tree_root; > > @@ -551,6 +568,18 @@ int btrfs_write_out_cache(struct btrfs_root *root, > */ > first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); > > + /* Get the cluster for this block_group if it exists */ > + if (!list_empty(&block_group->cluster_list)) > + cluster = list_entry(block_group->cluster_list.next, > + struct btrfs_free_cluster, > + block_group_list); > + > + /* > + * We shouldn't have switched the pinned extents yet so this is the > + * right one > + */ > + unpin = root->fs_info->pinned_extents; > + > /* > * Lock all pages first so we can lock the extent safely. > * > @@ -580,6 +609,12 @@ int btrfs_write_out_cache(struct btrfs_root *root, > lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, > 0, &cached_state, GFP_NOFS); > > + /* > + * When searching for pinned extents, we need to start at our start > + * offset. 
> + */ > + start = block_group->key.objectid; > + > /* Write out the extent entries */ > do { > struct btrfs_free_space_entry *entry; > @@ -587,6 +622,8 @@ int btrfs_write_out_cache(struct btrfs_root *root, > unsigned long offset = 0; > unsigned long start_offset = 0; > > + next_page = false; > + > if (index == 0) { > start_offset = first_page_offset; > offset = start_offset; > @@ -598,7 +635,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, > entry = addr + start_offset; > > memset(addr, 0, PAGE_CACHE_SIZE); > - while (1) { > + while (node && !next_page) { > struct btrfs_free_space *e; > > e = rb_entry(node, struct btrfs_free_space, offset_index); > @@ -614,12 +651,49 @@ int btrfs_write_out_cache(struct btrfs_root *root, > entry->type = BTRFS_FREE_SPACE_EXTENT; > } > node = rb_next(node); > - if (!node) > - break; > + if (!node && cluster) { > + node = rb_first(&cluster->root); > + cluster = NULL; > + } > offset += sizeof(struct btrfs_free_space_entry); > if (offset + sizeof(struct btrfs_free_space_entry) >= > PAGE_CACHE_SIZE) > + next_page = true; > + entry++; > + } > + > + /* > + * We want to add any pinned extents to our free space cache > + * so we don't leak the space > + */ > + while (!next_page && (start < block_group->key.objectid + > + block_group->key.offset)) { > + ret = find_first_extent_bit(unpin, start, &start, &end, > + EXTENT_DIRTY); > + if (ret) { > + ret = 0; > break; > + } > + > + /* This pinned extent is out of our range */ > + if (start >= block_group->key.objectid + > + block_group->key.offset) > + break; > + > + len = block_group->key.objectid + > + block_group->key.offset - start; > + len = min(len, end + 1 - start); > + > + entries++; > + entry->offset = cpu_to_le64(start); > + entry->bytes = cpu_to_le64(len); > + entry->type = BTRFS_FREE_SPACE_EXTENT; > + > + start = end + 1; > + offset += sizeof(struct btrfs_free_space_entry); > + if (offset + sizeof(struct btrfs_free_space_entry) >= > + PAGE_CACHE_SIZE) > + next_page = true; > 
entry++; > } > *crc = ~(u32)0; > @@ -650,7 +724,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, > page_cache_release(page); > > index++; > - } while (node); > + } while (node || next_page); > > /* Write out the bitmaps */ > list_for_each_safe(pos, n, &bitmap_list) { > -- > 1.7.2.3 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > I've been testing this patch on a 2.6.38.2 kernel with the patches from Chris' for-linus-unmerged branch. I've received the following oops that I think is in the area affected by this patch, so I'm posting here. At the time of the oops, I was downloading a file to a separate ext3 partition, so I'm not sure what precipitated this oops. I have btrfs on my root partition; however, space_cache is not enabled on my root partition. I have a build partition (with space_cache enabled, and I mounted with clear_cache) that had just successfully finished building and installing php, and was in the process of preparing to start building another package when the oops was received. The build routine should not have been accessing the build partition at that time since it was in the process of saving the source package to another partition. Perhaps it was a delayed sync from the build that had just finished. 
[261901.536430] BUG: unable to handle kernel NULL pointer dereference at (null) [261901.536438] IP: [<c10189fe>] kmap+0xb/0x39 [261901.536449] *pde = 00000000 [261901.536453] Oops: 0000 [#1] [261901.536456] last sysfs file: /sys/devices/virtual/dmi/id/bios_vendor [261901.536460] Modules linked in: nvidia(P) sl811_hcd [261901.536465] [261901.536470] Pid: 3622, comm: btrfs-transacti Tainted: P 2.6.38.2-sabayon #2 /MS-6570 [261901.536476] EIP: 0060:[<c10189fe>] EFLAGS: 00010246 CPU: 0 [261901.536480] EIP is at kmap+0xb/0x39 [261901.536483] EAX: 00000000 EBX: 00000000 ECX: 00000000 EDX: 00000000 [261901.536486] ESI: 00000000 EDI: 00001000 EBP: f00f5e08 ESP: f00f5e04 [261901.536490] DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068 [261901.536493] Process btrfs-transacti (pid: 3622, ti=f00f4000 task=f2057780 task.ti=f00f4000) [261901.536496] Stack: [261901.536498] f3a9fc00 f00f5ed0 c123ce1e ff87f000 c1082936 00000000 00000022 f3a9fb00 [261901.536504] f697206c f3a9fc00 00000048 0000000f 00000010 00000001 00000000 ff87f000 [261901.536511] f3a9fb1c 00000040 f6719000 eead8780 00000efc 00000000 f24e0800 00010000 [261901.536517] Call Trace: [261901.536527] [<c123ce1e>] btrfs_write_out_cache+0x257/0x89f [261901.536536] [<c1082936>] ? page_address+0x20/0x92 [261901.536544] [<c1204fe8>] ? btrfs_run_delayed_refs+0x7c/0x102 [261901.536549] [<c12053f9>] btrfs_write_dirty_block_groups+0x38b/0x41a [261901.536554] [<c1210090>] commit_cowonly_roots+0xa9/0x178 [261901.536559] [<c101ceaf>] ? need_resched+0x14/0x1e [261901.536564] [<c12110d6>] btrfs_commit_transaction+0x2e9/0x5a1 [261901.536572] [<c10359cc>] ? autoremove_wake_function+0x0/0x2f [261901.536578] [<c120c58e>] transaction_kthread+0x10e/0x19d [261901.536582] [<c101d5b6>] ? complete+0x2d/0x36 [261901.536587] [<c120c480>] ? transaction_kthread+0x0/0x19d [261901.536591] [<c1035756>] kthread+0x61/0x66 [261901.536596] [<c10356f5>] ? 
kthread+0x0/0x66 [261901.536601] [<c1002c76>] kernel_thread_helper+0x6/0x10 [261901.536603] Code: e8 1e 69 c0 f0 02 00 00 05 a0 90 7b c1 e8 5f fe ff ff 85 c0 74 07 89 d8 e8 b8 9f 06 00 5b 5d c3 55 89 e5 53 89 c3 e8 bd c2 51 00 <8b> 03 c1 e8 1e 69 c0 f0 02 00 00 05 a0 90 7b c1 e8 31 fe ff ff [261901.536635] EIP: [<c10189fe>] kmap+0xb/0x39 SS:ESP 0068:f00f5e04 [261901.536640] CR2: 0000000000000000 [261901.536644] ---[ end trace eec3a32046e1c7b3 ]--- -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f03ef97..74bc432 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -24,6 +24,7 @@ #include "free-space-cache.h" #include "transaction.h" #include "disk-io.h" +#include "extent_io.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) @@ -222,6 +223,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, u64 num_entries; u64 num_bitmaps; u64 generation; + u64 used = btrfs_block_group_used(&block_group->item); u32 cur_crc = ~(u32)0; pgoff_t index = 0; unsigned long first_page_offset; @@ -467,6 +469,17 @@ next: index++; } + spin_lock(&block_group->tree_lock); + if (block_group->free_space != (block_group->key.offset - used - + block_group->bytes_super)) { + spin_unlock(&block_group->tree_lock); + printk(KERN_ERR "block group %llu has an wrong amount of free " + "space\n", block_group->key.objectid); + ret = 0; + goto free_cache; + } + spin_unlock(&block_group->tree_lock); + ret = 1; out: kfree(checksums); @@ -495,8 +508,11 @@ int btrfs_write_out_cache(struct btrfs_root *root, struct list_head *pos, *n; struct page *page; struct extent_state *cached_state = NULL; + struct btrfs_free_cluster *cluster = NULL; + struct extent_io_tree *unpin = NULL; struct list_head bitmap_list; struct btrfs_key key; + u64 start, end, len; u64 bytes = 0; u32 *crc, *checksums; pgoff_t index = 0, last_index = 0; @@ -505,6 +521,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, int entries = 0; int bitmaps = 0; int ret = 0; + bool next_page = false; root = root->fs_info->tree_root; @@ -551,6 +568,18 @@ int btrfs_write_out_cache(struct btrfs_root *root, */ first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); + /* Get the cluster for this block_group if it exists */ + if (!list_empty(&block_group->cluster_list)) + cluster = list_entry(block_group->cluster_list.next, + struct btrfs_free_cluster, + block_group_list); + + /* + * We 
shouldn't have switched the pinned extents yet so this is the + * right one + */ + unpin = root->fs_info->pinned_extents; + /* * Lock all pages first so we can lock the extent safely. * @@ -580,6 +609,12 @@ int btrfs_write_out_cache(struct btrfs_root *root, lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state, GFP_NOFS); + /* + * When searching for pinned extents, we need to start at our start + * offset. + */ + start = block_group->key.objectid; + /* Write out the extent entries */ do { struct btrfs_free_space_entry *entry; @@ -587,6 +622,8 @@ int btrfs_write_out_cache(struct btrfs_root *root, unsigned long offset = 0; unsigned long start_offset = 0; + next_page = false; + if (index == 0) { start_offset = first_page_offset; offset = start_offset; @@ -598,7 +635,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, entry = addr + start_offset; memset(addr, 0, PAGE_CACHE_SIZE); - while (1) { + while (node && !next_page) { struct btrfs_free_space *e; e = rb_entry(node, struct btrfs_free_space, offset_index); @@ -614,12 +651,49 @@ int btrfs_write_out_cache(struct btrfs_root *root, entry->type = BTRFS_FREE_SPACE_EXTENT; } node = rb_next(node); - if (!node) - break; + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster = NULL; + } offset += sizeof(struct btrfs_free_space_entry); if (offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) + next_page = true; + entry++; + } + + /* + * We want to add any pinned extents to our free space cache + * so we don't leak the space + */ + while (!next_page && (start < block_group->key.objectid + + block_group->key.offset)) { + ret = find_first_extent_bit(unpin, start, &start, &end, + EXTENT_DIRTY); + if (ret) { + ret = 0; break; + } + + /* This pinned extent is out of our range */ + if (start >= block_group->key.objectid + + block_group->key.offset) + break; + + len = block_group->key.objectid + + block_group->key.offset - start; + len = min(len, end + 1 - start); + + 
entries++; + entry->offset = cpu_to_le64(start); + entry->bytes = cpu_to_le64(len); + entry->type = BTRFS_FREE_SPACE_EXTENT; + + start = end + 1; + offset += sizeof(struct btrfs_free_space_entry); + if (offset + sizeof(struct btrfs_free_space_entry) >= + PAGE_CACHE_SIZE) + next_page = true; entry++; } *crc = ~(u32)0; @@ -650,7 +724,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, page_cache_release(page); index++; - } while (node); + } while (node || next_page); /* Write out the bitmaps */ list_for_each_safe(pos, n, &bitmap_list) {