Message ID | 04478f1919b69f470f1c56bae80512491c97a8b1.1729277554.git.boris@bur.io (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | btrfs: hold merge refcount during extent_map merge | expand |
在 2024/10/19 06:17, Boris Burkov 写道: > In debugging some corrupt squashfs files, we observed symptoms of > corrupt page cache pages but correct on-disk contents. Further > investigation revealed that the exact symptom was a correct page > followed by an incorrect, duplicate, page. This got us thinking about > extent maps. > > commit ac05ca913e9f ("Btrfs: fix race between using extent maps and merging them") > enforces a reference count on the primary `em` extent_map being merged, > as that one gets modified. Indeed, at that time @merge is read-only then removed from the rb tree. No modification to the @merge members. > > However, since, > commit 3d2ac9922465 ("btrfs: introduce new members for extent_map") > both 'em' and 'merge' get modified, which introduced the same race and > need for refcounting for 'merge'. But I changed that, to make both @merge and @em to be the same, initially I thought it's safer but it's the completely the opposite. > > We were able to reproduce this by looping the affected squashfs workload > in parallel on a bunch of separate btrfs-es while also dropping caches. > > The fix is two-fold: > - check the refs > 2 criterion for both 'em' and 'merge' > - actually refcount 'merge' which we currently grab without a refcount > via rb_prev or rb_next. > > The other option we considered, but did not implement, was to stop > modifying 'merge' as it gets deleted immediately after. However, it was > less clear what effect that would have on the racing thread holding the > reference, so we went with this fix. > > Fixes: 3d2ac9922465 ("btrfs: introduce new members for extent_map") > Signed-off-by: Omar Sandoval <osandov@fb.com> > Signed-off-by: Boris Burkov <boris@bur.io> Reviewed-by: Qu Wenruo <wqu@suse.com> Thanks a lot for pinning down the bug and fix, Qu > --- > fs/btrfs/extent_map.c | 92 +++++++++++++++++++++++++++---------------- > 1 file changed, 57 insertions(+), 35 deletions(-) > > diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c > index d58d6fc40da1..108cd7573f61 100644 > --- a/fs/btrfs/extent_map.c > +++ b/fs/btrfs/extent_map.c > @@ -215,6 +215,16 @@ static bool can_merge_extent_map(const struct extent_map *em) > > if (em->flags & EXTENT_FLAG_LOGGING) > return false; > + /* > + * We can't modify an extent map that is in the tree and that is being > + * used by another task, as it can cause that other task to see it in > + * inconsistent state during the merging. We always have 1 reference for > + * the tree and 1 for this task (which is unpinning the extent map or > + * clearing the logging flag), so anything > 2 means it's being used by > + * other tasks too. > + */ > + if (refcount_read(&em->refs) > 2) > + return false; > > /* > * We don't want to merge stuff that hasn't been written to the log yet > @@ -333,57 +343,69 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map > } > } > > -static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) > +static int try_merge_one_map(struct btrfs_inode *inode, struct extent_map *em, > + struct extent_map *merge, bool prev_merge) > { > struct btrfs_fs_info *fs_info = inode->root->fs_info; > - struct extent_map *merge = NULL; > - struct rb_node *rb; > + struct extent_map *prev; > + struct extent_map *next; > > + if (prev_merge) { > + prev = merge; > + next = em; > + } else { > + prev = em; > + next = merge; > + } > /* > - * We can't modify an extent map that is in the tree and that is being > - * used by another task, as it can cause that other task to see it in > - * inconsistent state during the merging. We always have 1 reference for > - * the tree and 1 for this task (which is unpinning the extent map or > - * clearing the logging flag), so anything > 2 means it's being used by > - * other tasks too. > - */ > - if (refcount_read(&em->refs) > 2) > - return; > + * Take a refcount for merge so we can maintain a proper > + * use count across threads for checking mergeability. > + */ > + refcount_inc(&merge->refs); > + if (!can_merge_extent_map(merge)) > + goto no_merge; > + > + if (!mergeable_maps(prev, next)) > + goto no_merge; > + > + if (prev_merge) > + em->start = merge->start; > + em->len += merge->len; > + em->generation = max(em->generation, merge->generation); > + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) > + merge_ondisk_extents(prev, next); > + em->flags |= EXTENT_FLAG_MERGED; > + > + validate_extent_map(fs_info, em); > + refcount_dec(&merge->refs); > + remove_em(inode, merge); > + free_extent_map(merge); > + return 0; > +no_merge: > + refcount_dec(&merge->refs); > + return 1; > +} > + > +static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) > +{ > + struct extent_map *merge = NULL; > + struct rb_node *rb; > > if (!can_merge_extent_map(em)) > return; > > if (em->start != 0) { > rb = rb_prev(&em->rb_node); > - if (rb) > + if (rb) { > merge = rb_entry(rb, struct extent_map, rb_node); > - if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) { > - em->start = merge->start; > - em->len += merge->len; > - em->generation = max(em->generation, merge->generation); > - > - if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) > - merge_ondisk_extents(merge, em); > - em->flags |= EXTENT_FLAG_MERGED; > - > - validate_extent_map(fs_info, em); > - remove_em(inode, merge); > - free_extent_map(merge); > + try_merge_one_map(inode, em, merge, true); > } > } > > rb = rb_next(&em->rb_node); > - if (rb) > + if (rb) { > merge = rb_entry(rb, struct extent_map, rb_node); > - if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { > - em->len += merge->len; > - if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) > - merge_ondisk_extents(em, merge); > - validate_extent_map(fs_info, em); > - em->generation = max(em->generation, merge->generation); > - em->flags |= EXTENT_FLAG_MERGED; > - remove_em(inode, merge); > - free_extent_map(merge); > + try_merge_one_map(inode, em, merge, false); > } > } >
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index d58d6fc40da1..108cd7573f61 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -215,6 +215,16 @@ static bool can_merge_extent_map(const struct extent_map *em) if (em->flags & EXTENT_FLAG_LOGGING) return false; + /* + * We can't modify an extent map that is in the tree and that is being + * used by another task, as it can cause that other task to see it in + * inconsistent state during the merging. We always have 1 reference for + * the tree and 1 for this task (which is unpinning the extent map or + * clearing the logging flag), so anything > 2 means it's being used by + * other tasks too. + */ + if (refcount_read(&em->refs) > 2) + return false; /* * We don't want to merge stuff that hasn't been written to the log yet @@ -333,57 +343,69 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map } } -static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) +static int try_merge_one_map(struct btrfs_inode *inode, struct extent_map *em, + struct extent_map *merge, bool prev_merge) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_map *merge = NULL; - struct rb_node *rb; + struct extent_map *prev; + struct extent_map *next; + if (prev_merge) { + prev = merge; + next = em; + } else { + prev = em; + next = merge; + } /* - * We can't modify an extent map that is in the tree and that is being - * used by another task, as it can cause that other task to see it in - * inconsistent state during the merging. We always have 1 reference for - * the tree and 1 for this task (which is unpinning the extent map or - * clearing the logging flag), so anything > 2 means it's being used by - * other tasks too. - */ - if (refcount_read(&em->refs) > 2) - return; + * Take a refcount for merge so we can maintain a proper + * use count across threads for checking mergeability. + */ + refcount_inc(&merge->refs); + if (!can_merge_extent_map(merge)) + goto no_merge; + + if (!mergeable_maps(prev, next)) + goto no_merge; + + if (prev_merge) + em->start = merge->start; + em->len += merge->len; + em->generation = max(em->generation, merge->generation); + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) + merge_ondisk_extents(prev, next); + em->flags |= EXTENT_FLAG_MERGED; + + validate_extent_map(fs_info, em); + refcount_dec(&merge->refs); + remove_em(inode, merge); + free_extent_map(merge); + return 0; +no_merge: + refcount_dec(&merge->refs); + return 1; +} + +static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) +{ + struct extent_map *merge = NULL; + struct rb_node *rb; if (!can_merge_extent_map(em)) return; if (em->start != 0) { rb = rb_prev(&em->rb_node); - if (rb) + if (rb) { merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) { - em->start = merge->start; - em->len += merge->len; - em->generation = max(em->generation, merge->generation); - - if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) - merge_ondisk_extents(merge, em); - em->flags |= EXTENT_FLAG_MERGED; - - validate_extent_map(fs_info, em); - remove_em(inode, merge); - free_extent_map(merge); + try_merge_one_map(inode, em, merge, true); } } rb = rb_next(&em->rb_node); - if (rb) + if (rb) { merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { - em->len += merge->len; - if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) - merge_ondisk_extents(em, merge); - validate_extent_map(fs_info, em); - em->generation = max(em->generation, merge->generation); - em->flags |= EXTENT_FLAG_MERGED; - remove_em(inode, merge); - free_extent_map(merge); + try_merge_one_map(inode, em, merge, false); } }