Message ID | 20250122122459.1148668-1-maharmstone@fb.com (mailing list archive)
---|---
State | New
Series | btrfs: use atomic64_t for free_objectid
On Wed, Jan 22, 2025 at 12:25 PM Mark Harmstone <maharmstone@fb.com> wrote: > > Currently btrfs_get_free_objectid() uses a mutex to protect > free_objectid; I'm guessing this was because of the inode cache that we > used to have. The inode cache is no more, so simplify things by > replacing it with an atomic. > > There's no issues with ordering: free_objectid gets set to an initial > value, then calls to btrfs_get_free_objectid() return a monotonically > increasing value. > > This change means that btrfs_get_free_objectid() will no longer > potentially sleep, which was a blocker for adding a non-blocking mode > for inode and subvol creation. > > Signed-off-by: Mark Harmstone <maharmstone@fb.com> > --- > fs/btrfs/ctree.h | 4 +--- > fs/btrfs/disk-io.c | 43 ++++++++++++++++++------------------------- > fs/btrfs/qgroup.c | 11 ++++++----- > fs/btrfs/tree-log.c | 3 --- > 4 files changed, 25 insertions(+), 36 deletions(-) > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h > index 1096a80a64e7..04175698525b 100644 > --- a/fs/btrfs/ctree.h > +++ b/fs/btrfs/ctree.h > @@ -179,8 +179,6 @@ struct btrfs_root { > struct btrfs_fs_info *fs_info; > struct extent_io_tree dirty_log_pages; > > - struct mutex objectid_mutex; > - > spinlock_t accounting_lock; > struct btrfs_block_rsv *block_rsv; > > @@ -214,7 +212,7 @@ struct btrfs_root { > > u64 last_trans; > > - u64 free_objectid; > + atomic64_t free_objectid; > > struct btrfs_key defrag_progress; > struct btrfs_key defrag_max; > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > index f09db62e61a1..0543d9c3f8c0 100644 > --- a/fs/btrfs/disk-io.c > +++ b/fs/btrfs/disk-io.c > @@ -659,7 +659,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, > RB_CLEAR_NODE(&root->rb_node); > > btrfs_set_root_last_trans(root, 0); > - root->free_objectid = 0; > + atomic64_set(&root->free_objectid, 0); > root->nr_delalloc_inodes = 0; > root->nr_ordered_extents = 0; > xa_init(&root->inodes); > @@ -678,7 +678,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, > spin_lock_init(&root->ordered_extent_lock); > spin_lock_init(&root->accounting_lock); > spin_lock_init(&root->qgroup_meta_rsv_lock); > - mutex_init(&root->objectid_mutex); > mutex_init(&root->log_mutex); > mutex_init(&root->ordered_extent_mutex); > mutex_init(&root->delalloc_mutex); > @@ -1133,16 +1132,12 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) > } > } > > - mutex_lock(&root->objectid_mutex); > ret = btrfs_init_root_free_objectid(root); > - if (ret) { > - mutex_unlock(&root->objectid_mutex); > + if (ret) > goto fail; > - } > > - ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); > - > - mutex_unlock(&root->objectid_mutex); > + ASSERT((u64)atomic64_read(&root->free_objectid) <= > + BTRFS_LAST_FREE_OBJECTID); > > return 0; > fail: > @@ -2730,8 +2725,9 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) > } > > /* > - * No need to hold btrfs_root::objectid_mutex since the fs > - * hasn't been fully initialised and we are the only user > + * No need to worry about atomic ordering of btrfs_root::free_objectid > + * since the fs hasn't been fully initialised and we are the > + * only user > */ > ret = btrfs_init_root_free_objectid(tree_root); > if (ret < 0) { > @@ -2739,7 +2735,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) > continue; > } > > - ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); > + ASSERT((u64)atomic64_read(&tree_root->free_objectid) <= > + 
BTRFS_LAST_FREE_OBJECTID); > > ret = btrfs_read_roots(fs_info); > if (ret < 0) { > @@ -4931,10 +4928,11 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) > slot = path->slots[0] - 1; > l = path->nodes[0]; > btrfs_item_key_to_cpu(l, &found_key, slot); > - root->free_objectid = max_t(u64, found_key.objectid + 1, > - BTRFS_FIRST_FREE_OBJECTID); > + atomic64_set(&root->free_objectid, > + max_t(u64, found_key.objectid + 1, > + BTRFS_FIRST_FREE_OBJECTID)); > } else { > - root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; > + atomic64_set(&root->free_objectid, BTRFS_FIRST_FREE_OBJECTID); > } > ret = 0; > error: > @@ -4944,20 +4942,15 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) > > int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) > { > - int ret; > - mutex_lock(&root->objectid_mutex); > + u64 val = atomic64_inc_return(&root->free_objectid) - 1; > > - if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { > + if (unlikely(val >= BTRFS_LAST_FREE_OBJECTID)) { > btrfs_warn(root->fs_info, > "the objectid of root %llu reaches its highest value", > btrfs_root_id(root)); > - ret = -ENOSPC; > - goto out; > + return -ENOSPC; > } > > - *objectid = root->free_objectid++; > - ret = 0; So this gives different semantics now. Before we increment free_objectid only if it's less than BTRFS_LAST_FREE_OBJECTID, so once we reach that value we can't assign more object IDs and must return -ENOSPC. But now we always increment and then do the check, so after some calls to btrfs_get_free_objectid() we wrap around the counter due to overflow and eventually allow reusing already assigned object IDs. I'm afraid the lock is still needed because of that. To make it more lightweight maybe switch the mutex to a spinlock. Thanks. > -out: > - mutex_unlock(&root->objectid_mutex); > - return ret; > + *objectid = val; > + return 0; > } > diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c > index aaf16019d829..b41e06d5d2fb 100644 > --- a/fs/btrfs/qgroup.c > +++ b/fs/btrfs/qgroup.c > @@ -472,18 +472,19 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) > * > * Ensure that we skip any such subvol ids. > * > - * We don't need to lock because this is only called > - * during mount before we start doing things like creating > - * subvolumes. > + * We don't need to worry about atomic ordering because > + * this is only called during mount before we start > + * doing things like creating subvolumes. > */ > if (is_fstree(qgroup->qgroupid) && > - qgroup->qgroupid > tree_root->free_objectid) > + qgroup->qgroupid > (u64)atomic64_read(&tree_root->free_objectid)) > /* > * Don't need to check against BTRFS_LAST_FREE_OBJECTID, > * as it will get checked on the next call to > * btrfs_get_free_objectid. > */ > - tree_root->free_objectid = qgroup->qgroupid + 1; > + atomic64_set(&tree_root->free_objectid, > + qgroup->qgroupid + 1); > } > ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); > if (ret < 0) > diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c > index 955d1677e865..9d19528fee17 100644 > --- a/fs/btrfs/tree-log.c > +++ b/fs/btrfs/tree-log.c > @@ -7325,9 +7325,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) > * We have just replayed everything, and the highest > * objectid of fs roots probably has changed in case > * some inode_item's got replayed. > - * > - * root->objectid_mutex is not acquired as log replay > - * could only happen during mount. > */ > ret = btrfs_init_root_free_objectid(root); > if (ret) > -- > 2.45.2 > >
On 22/1/25 12:39, Filipe Manana wrote: > On Wed, Jan 22, 2025 at 12:25 PM Mark Harmstone <maharmstone@fb.com> wrote: >> @@ -4944,20 +4942,15 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) >> >> int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) >> { >> - int ret; >> - mutex_lock(&root->objectid_mutex); >> + u64 val = atomic64_inc_return(&root->free_objectid) - 1; >> >> - if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { >> + if (unlikely(val >= BTRFS_LAST_FREE_OBJECTID)) { >> btrfs_warn(root->fs_info, >> "the objectid of root %llu reaches its highest value", >> btrfs_root_id(root)); >> - ret = -ENOSPC; >> - goto out; >> + return -ENOSPC; >> } >> >> - *objectid = root->free_objectid++; >> - ret = 0; > > So this gives different semantics now. > > Before we increment free_objectid only if it's less than > BTRFS_LAST_FREE_OBJECTID, so once we reach that value we can't assign > more object IDs and must return -ENOSPC. > > But now we always increment and then do the check, so after some calls > to btrfs_get_free_objectid() we wrap around the counter due to > overflow and eventually allow reusing already assigned object IDs. > > I'm afraid the lock is still needed because of that. To make it more > lightweight maybe switch the mutex to a spinlock. > > Thanks. Thanks Filipe. Do we even need the check, really? Even a denial of service attack wouldn't be able to practically call the function ~2^64 times. And there's no way to create an inode with an arbitrary number, short of manually hex-editing the disk. Mark
On Wed, Jan 22, 2025 at 12:59 PM Mark Harmstone <maharmstone@meta.com> wrote: > > On 22/1/25 12:39, Filipe Manana wrote: > > On Wed, Jan 22, 2025 at 12:25 PM Mark Harmstone <maharmstone@fb.com> wrote: > >> @@ -4944,20 +4942,15 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) > >> > >> int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) > >> { > >> - int ret; > >> - mutex_lock(&root->objectid_mutex); > >> + u64 val = atomic64_inc_return(&root->free_objectid) - 1; > >> > >> - if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { > >> + if (unlikely(val >= BTRFS_LAST_FREE_OBJECTID)) { > >> btrfs_warn(root->fs_info, > >> "the objectid of root %llu reaches its highest value", > >> btrfs_root_id(root)); > >> - ret = -ENOSPC; > >> - goto out; > >> + return -ENOSPC; > >> } > >> > >> - *objectid = root->free_objectid++; > >> - ret = 0; > > > > So this gives different semantics now. > > > > Before we increment free_objectid only if it's less than > > BTRFS_LAST_FREE_OBJECTID, so once we reach that value we can't assign > > more object IDs and must return -ENOSPC. > > > > But now we always increment and then do the check, so after some calls > > to btrfs_get_free_objectid() we wrap around the counter due to > > overflow and eventually allow reusing already assigned object IDs. > > > > I'm afraid the lock is still needed because of that. To make it more > > lightweight maybe switch the mutex to a spinlock. > > > > Thanks. > > Thanks Filipe. Do we even need the check, really? Even a denial of > service attack wouldn't be able to practically call the function ~2^64 > times. And there's no way to create an inode with an arbitrary number, > short of manually hex-editing the disk. Well we do such limit checks everywhere, why would we ignore them only here? Even if they are very unlikely to be hit in practice, if they happen, we can get into serious trouble. So I don't think it's wise to ignore the limit. Thanks. > > Mark
On Wed, 22 Jan 2025 at 13:40, Filipe Manana <fdmanana@kernel.org> wrote: > > On Wed, Jan 22, 2025 at 12:25 PM Mark Harmstone <maharmstone@fb.com> wrote: > > > > Currently btrfs_get_free_objectid() uses a mutex to protect > > free_objectid; I'm guessing this was because of the inode cache that we > > used to have. The inode cache is no more, so simplify things by > > replacing it with an atomic. > > > > There's no issues with ordering: free_objectid gets set to an initial > > value, then calls to btrfs_get_free_objectid() return a monotonically > > increasing value. > > > > This change means that btrfs_get_free_objectid() will no longer > > potentially sleep, which was a blocker for adding a non-blocking mode > > for inode and subvol creation. > > > > Signed-off-by: Mark Harmstone <maharmstone@fb.com> > > --- > > fs/btrfs/ctree.h | 4 +--- > > fs/btrfs/disk-io.c | 43 ++++++++++++++++++------------------------- > > fs/btrfs/qgroup.c | 11 ++++++----- > > fs/btrfs/tree-log.c | 3 --- > > 4 files changed, 25 insertions(+), 36 deletions(-) > > > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h > > index 1096a80a64e7..04175698525b 100644 > > --- a/fs/btrfs/ctree.h > > +++ b/fs/btrfs/ctree.h > > @@ -179,8 +179,6 @@ struct btrfs_root { > > struct btrfs_fs_info *fs_info; > > struct extent_io_tree dirty_log_pages; > > > > - struct mutex objectid_mutex; > > - > > spinlock_t accounting_lock; > > struct btrfs_block_rsv *block_rsv; > > > > @@ -214,7 +212,7 @@ struct btrfs_root { > > > > u64 last_trans; > > > > - u64 free_objectid; > > + atomic64_t free_objectid; > > > > struct btrfs_key defrag_progress; > > struct btrfs_key defrag_max; > > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > > index f09db62e61a1..0543d9c3f8c0 100644 > > --- a/fs/btrfs/disk-io.c > > +++ b/fs/btrfs/disk-io.c > > @@ -659,7 +659,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, > > RB_CLEAR_NODE(&root->rb_node); > > > > btrfs_set_root_last_trans(root, 0); > > - root->free_objectid = 0; > > + atomic64_set(&root->free_objectid, 0); > > root->nr_delalloc_inodes = 0; > > root->nr_ordered_extents = 0; > > xa_init(&root->inodes); > > @@ -678,7 +678,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, > > spin_lock_init(&root->ordered_extent_lock); > > spin_lock_init(&root->accounting_lock); > > spin_lock_init(&root->qgroup_meta_rsv_lock); > > - mutex_init(&root->objectid_mutex); > > mutex_init(&root->log_mutex); > > mutex_init(&root->ordered_extent_mutex); > > mutex_init(&root->delalloc_mutex); > > @@ -1133,16 +1132,12 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) > > } > > } > > > > - mutex_lock(&root->objectid_mutex); > > ret = btrfs_init_root_free_objectid(root); > > - if (ret) { > > - mutex_unlock(&root->objectid_mutex); > > + if (ret) > > goto fail; > > - } > > > > - ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); > > - > > - mutex_unlock(&root->objectid_mutex); > > + ASSERT((u64)atomic64_read(&root->free_objectid) <= > > + BTRFS_LAST_FREE_OBJECTID); > > > > return 0; > > fail: > > @@ -2730,8 +2725,9 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) > > } > > > > /* > > - * No need to hold btrfs_root::objectid_mutex since the fs > > - * hasn't been fully initialised and we are the only user > > + * No need to worry about atomic ordering of btrfs_root::free_objectid > > + * since the fs hasn't been fully initialised and we are the > > + * only user > > */ > > ret = 
btrfs_init_root_free_objectid(tree_root); > > if (ret < 0) { > > @@ -2739,7 +2735,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) > > continue; > > } > > > > - ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); > > + ASSERT((u64)atomic64_read(&tree_root->free_objectid) <= > > + BTRFS_LAST_FREE_OBJECTID); > > > > ret = btrfs_read_roots(fs_info); > > if (ret < 0) { > > @@ -4931,10 +4928,11 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) > > slot = path->slots[0] - 1; > > l = path->nodes[0]; > > btrfs_item_key_to_cpu(l, &found_key, slot); > > - root->free_objectid = max_t(u64, found_key.objectid + 1, > > - BTRFS_FIRST_FREE_OBJECTID); > > + atomic64_set(&root->free_objectid, > > + max_t(u64, found_key.objectid + 1, > > + BTRFS_FIRST_FREE_OBJECTID)); > > } else { > > - root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; > > + atomic64_set(&root->free_objectid, BTRFS_FIRST_FREE_OBJECTID); > > } > > ret = 0; > > error: > > @@ -4944,20 +4942,15 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) > > > > int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) > > { > > - int ret; > > - mutex_lock(&root->objectid_mutex); > > + u64 val = atomic64_inc_return(&root->free_objectid) - 1; > > > > - if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { > > + if (unlikely(val >= BTRFS_LAST_FREE_OBJECTID)) { > > btrfs_warn(root->fs_info, > > "the objectid of root %llu reaches its highest value", > > btrfs_root_id(root)); > > - ret = -ENOSPC; > > - goto out; > > + return -ENOSPC; > > } > > > > - *objectid = root->free_objectid++; > > - ret = 0; > > So this gives different semantics now. > > Before we increment free_objectid only if it's less than > BTRFS_LAST_FREE_OBJECTID, so once we reach that value we can't assign > more object IDs and must return -ENOSPC. > > But now we always increment and then do the check, so after some calls > to btrfs_get_free_objectid() we wrap around the counter due to > overflow and eventually allow reusing already assigned object IDs. > > I'm afraid the lock is still needed because of that. To make it more > lightweight maybe switch the mutex to a spinlock. How about this? ``` retry: val = atomic64_read(&root->free_objectid); ....; if (atomic64_cmpxchg(&root->free_objectid, val, val+1) != val) goto retry; *objectid = val; return 0; ``` > Thanks. > > > -out: > > - mutex_unlock(&root->objectid_mutex); > > - return ret; > > + *objectid = val; > > + return 0; > > } > > diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c > > index aaf16019d829..b41e06d5d2fb 100644 > > --- a/fs/btrfs/qgroup.c > > +++ b/fs/btrfs/qgroup.c > > @@ -472,18 +472,19 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) > > * > > * Ensure that we skip any such subvol ids. > > * > > - * We don't need to lock because this is only called > > - * during mount before we start doing things like creating > > - * subvolumes. > > + * We don't need to worry about atomic ordering because > > + * this is only called during mount before we start > > + * doing things like creating subvolumes. > > */ > > if (is_fstree(qgroup->qgroupid) && > > - qgroup->qgroupid > tree_root->free_objectid) > > + qgroup->qgroupid > (u64)atomic64_read(&tree_root->free_objectid)) > > /* > > * Don't need to check against BTRFS_LAST_FREE_OBJECTID, > > * as it will get checked on the next call to > > * btrfs_get_free_objectid. 
> > */ > > - tree_root->free_objectid = qgroup->qgroupid + 1; > > + atomic64_set(&tree_root->free_objectid, > > + qgroup->qgroupid + 1); > > } > > ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); > > if (ret < 0) > > diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c > > index 955d1677e865..9d19528fee17 100644 > > --- a/fs/btrfs/tree-log.c > > +++ b/fs/btrfs/tree-log.c > > @@ -7325,9 +7325,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) > > * We have just replayed everything, and the highest > > * objectid of fs roots probably has changed in case > > * some inode_item's got replayed. > > - * > > - * root->objectid_mutex is not acquired as log replay > > - * could only happen during mount. > > */ > > ret = btrfs_init_root_free_objectid(root); > > if (ret) > > -- > > 2.45.2 > > > > >
On Wed, Jan 22, 2025 at 12:21:28PM +0000, Mark Harmstone wrote: > Currently btrfs_get_free_objectid() uses a mutex to protect > free_objectid; I'm guessing this was because of the inode cache that we > used to have. The inode cache is no more, so simplify things by > replacing it with an atomic. > > There's no issues with ordering: free_objectid gets set to an initial > value, then calls to btrfs_get_free_objectid() return a monotonically > increasing value. > > This change means that btrfs_get_free_objectid() will no longer > potentially sleep, which was a blocker for adding a non-blocking mode > for inode and subvol creation. > > Signed-off-by: Mark Harmstone <maharmstone@fb.com> > --- > fs/btrfs/ctree.h | 4 +--- > fs/btrfs/disk-io.c | 43 ++++++++++++++++++------------------------- > fs/btrfs/qgroup.c | 11 ++++++----- > fs/btrfs/tree-log.c | 3 --- > 4 files changed, 25 insertions(+), 36 deletions(-) > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h > index 1096a80a64e7..04175698525b 100644 > --- a/fs/btrfs/ctree.h > +++ b/fs/btrfs/ctree.h > @@ -179,8 +179,6 @@ struct btrfs_root { > struct btrfs_fs_info *fs_info; > struct extent_io_tree dirty_log_pages; > > - struct mutex objectid_mutex; > - > spinlock_t accounting_lock; > struct btrfs_block_rsv *block_rsv; > > @@ -214,7 +212,7 @@ struct btrfs_root { > > u64 last_trans; > > - u64 free_objectid; > + atomic64_t free_objectid; Besides the other things pointed out, this also changes the type from u64 to s64 or requiring casts so we keep u64 as this is what the on-disk format defines.
On 22/1/25 15:07, David Sterba wrote: > > > On Wed, Jan 22, 2025 at 12:21:28PM +0000, Mark Harmstone wrote: >> Currently btrfs_get_free_objectid() uses a mutex to protect >> free_objectid; I'm guessing this was because of the inode cache that we >> used to have. The inode cache is no more, so simplify things by >> replacing it with an atomic. >> >> There's no issues with ordering: free_objectid gets set to an initial >> value, then calls to btrfs_get_free_objectid() return a monotonically >> increasing value. >> >> This change means that btrfs_get_free_objectid() will no longer >> potentially sleep, which was a blocker for adding a non-blocking mode >> for inode and subvol creation. >> >> Signed-off-by: Mark Harmstone <maharmstone@fb.com> >> --- >> fs/btrfs/ctree.h | 4 +--- >> fs/btrfs/disk-io.c | 43 ++++++++++++++++++------------------------- >> fs/btrfs/qgroup.c | 11 ++++++----- >> fs/btrfs/tree-log.c | 3 --- >> 4 files changed, 25 insertions(+), 36 deletions(-) >> >> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h >> index 1096a80a64e7..04175698525b 100644 >> --- a/fs/btrfs/ctree.h >> +++ b/fs/btrfs/ctree.h >> @@ -179,8 +179,6 @@ struct btrfs_root { >> struct btrfs_fs_info *fs_info; >> struct extent_io_tree dirty_log_pages; >> >> - struct mutex objectid_mutex; >> - >> spinlock_t accounting_lock; >> struct btrfs_block_rsv *block_rsv; >> >> @@ -214,7 +212,7 @@ struct btrfs_root { >> >> u64 last_trans; >> >> - u64 free_objectid; >> + atomic64_t free_objectid; > > Besides the other things pointed out, this also changes the type from > u64 to s64 or requiring casts so we keep u64 as this is what the on-disk > format defines. It does, but there's casts to u64 every time it's read (which, asserts aside, is only in btrfs_get_free_objectid).
On Wed, Jan 22, 2025 at 02:51:10PM +0100, Daniel Vacek wrote: > On Wed, 22 Jan 2025 at 13:40, Filipe Manana <fdmanana@kernel.org> wrote: > > > - if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { > > > + if (unlikely(val >= BTRFS_LAST_FREE_OBJECTID)) { > > > btrfs_warn(root->fs_info, > > > "the objectid of root %llu reaches its highest value", > > > btrfs_root_id(root)); > > > - ret = -ENOSPC; > > > - goto out; > > > + return -ENOSPC; > > > } > > > > > > - *objectid = root->free_objectid++; > > > - ret = 0; > > > > So this gives different semantics now. > > > > Before we increment free_objectid only if it's less than > > BTRFS_LAST_FREE_OBJECTID, so once we reach that value we can't assign > > more object IDs and must return -ENOSPC. > > > > But now we always increment and then do the check, so after some calls > > to btrfs_get_free_objectid() we wrap around the counter due to > > overflow and eventually allow reusing already assigned object IDs. > > > > I'm afraid the lock is still needed because of that. To make it more > > lightweight maybe switch the mutex to a spinlock. > > How about this? > > ``` > retry: val = atomic64_read(&root->free_objectid); > ....; > if (atomic64_cmpxchg(&root->free_objectid, val, val+1) != val) > goto retry; > *objectid = val; > return 0; > ``` Doesn't this waste some ids when there are many concurrent requests?
On Wed, Jan 22, 2025 at 12:39:21PM +0000, Filipe Manana wrote: > But now we always increment and then do the check, so after some calls > to btrfs_get_free_objectid() we wrap around the counter due to > overflow and eventually allow reusing already assigned object IDs. > > I'm afraid the lock is still needed because of that. To make it more > lightweight maybe switch the mutex to a spinlock. For inode number allocations a spinlock should work. For tree ids we may still need the mutex:

    btrfs_init_fs_root
      mutex
      btrfs_init_root_free_objectid
        btrfs_alloc_path
        btrfs_search_slot

The difference is that inode number allocations are per-fs root, while the subvolume ids are exclusively in the tree_root.
On Thu, 23 Jan 2025 at 23:00, David Sterba <dsterba@suse.cz> wrote: > > On Wed, Jan 22, 2025 at 02:51:10PM +0100, Daniel Vacek wrote: > > On Wed, 22 Jan 2025 at 13:40, Filipe Manana <fdmanana@kernel.org> wrote: > > > > - if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { > > > > + if (unlikely(val >= BTRFS_LAST_FREE_OBJECTID)) { > > > > btrfs_warn(root->fs_info, > > > > "the objectid of root %llu reaches its highest value", > > > > btrfs_root_id(root)); > > > > - ret = -ENOSPC; > > > > - goto out; > > > > + return -ENOSPC; > > > > } > > > > > > > > - *objectid = root->free_objectid++; > > > > - ret = 0; > > > > > > So this gives different semantics now. > > > > > > Before we increment free_objectid only if it's less than > > > BTRFS_LAST_FREE_OBJECTID, so once we reach that value we can't assign > > > more object IDs and must return -ENOSPC. > > > > > > But now we always increment and then do the check, so after some calls > > > to btrfs_get_free_objectid() we wrap around the counter due to > > > overflow and eventually allow reusing already assigned object IDs. > > > > > > I'm afraid the lock is still needed because of that. To make it more > > > lightweight maybe switch the mutex to a spinlock. > > > > How about this? > > > > ``` > > retry: val = atomic64_read(&root->free_objectid); > > ....; > > if (atomic64_cmpxchg(&root->free_objectid, val, val+1) != val) > > goto retry; > > *objectid = val; > > return 0; > > ``` > > Doesn't this waste some ids when there are many concurrent requests? Quite the opposite, it's meant to prevent that. That's why I suggested it as the original patch was wasting them and that's what Filipe pointed out. It will only retry precisely when more concurrent requests race here. And thanks to cmpxchg only one of them wins and increments. The others retry in another iteration of the loop. I think this way no lock is needed and the previous behavior is preserved.
On Fri, Jan 24, 2025 at 8:22 AM Daniel Vacek <neelx@suse.com> wrote: > > On Thu, 23 Jan 2025 at 23:00, David Sterba <dsterba@suse.cz> wrote: > > > > On Wed, Jan 22, 2025 at 02:51:10PM +0100, Daniel Vacek wrote: > > > On Wed, 22 Jan 2025 at 13:40, Filipe Manana <fdmanana@kernel.org> wrote: > > > > > - if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { > > > > > + if (unlikely(val >= BTRFS_LAST_FREE_OBJECTID)) { > > > > > btrfs_warn(root->fs_info, > > > > > "the objectid of root %llu reaches its highest value", > > > > > btrfs_root_id(root)); > > > > > - ret = -ENOSPC; > > > > > - goto out; > > > > > + return -ENOSPC; > > > > > } > > > > > > > > > > - *objectid = root->free_objectid++; > > > > > - ret = 0; > > > > > > > > So this gives different semantics now. > > > > > > > > Before we increment free_objectid only if it's less than > > > > BTRFS_LAST_FREE_OBJECTID, so once we reach that value we can't assign > > > > more object IDs and must return -ENOSPC. > > > > > > > > But now we always increment and then do the check, so after some calls > > > > to btrfs_get_free_objectid() we wrap around the counter due to > > > > overflow and eventually allow reusing already assigned object IDs. > > > > > > > > I'm afraid the lock is still needed because of that. To make it more > > > > lightweight maybe switch the mutex to a spinlock. > > > > > > How about this? > > > > > > ``` > > > retry: val = atomic64_read(&root->free_objectid); > > > ....; > > > if (atomic64_cmpxchg(&root->free_objectid, val, val+1) != val) > > > goto retry; > > > *objectid = val; > > > return 0; > > > ``` > > > > Doesn't this waste some ids when there are many concurrent requests? > > Quite the opposite, it's meant to prevent that. That's why I suggested > it as the original patch was wasting them and that's what Filipe > pointed out. Not wasting, but allowing the counter to wrap around and return already in use object IDs, far more serious than wasting. And besides that, the counter wrap-around would allow the return of invalid object IDs, in the range from 0 to BTRFS_FIRST_FREE_OBJECTID (256). > > It will only retry precisely when more concurrent requests race here. > And thanks to cmpxchg only one of them wins and increments. The others > retry in another iteration of the loop. > > I think this way no lock is needed and the previous behavior is preserved. That looks fine to me. But under heavy concurrency, there's the potential to loop too much, so I would at least add a cond_resched() call before doing the goto. With the mutex there's the advantage of not looping and wasting CPU if such high concurrency happens, tasks will be blocked and yield the cpu for other tasks. Any improvements in performance could also be measured easily with fs_mark, which will heavily hit this code path. I would prefer if the patch adds fs_mark numbers (or from any other test) in the changelogs. Thanks.
On Thu, 23 Jan 2025 at 23:11, David Sterba <dsterba@suse.cz> wrote: > > On Wed, Jan 22, 2025 at 12:39:21PM +0000, Filipe Manana wrote: > > But now we always increment and then do the check, so after some calls > > to btrfs_get_free_objectid() we wrap around the counter due to > > overflow and eventually allow reusing already assigned object IDs. > > > > I'm afraid the lock is still needed because of that. To make it more > > lightweight maybe switch the mutex to a spinlock. > > For inode number allocations a spinlock should work. For tree ids we may > still need the mutex: > > btrfs_init_fs_root > mutex > btrfs_init_root_free_objectid > btrfs_alloc_path > btrfs_search_slot > > The difference is that inode number allocations are per-fs root, while > the subvolume ids are exclusively in the tree_root. Can someone else hold a reference to the same given root and perhaps already created new objects while `btrfs_init_fs_root(root, ...)` is called? This sounds counter-intuitive, to re-initialize an already being used root. But that's the only explanation for the mutex here, IIUC. Otherwise it would not be needed.
On Fri, 24 Jan 2025 at 13:26, Filipe Manana <fdmanana@kernel.org> wrote: > > On Fri, Jan 24, 2025 at 8:22 AM Daniel Vacek <neelx@suse.com> wrote: > > > > On Thu, 23 Jan 2025 at 23:00, David Sterba <dsterba@suse.cz> wrote: > > > > > > On Wed, Jan 22, 2025 at 02:51:10PM +0100, Daniel Vacek wrote: > > > > On Wed, 22 Jan 2025 at 13:40, Filipe Manana <fdmanana@kernel.org> wrote: > > > > > > - if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { > > > > > > + if (unlikely(val >= BTRFS_LAST_FREE_OBJECTID)) { > > > > > > btrfs_warn(root->fs_info, > > > > > > "the objectid of root %llu reaches its highest value", > > > > > > btrfs_root_id(root)); > > > > > > - ret = -ENOSPC; > > > > > > - goto out; > > > > > > + return -ENOSPC; > > > > > > } > > > > > > > > > > > > - *objectid = root->free_objectid++; > > > > > > - ret = 0; > > > > > > > > > > So this gives different semantics now. > > > > > > > > > > Before we increment free_objectid only if it's less than > > > > > BTRFS_LAST_FREE_OBJECTID, so once we reach that value we can't assign > > > > > more object IDs and must return -ENOSPC. > > > > > > > > > > But now we always increment and then do the check, so after some calls > > > > > to btrfs_get_free_objectid() we wrap around the counter due to > > > > > overflow and eventually allow reusing already assigned object IDs. > > > > > > > > > > I'm afraid the lock is still needed because of that. To make it more > > > > > lightweight maybe switch the mutex to a spinlock. > > > > > > > > How about this? > > > > > > > > ``` > > > > retry: val = atomic64_read(&root->free_objectid); > > > > ....; > > > > if (atomic64_cmpxchg(&root->free_objectid, val, val+1) != val) > > > > goto retry; > > > > *objectid = val; > > > > return 0; > > > > ``` > > > > > > Doesn't this waste some ids when there are many concurrent requests? > > > > Quite the opposite, it's meant to prevent that. That's why I suggested > > it as the original patch was wasting them and that's what Filipe > > pointed out. > > Not wasting, but allowing the counter to wrap around and return > already in use object IDs, far more serious than wasting. > And besides that, the counter wrap-around would allow the return of > invalid object IDs, in the range from 0 to BTRFS_FIRST_FREE_OBJECTID > (256). Oh, sorry about the confusion. Those three dots ... were meant as a placeholder for the original -ENOSPC condition. Again, keeping the original logic without any changes other than getting rid of the lock. > > > > It will only retry precisely when more concurrent requests race here. > > And thanks to cmpxchg only one of them wins and increments. The others > > retry in another iteration of the loop. > > > > I think this way no lock is needed and the previous behavior is preserved. > > That looks fine to me. But under heavy concurrency, there's the > potential to loop too much, so I would at least add a cond_resched() > call before doing the goto. Possibly. > With the mutex there's the advantage of not looping and wasting CPU if > such high concurrency happens, tasks will be blocked and yield the cpu > for other tasks. Right. My understanding was that if this function is heavily contended, the mutex would be a major bottleneck. Which you would be likely aware of. Otherwise this is just a protection against rare races. Anyways, `cond_resched()` should not hurt even if not strictly needed. Better safe than sorry. > Any improvements in performance could also be measured easily with > fs_mark, which will heavily hit this code path. 
> I would prefer if the patch adds fs_mark numbers (or from any other > test) in the changelogs. > > Thanks.
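Not part of any patch in this thread — a minimal sketch of what the cmpxchg loop discussed above could look like, with the "..." placeholder filled in by the original BTRFS_LAST_FREE_OBJECTID check and a cond_resched() before retrying, as suggested:

```
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
{
	u64 val;

retry:
	val = (u64)atomic64_read(&root->free_objectid);

	/* Keep the original limit check so the counter can never overflow. */
	if (unlikely(val >= BTRFS_LAST_FREE_OBJECTID)) {
		btrfs_warn(root->fs_info,
			   "the objectid of root %llu reaches its highest value",
			   btrfs_root_id(root));
		return -ENOSPC;
	}

	/* Only one of the racing tasks wins the increment; the losers retry. */
	if ((u64)atomic64_cmpxchg(&root->free_objectid, val, val + 1) != val) {
		cond_resched();
		goto retry;
	}

	*objectid = val;
	return 0;
}
```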
On 24/1/25 12:25, Filipe Manana wrote: >> >> It will only retry precisely when more concurrent requests race here. >> And thanks to cmpxchg only one of them wins and increments. The others >> retry in another iteration of the loop. >> >> I think this way no lock is needed and the previous behavior is preserved. > > That looks fine to me. But under heavy concurrency, there's the > potential to loop too much, so I would at least add a cond_resched() > call before doing the goto. > With the mutex there's the advantage of not looping and wasting CPU if > such high concurrency happens, tasks will be blocked and yield the cpu > for other tasks. And then we have the problem of the function potentially sleeping, which was one of the things I'm trying to avoid. I still think an atomic is the correct choice here: * Skipped integers aren't a problem, as there's no reliance on the numbers being contiguous. * The overflow check is wasted cycles, and should never have been there in the first place. Even if we were to grab a new integer a billion times a second, it would take 584 years to wrap around. Mark
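For reference, the arithmetic behind the 584-year figure above, assuming one allocation per nanosecond:

    2^64 / 10^9 per second ≈ 1.84 × 10^10 seconds ≈ 584 years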
On Mon, Jan 27, 2025 at 5:42 PM Mark Harmstone <maharmstone@meta.com> wrote: > > On 24/1/25 12:25, Filipe Manana wrote: > >> > >> It will only retry precisely when more concurrent requests race here. > >> And thanks to cmpxchg only one of them wins and increments. The others > >> retry in another iteration of the loop. > >> > >> I think this way no lock is needed and the previous behavior is preserved. > > > > That looks fine to me. But under heavy concurrency, there's the > > potential to loop too much, so I would at least add a cond_resched() > > call before doing the goto. > > With the mutex there's the advantage of not looping and wasting CPU if > > such high concurrency happens, tasks will be blocked and yield the cpu > > for other tasks. > > And then we have the problem of the function potentially sleeping, which > was one of the things I'm trying to avoid. The sleep should be there to avoid looping too much and starving other tasks in case of too heavy concurrency. > > I still think an atomic is the correct choice here: I'm not saying it isn't. > > * Skipped integers aren't a problem, as there's no reliance on the > numbers being contiguous. That wasn't the problem with the patch. > * The overflow check is wasted cycles, and should never have been there > in the first place. Even if we were to grab a new integer a billion > times a second, it would take 584 years to wrap around. That still feels wrong to me. We do limit checks everywhere, even because corruptions and bugs can happen. Removing the limit checks, could also result in using the invalid range from 0 to BTRFS_FIRST_FREE_OBJECTID (256), besides reusing numbers after that range. Do you actually have numbers to compare the atomic vs mutex? > > Mark
On Mon, Jan 27, 2025 at 05:42:28PM +0000, Mark Harmstone wrote: > On 24/1/25 12:25, Filipe Manana wrote: > >> > >> It will only retry precisely when more concurrent requests race here. > >> And thanks to cmpxchg only one of them wins and increments. The others > >> retry in another iteration of the loop. > >> > >> I think this way no lock is needed and the previous behavior is preserved. > > > > That looks fine to me. But under heavy concurrency, there's the > > potential to loop too much, so I would at least add a cond_resched() > > call before doing the goto. > > With the mutex there's the advantage of not looping and wasting CPU if > > such high concurrency happens, tasks will be blocked and yield the cpu > > for other tasks. > > And then we have the problem of the function potentially sleeping, which > was one of the things I'm trying to avoid. > > I still think an atomic is the correct choice here: > > * Skipped integers aren't a problem, as there's no reliance on the > numbers being contiguous. > * The overflow check is wasted cycles, and should never have been there > in the first place. We do many checks that almost never happen but declare the range of the expected values and can catch eventual bugs caused by the "impossible" reasons like memory bitflips or consequences of other errors that only show up due to such checks. I would not cosider one condition wasted cycles in this case, unless we really are optimizing a hot path and counting the cycles. > Even if we were to grab a new integer a billion > times a second, it would take 584 years to wrap around. Good to know, but I would not use that as an argument. This may hold if you predict based on contemporary hardware. New technologies can bring an order of magnitude improvement, eg. like NVMe brought compared to SSD technology.
On Mon, Jan 27, 2025 at 09:17:17PM +0100, David Sterba wrote: > On Mon, Jan 27, 2025 at 05:42:28PM +0000, Mark Harmstone wrote: > > On 24/1/25 12:25, Filipe Manana wrote: > > >> > > >> It will only retry precisely when more concurrent requests race here. > > >> And thanks to cmpxchg only one of them wins and increments. The others > > >> retry in another iteration of the loop. > > >> > > >> I think this way no lock is needed and the previous behavior is preserved. > > > > > > That looks fine to me. But under heavy concurrency, there's the > > > potential to loop too much, so I would at least add a cond_resched() > > > call before doing the goto. > > > With the mutex there's the advantage of not looping and wasting CPU if > > > such high concurrency happens, tasks will be blocked and yield the cpu > > > for other tasks. > > > > And then we have the problem of the function potentially sleeping, which > > was one of the things I'm trying to avoid. > > > > I still think an atomic is the correct choice here: > > > > * Skipped integers aren't a problem, as there's no reliance on the > > numbers being contiguous. > > * The overflow check is wasted cycles, and should never have been there > > in the first place. > > We do many checks that almost never happen but declare the range of the > expected values and can catch eventual bugs caused by the "impossible" > reasons like memory bitflips or consequences of other errors that only > show up due to such checks. I would not cosider one condition wasted > cycles in this case, unless we really are optimizing a hot path and > counting the cycles. > Could you explain a bit more what you view the philosophy on "impossible" inputs to be? In this case, we are *not* safe from full on memory corruption, as some other thread might doodle on our free_objectid after we do the check. It helps with a bad write for inode metadata, but in that case, we still have the following on our side: 1. we have multiple redundant inode items, so fsck/tree checker can notice an inconsistency when we see an item with a random massive objectid out of order. I haven't re-read it to see if it does, but it seems easy enough to implement if not. 2. a single bit flip, even of the MSB, still doesn't put us THAT close to the end of the range and Mark's argument about 2^64 being a big number still presents a reasonable, if not bulletproof, defense. I generally support correctness trumping performance, but suppose the existing code was the atomic and we got a patch to do the inc under a mutex, justified by the possible bug. Would we be excited by that patch, or would we say it's not a real bug? I was thinking some more about it, and was wondering if we could get away with setting the max objectid to something smaller than -256 s.t. we felt safe with some trivial algorithm like "atomic_inc with dec on failure", which would obviously not fly with a buffer of only 256. Another option is aborting the fs when we get an obviously impossibly large inode. In that case, the disaster scenario of overflowing into a dupe is averted, and it will never happen except in the case of corruption, so it's not a worse UX than ENOSPC. Presumably, we can ensure that we can't commit the transaction once a thread hits this error, so no one should be able to write an item with an overflowed inode number. Those slightly rambly ideas aside, I also support Daniel's solution. I would worry about doing it without cond_resched as we don't really know how much we currently rely on the queueing behavior of mutex. 
> > Even if we were to grab a new integer a billion > > times a second, it would take 584 years to wrap around. > > Good to know, but I would not use that as an argument. This may hold if > you predict based on contemporary hardware. New technologies can bring > an order of magnitude improvement, eg. like NVMe brought compared to SSD > technology. I eagerly look forward to my 40GHz processor :)
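Not from this thread's patches — a rough sketch of the "atomic_inc with dec on failure" idea floated above, assuming a hypothetical BTRFS_OBJECTID_SOFT_LIMIT whose slack below BTRFS_LAST_FREE_OBJECTID would have to exceed the maximum number of tasks that can race here (otherwise the undo comes too late):

```
/* Hypothetical cap; 4096 is an arbitrary illustrative slack. */
#define BTRFS_OBJECTID_SOFT_LIMIT	(BTRFS_LAST_FREE_OBJECTID - 4096)

int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
{
	u64 val = (u64)atomic64_inc_return(&root->free_objectid) - 1;

	if (unlikely(val >= BTRFS_OBJECTID_SOFT_LIMIT)) {
		/* Undo the speculative increment so the counter settles at the cap. */
		atomic64_dec(&root->free_objectid);
		btrfs_warn(root->fs_info,
			   "the objectid of root %llu reaches its highest value",
			   btrfs_root_id(root));
		return -ENOSPC;
	}

	*objectid = val;
	return 0;
}
```

Successful callers only ever see values below the cap, so the decrements done by failing callers cannot hand out an already used objectid; the open question, raised below, is how large the slack would have to be without preempt_disable().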
On Wed, 29 Jan 2025 at 23:57, Boris Burkov <boris@bur.io> wrote: > > On Mon, Jan 27, 2025 at 09:17:17PM +0100, David Sterba wrote: > > On Mon, Jan 27, 2025 at 05:42:28PM +0000, Mark Harmstone wrote: > > > On 24/1/25 12:25, Filipe Manana wrote: > > > >> > > > >> It will only retry precisely when more concurrent requests race here. > > > >> And thanks to cmpxchg only one of them wins and increments. The others > > > >> retry in another iteration of the loop. > > > >> > > > >> I think this way no lock is needed and the previous behavior is preserved. > > > > > > > > That looks fine to me. But under heavy concurrency, there's the > > > > potential to loop too much, so I would at least add a cond_resched() > > > > call before doing the goto. > > > > With the mutex there's the advantage of not looping and wasting CPU if > > > > such high concurrency happens, tasks will be blocked and yield the cpu > > > > for other tasks. > > > > > > And then we have the problem of the function potentially sleeping, which > > > was one of the things I'm trying to avoid. > > > > > > I still think an atomic is the correct choice here: > > > > > > * Skipped integers aren't a problem, as there's no reliance on the > > > numbers being contiguous. > > > * The overflow check is wasted cycles, and should never have been there > > > in the first place. > > > > We do many checks that almost never happen but declare the range of the > > expected values and can catch eventual bugs caused by the "impossible" > > reasons like memory bitflips or consequences of other errors that only > > show up due to such checks. I would not cosider one condition wasted > > cycles in this case, unless we really are optimizing a hot path and > > counting the cycles. > > > > Could you explain a bit more what you view the philosophy on "impossible" > inputs to be? In this case, we are *not* safe from full on memory > corruption, as some other thread might doodle on our free_objectid after > we do the check. It helps with a bad write for inode metadata, but in > that case, we still have the following on our side: > > 1. we have multiple redundant inode items, so fsck/tree checker can > notice an inconsistency when we see an item with a random massive > objectid out of order. I haven't re-read it to see if it does, but it > seems easy enough to implement if not. > > 2. a single bit flip, even of the MSB, still doesn't put us THAT close > to the end of the range and Mark's argument about 2^64 being a big > number still presents a reasonable, if not bulletproof, defense. > > I generally support correctness trumping performance, but suppose the > existing code was the atomic and we got a patch to do the inc under a > mutex, justified by the possible bug. Would we be excited by that patch, > or would we say it's not a real bug? > > I was thinking some more about it, and was wondering if we could get > away with setting the max objectid to something smaller than -256 s.t. > we felt safe with some trivial algorithm like "atomic_inc with dec on > failure", which would obviously not fly with a buffer of only 256. You mean at least NR_CPUS, right? That would imply wrapping into `preempt_disable()` section. But yeah, that could be another possible solution here. The pros would be a single pass (no loop) and hence no `cond_resched()` needed even. For the cons, there are multiple `root->free_objectid <= BTRFS_LAST_FREE_OBJECTID` asserts and other uses of the const macro which would need to be reviewed and considered. 
> Another option is aborting the fs when we get an obviously impossibly > large inode. In that case, the disaster scenario of overflowing into a > dupe is averted, and it will never happen except in the case of > corruption, so it's not a worse UX than ENOSPC. Presumably, we can ensure > that we can't commit the transaction once a thread hits this error, so > no one should be able to write an item with an overflowed inode number. > > Those slightly rambly ideas aside, I also support Daniel's solution. I > would worry about doing it without cond_resched as we don't really know > how much we currently rely on the queueing behavior of mutex. Let me emphasize once again, I still have this feeling that if this mutex was contended, we would notoriously know about it as being a major throughput bottleneck. Whereby 'we' I mean you guys as I don't have much experience with regards to this one yet. But then again, if this mutex is rather a protection against unlikely races than against likely expected contention, then the overhead should already be minimal anyways in most cases and Mark's patch would make little to no difference really. More like just covering the unlikely edge cases (which could still be a good thing). So I'm wondering, is there actually any performance gain with Mark's patch to begin with? Or is the aim to cover and address the edge cases where CPUs (or rather tasks) may be racing and one/some are forced to sleep? The commit message tries to sell the non-blocking mode for new inodes/subvol's. That sounds fair as it is, but again, little experience from my side here. > > > Even if we were to grab a new integer a billion > > > times a second, it would take 584 years to wrap around. > > > > Good to know, but I would not use that as an argument. This may hold if > > you predict based on contemporary hardware. New technologies can bring > > an order of magnitude improvement, eg. like NVMe brought compared to SSD > > technology. > > I eagerly look forward to my 40GHz processor :) You never know about unexpected break-throughs. So fingers crossed. Though I'd be surprised. But maybe a question for (not just) Dave: Can you imagine (or do you know already) any workload which would rely on creating FS objects as lightning-fast as possible? The size of the storage would have to be enormous to hold that many files so that the BTRFS_LAST_FREE_OBJECTID limit could be reached or the files would have to be ephemeral (but in that case tmpfs sounds like a better fit anyways).
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1096a80a64e7..04175698525b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -179,8 +179,6 @@ struct btrfs_root { struct btrfs_fs_info *fs_info; struct extent_io_tree dirty_log_pages; - struct mutex objectid_mutex; - spinlock_t accounting_lock; struct btrfs_block_rsv *block_rsv; @@ -214,7 +212,7 @@ struct btrfs_root { u64 last_trans; - u64 free_objectid; + atomic64_t free_objectid; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f09db62e61a1..0543d9c3f8c0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -659,7 +659,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, RB_CLEAR_NODE(&root->rb_node); btrfs_set_root_last_trans(root, 0); - root->free_objectid = 0; + atomic64_set(&root->free_objectid, 0); root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; xa_init(&root->inodes); @@ -678,7 +678,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, spin_lock_init(&root->ordered_extent_lock); spin_lock_init(&root->accounting_lock); spin_lock_init(&root->qgroup_meta_rsv_lock); - mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); mutex_init(&root->delalloc_mutex); @@ -1133,16 +1132,12 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) } } - mutex_lock(&root->objectid_mutex); ret = btrfs_init_root_free_objectid(root); - if (ret) { - mutex_unlock(&root->objectid_mutex); + if (ret) goto fail; - } - ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); - - mutex_unlock(&root->objectid_mutex); + ASSERT((u64)atomic64_read(&root->free_objectid) <= + BTRFS_LAST_FREE_OBJECTID); return 0; fail: @@ -2730,8 +2725,9 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) } /* - * No need to hold btrfs_root::objectid_mutex since the fs - * hasn't been fully initialised and we are the only user + * No need to worry about atomic ordering of btrfs_root::free_objectid + * since the fs hasn't been fully initialised and we are the + * only user */ ret = btrfs_init_root_free_objectid(tree_root); if (ret < 0) { @@ -2739,7 +2735,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) continue; } - ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); + ASSERT((u64)atomic64_read(&tree_root->free_objectid) <= + BTRFS_LAST_FREE_OBJECTID); ret = btrfs_read_roots(fs_info); if (ret < 0) { @@ -4931,10 +4928,11 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) slot = path->slots[0] - 1; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); - root->free_objectid = max_t(u64, found_key.objectid + 1, - BTRFS_FIRST_FREE_OBJECTID); + atomic64_set(&root->free_objectid, + max_t(u64, found_key.objectid + 1, + BTRFS_FIRST_FREE_OBJECTID)); } else { - root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; + atomic64_set(&root->free_objectid, BTRFS_FIRST_FREE_OBJECTID); } ret = 0; error: @@ -4944,20 +4942,15 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) { - int ret; - mutex_lock(&root->objectid_mutex); + u64 val = atomic64_inc_return(&root->free_objectid) - 1; - if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { + if (unlikely(val >= BTRFS_LAST_FREE_OBJECTID)) { btrfs_warn(root->fs_info, "the objectid of root %llu reaches its highest value", btrfs_root_id(root)); - ret = -ENOSPC; - goto out; + return 
-ENOSPC; } - *objectid = root->free_objectid++; - ret = 0; -out: - mutex_unlock(&root->objectid_mutex); - return ret; + *objectid = val; + return 0; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index aaf16019d829..b41e06d5d2fb 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -472,18 +472,19 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) * * Ensure that we skip any such subvol ids. * - * We don't need to lock because this is only called - * during mount before we start doing things like creating - * subvolumes. + * We don't need to worry about atomic ordering because + * this is only called during mount before we start + * doing things like creating subvolumes. */ if (is_fstree(qgroup->qgroupid) && - qgroup->qgroupid > tree_root->free_objectid) + qgroup->qgroupid > (u64)atomic64_read(&tree_root->free_objectid)) /* * Don't need to check against BTRFS_LAST_FREE_OBJECTID, * as it will get checked on the next call to * btrfs_get_free_objectid. */ - tree_root->free_objectid = qgroup->qgroupid + 1; + atomic64_set(&tree_root->free_objectid, + qgroup->qgroupid + 1); } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 955d1677e865..9d19528fee17 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7325,9 +7325,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) * We have just replayed everything, and the highest * objectid of fs roots probably has changed in case * some inode_item's got replayed. - * - * root->objectid_mutex is not acquired as log replay - * could only happen during mount. */ ret = btrfs_init_root_free_objectid(root); if (ret)
Currently btrfs_get_free_objectid() uses a mutex to protect free_objectid; I'm guessing this was because of the inode cache that we used to have. The inode cache is no more, so simplify things by replacing it with an atomic. There's no issues with ordering: free_objectid gets set to an initial value, then calls to btrfs_get_free_objectid() return a monotonically increasing value. This change means that btrfs_get_free_objectid() will no longer potentially sleep, which was a blocker for adding a non-blocking mode for inode and subvol creation. Signed-off-by: Mark Harmstone <maharmstone@fb.com> --- fs/btrfs/ctree.h | 4 +--- fs/btrfs/disk-io.c | 43 ++++++++++++++++++------------------------- fs/btrfs/qgroup.c | 11 ++++++----- fs/btrfs/tree-log.c | 3 --- 4 files changed, 25 insertions(+), 36 deletions(-)