diff mbox series

btrfs: reduce lock contention when eb cache miss for btree search

Message ID 20241009020149.23467-1-robbieko@synology.com (mailing list archive)
State New, archived
Headers show
Series btrfs: reduce lock contention when eb cache miss for btree search | expand

Commit Message

robbieko Oct. 9, 2024, 2:01 a.m. UTC
From: Robbie Ko <robbieko@synology.com>

When crawling btree, if an eb cache miss occurs, we change to use
the eb read lock and release all previous locks to reduce lock contention.

Because we have prepared the check parameters and the read lock
of eb we hold, we can ensure that no race will occur during the check
and cause unexpected errors.

Signed-off-by: Robbie Ko <robbieko@synology.com>
---
 fs/btrfs/ctree.c | 88 ++++++++++++++++++++++++++++--------------------
 1 file changed, 52 insertions(+), 36 deletions(-)

Comments

Filipe Manana Oct. 9, 2024, 2:18 p.m. UTC | #1
On Wed, Oct 9, 2024 at 3:09 AM robbieko <robbieko@synology.com> wrote:
>
> From: Robbie Ko <robbieko@synology.com>
>
> When crawling btree, if an eb cache miss occurs, we change to use
> the eb read lock and release all previous locks to reduce lock contention.
>
> Because we have prepared the check parameters and the read lock
> of eb we hold, we can ensure that no race will occur during the check
> and cause unexpected errors.
>
> Signed-off-by: Robbie Ko <robbieko@synology.com>
> ---
>  fs/btrfs/ctree.c | 88 ++++++++++++++++++++++++++++--------------------
>  1 file changed, 52 insertions(+), 36 deletions(-)
>
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 0cc919d15b14..0efbe61497f3 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -1515,12 +1515,12 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
>         struct btrfs_tree_parent_check check = { 0 };
>         u64 blocknr;
>         u64 gen;
> -       struct extent_buffer *tmp;
> -       int ret;
> +       struct extent_buffer *tmp = NULL;
> +       int ret, err;

Please avoid declaring two (or more) variables in the same line. 1 per
line is prefered for readability and be consistent with this
function's code.

>         int parent_level;
> -       bool unlock_up;
> +       bool create = false;
> +       bool lock = false;
>
> -       unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
>         blocknr = btrfs_node_blockptr(*eb_ret, slot);
>         gen = btrfs_node_ptr_generation(*eb_ret, slot);
>         parent_level = btrfs_header_level(*eb_ret);
> @@ -1551,52 +1551,66 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
>                          */
>                         if (btrfs_verify_level_key(tmp,
>                                         parent_level - 1, &check.first_key, gen)) {
> -                               free_extent_buffer(tmp);
> -                               return -EUCLEAN;
> +                               ret = -EUCLEAN;
> +                               goto out;
>                         }
>                         *eb_ret = tmp;
> -                       return 0;
> +                       tmp = NULL;
> +                       ret = 0;
> +                       goto out;

By setting tmp to NULL and jumping to the out label, we leak a
reference on the tmp extent buffer.

>                 }
>
>                 if (p->nowait) {
> -                       free_extent_buffer(tmp);
> -                       return -EAGAIN;
> +                       ret = -EAGAIN;
> +                       btrfs_release_path(p);

To reduce the critical section sizes, please set ret to -EAGAIN after
releasing the path.

> +                       goto out;
>                 }
>
> -               if (unlock_up)
> -                       btrfs_unlock_up_safe(p, level + 1);
> +               ret = -EAGAIN;
> +               btrfs_unlock_up_safe(p, level + 1);

Same here, set ret after the unlocking.

But why set ret to -EAGAIN? Here we know p->nowait is false.

> +               btrfs_tree_read_lock(tmp);
> +               lock = true;

And here, with the same reasoning, set 'lock' to true before calling
btrfs_tree_read_lock().

> +               btrfs_release_path(p);
>
>                 /* now we're allowed to do a blocking uptodate check */
> -               ret = btrfs_read_extent_buffer(tmp, &check);
> -               if (ret) {
> -                       free_extent_buffer(tmp);
> -                       btrfs_release_path(p);
> -                       return ret;
> +               err = btrfs_read_extent_buffer(tmp, &check);
> +               if (err) {
> +                       ret = err;
> +                       goto out;

Why do we need to have this 'err' variable?
Why not use 'ret' and simplify?

>                 }
> -
> -               if (unlock_up)
> -                       ret = -EAGAIN;
> -
>                 goto out;
>         } else if (p->nowait) {
> -               return -EAGAIN;
> -       }
> -
> -       if (unlock_up) {
> -               btrfs_unlock_up_safe(p, level + 1);
>                 ret = -EAGAIN;
> -       } else {
> -               ret = 0;
> +               btrfs_release_path(p);

Same here, set 'ret' to -EAGAIN after releasing the path.

> +               goto out;
>         }
>
> +       ret = -EAGAIN;
> +       btrfs_unlock_up_safe(p, level + 1);

Same here.

But why set ret to -EAGAIN? Here we know p->nowait is false.

> +
>         if (p->reada != READA_NONE)
>                 reada_for_search(fs_info, p, level, slot, key->objectid);
>
> -       tmp = read_tree_block(fs_info, blocknr, &check);
> +       tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
>         if (IS_ERR(tmp)) {
> +               ret = PTR_ERR(tmp);
> +               tmp = NULL;
>                 btrfs_release_path(p);
> -               return PTR_ERR(tmp);
> +               goto out;
>         }
> +       create = true;
> +
> +       btrfs_tree_read_lock(tmp);
> +       lock = true;

Same here, set 'lock' to true before the call to btrfs_tree_read_lock();

> +       btrfs_release_path(p);
> +
> +       /* now we're allowed to do a blocking uptodate check */

Please capitalize the first word of a sentence and end the sentence
with punctuation.
This is the preferred style and we strive to do that for new comments
or code that moves comments around.

> +       err = btrfs_read_extent_buffer(tmp, &check);

This can block, so if p->nowait at this point we should instead exit
with -EAGAIN and not call this function.

> +       if (err) {
> +               ret = err;
> +               goto out;
> +       }

Same here, no need to use extra variable 'err', can just use 'ret'.

> +
>         /*
>          * If the read above didn't mark this buffer up to date,
>          * it will never end up being up to date.  Set ret to EIO now
> @@ -1607,11 +1621,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
>                 ret = -EIO;
>
>  out:
> -       if (ret == 0) {
> -               *eb_ret = tmp;
> -       } else {
> -               free_extent_buffer(tmp);
> -               btrfs_release_path(p);
> +       if (tmp) {
> +               if (lock)
> +                       btrfs_tree_read_unlock(tmp);
> +               if (create && ret && ret != -EAGAIN)
> +                       free_extent_buffer_stale(tmp);
> +               else
> +                       free_extent_buffer(tmp);

So in the success case we no longer set *eb_ret to tmp. Why? The
callers expect that.

Thanks.

>         }
>
>         return ret;
> @@ -2198,7 +2214,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>                 }
>
>                 err = read_block_for_search(root, p, &b, level, slot, key);
> -               if (err == -EAGAIN)
> +               if (err == -EAGAIN && !p->nowait)
>                         goto again;
>                 if (err) {
>                         ret = err;
> @@ -2325,7 +2341,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
>                 }
>
>                 err = read_block_for_search(root, p, &b, level, slot, key);
> -               if (err == -EAGAIN)
> +               if (err == -EAGAIN && !p->nowait)
>                         goto again;
>                 if (err) {
>                         ret = err;
> --
> 2.17.1
>
>
Filipe Manana Oct. 9, 2024, 2:20 p.m. UTC | #2
On Wed, Oct 9, 2024 at 3:18 PM Filipe Manana <fdmanana@kernel.org> wrote:
>
> On Wed, Oct 9, 2024 at 3:09 AM robbieko <robbieko@synology.com> wrote:
> >
> > From: Robbie Ko <robbieko@synology.com>
> >
> > When crawling btree, if an eb cache miss occurs, we change to use
> > the eb read lock and release all previous locks to reduce lock contention.
> >
> > Because we have prepared the check parameters and the read lock
> > of eb we hold, we can ensure that no race will occur during the check
> > and cause unexpected errors.
> >
> > Signed-off-by: Robbie Ko <robbieko@synology.com>
> > ---
> >  fs/btrfs/ctree.c | 88 ++++++++++++++++++++++++++++--------------------
> >  1 file changed, 52 insertions(+), 36 deletions(-)
> >
> > diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> > index 0cc919d15b14..0efbe61497f3 100644
> > --- a/fs/btrfs/ctree.c
> > +++ b/fs/btrfs/ctree.c
> > @@ -1515,12 +1515,12 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
> >         struct btrfs_tree_parent_check check = { 0 };
> >         u64 blocknr;
> >         u64 gen;
> > -       struct extent_buffer *tmp;
> > -       int ret;
> > +       struct extent_buffer *tmp = NULL;
> > +       int ret, err;
>
> Please avoid declaring two (or more) variables in the same line. 1 per
> line is prefered for readability and be consistent with this
> function's code.
>
> >         int parent_level;
> > -       bool unlock_up;
> > +       bool create = false;
> > +       bool lock = false;
> >
> > -       unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
> >         blocknr = btrfs_node_blockptr(*eb_ret, slot);
> >         gen = btrfs_node_ptr_generation(*eb_ret, slot);
> >         parent_level = btrfs_header_level(*eb_ret);
> > @@ -1551,52 +1551,66 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
> >                          */
> >                         if (btrfs_verify_level_key(tmp,
> >                                         parent_level - 1, &check.first_key, gen)) {
> > -                               free_extent_buffer(tmp);
> > -                               return -EUCLEAN;
> > +                               ret = -EUCLEAN;
> > +                               goto out;
> >                         }
> >                         *eb_ret = tmp;
> > -                       return 0;
> > +                       tmp = NULL;
> > +                       ret = 0;
> > +                       goto out;
>
> By setting tmp to NULL and jumping to the out label, we leak a
> reference on the tmp extent buffer.

Never mind about this one, it's the reference for the caller as we've
set *eb_ret to tmp.

>
> >                 }
> >
> >                 if (p->nowait) {
> > -                       free_extent_buffer(tmp);
> > -                       return -EAGAIN;
> > +                       ret = -EAGAIN;
> > +                       btrfs_release_path(p);
>
> To reduce the critical section sizes, please set ret to -EAGAIN after
> releasing the path.
>
> > +                       goto out;
> >                 }
> >
> > -               if (unlock_up)
> > -                       btrfs_unlock_up_safe(p, level + 1);
> > +               ret = -EAGAIN;
> > +               btrfs_unlock_up_safe(p, level + 1);
>
> Same here, set ret after the unlocking.
>
> But why set ret to -EAGAIN? Here we know p->nowait is false.
>
> > +               btrfs_tree_read_lock(tmp);
> > +               lock = true;
>
> And here, with the same reasoning, set 'lock' to true before calling
> btrfs_tree_read_lock().
>
> > +               btrfs_release_path(p);
> >
> >                 /* now we're allowed to do a blocking uptodate check */
> > -               ret = btrfs_read_extent_buffer(tmp, &check);
> > -               if (ret) {
> > -                       free_extent_buffer(tmp);
> > -                       btrfs_release_path(p);
> > -                       return ret;
> > +               err = btrfs_read_extent_buffer(tmp, &check);
> > +               if (err) {
> > +                       ret = err;
> > +                       goto out;
>
> Why do we need to have this 'err' variable?
> Why not use 'ret' and simplify?
>
> >                 }
> > -
> > -               if (unlock_up)
> > -                       ret = -EAGAIN;
> > -
> >                 goto out;
> >         } else if (p->nowait) {
> > -               return -EAGAIN;
> > -       }
> > -
> > -       if (unlock_up) {
> > -               btrfs_unlock_up_safe(p, level + 1);
> >                 ret = -EAGAIN;
> > -       } else {
> > -               ret = 0;
> > +               btrfs_release_path(p);
>
> Same here, set 'ret' to -EAGAIN after releasing the path.
>
> > +               goto out;
> >         }
> >
> > +       ret = -EAGAIN;
> > +       btrfs_unlock_up_safe(p, level + 1);
>
> Same here.
>
> But why set ret to -EAGAIN? Here we know p->nowait is false.
>
> > +
> >         if (p->reada != READA_NONE)
> >                 reada_for_search(fs_info, p, level, slot, key->objectid);
> >
> > -       tmp = read_tree_block(fs_info, blocknr, &check);
> > +       tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
> >         if (IS_ERR(tmp)) {
> > +               ret = PTR_ERR(tmp);
> > +               tmp = NULL;
> >                 btrfs_release_path(p);
> > -               return PTR_ERR(tmp);
> > +               goto out;
> >         }
> > +       create = true;
> > +
> > +       btrfs_tree_read_lock(tmp);
> > +       lock = true;
>
> Same here, set 'lock' to true before the call to btrfs_tree_read_lock();
>
> > +       btrfs_release_path(p);
> > +
> > +       /* now we're allowed to do a blocking uptodate check */
>
> Please capitalize the first word of a sentence and end the sentence
> with punctuation.
> This is the preferred style and we strive to do that for new comments
> or code that moves comments around.
>
> > +       err = btrfs_read_extent_buffer(tmp, &check);
>
> This can block, so if p->nowait at this point we should instead exit
> with -EAGAIN and not call this function.
>
> > +       if (err) {
> > +               ret = err;
> > +               goto out;
> > +       }
>
> Same here, no need to use extra variable 'err', can just use 'ret'.
>
> > +
> >         /*
> >          * If the read above didn't mark this buffer up to date,
> >          * it will never end up being up to date.  Set ret to EIO now
> > @@ -1607,11 +1621,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
> >                 ret = -EIO;
> >
> >  out:
> > -       if (ret == 0) {
> > -               *eb_ret = tmp;
> > -       } else {
> > -               free_extent_buffer(tmp);
> > -               btrfs_release_path(p);
> > +       if (tmp) {
> > +               if (lock)
> > +                       btrfs_tree_read_unlock(tmp);
> > +               if (create && ret && ret != -EAGAIN)
> > +                       free_extent_buffer_stale(tmp);
> > +               else
> > +                       free_extent_buffer(tmp);
>
> So in the success case we no longer set *eb_ret to tmp. Why? The
> callers expect that.
>
> Thanks.
>
> >         }
> >
> >         return ret;
> > @@ -2198,7 +2214,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
> >                 }
> >
> >                 err = read_block_for_search(root, p, &b, level, slot, key);
> > -               if (err == -EAGAIN)
> > +               if (err == -EAGAIN && !p->nowait)
> >                         goto again;
> >                 if (err) {
> >                         ret = err;
> > @@ -2325,7 +2341,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
> >                 }
> >
> >                 err = read_block_for_search(root, p, &b, level, slot, key);
> > -               if (err == -EAGAIN)
> > +               if (err == -EAGAIN && !p->nowait)
> >                         goto again;
> >                 if (err) {
> >                         ret = err;
> > --
> > 2.17.1
> >
> >
Filipe Manana Oct. 9, 2024, 3:09 p.m. UTC | #3
On Wed, Oct 9, 2024 at 3:18 PM Filipe Manana <fdmanana@kernel.org> wrote:
>
> On Wed, Oct 9, 2024 at 3:09 AM robbieko <robbieko@synology.com> wrote:
> >
> > From: Robbie Ko <robbieko@synology.com>
> >
> > When crawling btree, if an eb cache miss occurs, we change to use
> > the eb read lock and release all previous locks to reduce lock contention.
> >
> > Because we have prepared the check parameters and the read lock
> > of eb we hold, we can ensure that no race will occur during the check
> > and cause unexpected errors.
> >
> > Signed-off-by: Robbie Ko <robbieko@synology.com>
> > ---
> >  fs/btrfs/ctree.c | 88 ++++++++++++++++++++++++++++--------------------
> >  1 file changed, 52 insertions(+), 36 deletions(-)
> >
> > diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> > index 0cc919d15b14..0efbe61497f3 100644
> > --- a/fs/btrfs/ctree.c
> > +++ b/fs/btrfs/ctree.c
> > @@ -1515,12 +1515,12 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
> >         struct btrfs_tree_parent_check check = { 0 };
> >         u64 blocknr;
> >         u64 gen;
> > -       struct extent_buffer *tmp;
> > -       int ret;
> > +       struct extent_buffer *tmp = NULL;
> > +       int ret, err;
>
> Please avoid declaring two (or more) variables in the same line. 1 per
> line is prefered for readability and be consistent with this
> function's code.
>
> >         int parent_level;
> > -       bool unlock_up;
> > +       bool create = false;
> > +       bool lock = false;
> >
> > -       unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
> >         blocknr = btrfs_node_blockptr(*eb_ret, slot);
> >         gen = btrfs_node_ptr_generation(*eb_ret, slot);
> >         parent_level = btrfs_header_level(*eb_ret);
> > @@ -1551,52 +1551,66 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
> >                          */
> >                         if (btrfs_verify_level_key(tmp,
> >                                         parent_level - 1, &check.first_key, gen)) {
> > -                               free_extent_buffer(tmp);
> > -                               return -EUCLEAN;
> > +                               ret = -EUCLEAN;
> > +                               goto out;
> >                         }
> >                         *eb_ret = tmp;
> > -                       return 0;
> > +                       tmp = NULL;
> > +                       ret = 0;
> > +                       goto out;
>
> By setting tmp to NULL and jumping to the out label, we leak a
> reference on the tmp extent buffer.
>
> >                 }
> >
> >                 if (p->nowait) {
> > -                       free_extent_buffer(tmp);
> > -                       return -EAGAIN;
> > +                       ret = -EAGAIN;
> > +                       btrfs_release_path(p);
>
> To reduce the critical section sizes, please set ret to -EAGAIN after
> releasing the path.
>
> > +                       goto out;
> >                 }
> >
> > -               if (unlock_up)
> > -                       btrfs_unlock_up_safe(p, level + 1);
> > +               ret = -EAGAIN;
> > +               btrfs_unlock_up_safe(p, level + 1);
>
> Same here, set ret after the unlocking.
>
> But why set ret to -EAGAIN? Here we know p->nowait is false.

Nevermind this is because of the unconditional unlock now.

>
> > +               btrfs_tree_read_lock(tmp);
> > +               lock = true;
>
> And here, with the same reasoning, set 'lock' to true before calling
> btrfs_tree_read_lock().
>
> > +               btrfs_release_path(p);
> >
> >                 /* now we're allowed to do a blocking uptodate check */
> > -               ret = btrfs_read_extent_buffer(tmp, &check);
> > -               if (ret) {
> > -                       free_extent_buffer(tmp);
> > -                       btrfs_release_path(p);
> > -                       return ret;
> > +               err = btrfs_read_extent_buffer(tmp, &check);
> > +               if (err) {
> > +                       ret = err;
> > +                       goto out;
>
> Why do we need to have this 'err' variable?
> Why not use 'ret' and simplify?
>
> >                 }
> > -
> > -               if (unlock_up)
> > -                       ret = -EAGAIN;
> > -
> >                 goto out;
> >         } else if (p->nowait) {
> > -               return -EAGAIN;
> > -       }
> > -
> > -       if (unlock_up) {
> > -               btrfs_unlock_up_safe(p, level + 1);
> >                 ret = -EAGAIN;
> > -       } else {
> > -               ret = 0;
> > +               btrfs_release_path(p);
>
> Same here, set 'ret' to -EAGAIN after releasing the path.
>
> > +               goto out;
> >         }
> >
> > +       ret = -EAGAIN;
> > +       btrfs_unlock_up_safe(p, level + 1);
>
> Same here.
>
> But why set ret to -EAGAIN? Here we know p->nowait is false.

Nevermind this is because of the unconditional unlock now.

In both cases we still don't need the 'err' variable.
Just set 'ret' to -EAGAIN if btrfs_find_create_tree_block() below
doesn't return an error and btrfs_read_extent_buffer() below and above
don't return an error.

>
> > +
> >         if (p->reada != READA_NONE)
> >                 reada_for_search(fs_info, p, level, slot, key->objectid);
> >
> > -       tmp = read_tree_block(fs_info, blocknr, &check);
> > +       tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
> >         if (IS_ERR(tmp)) {
> > +               ret = PTR_ERR(tmp);
> > +               tmp = NULL;
> >                 btrfs_release_path(p);
> > -               return PTR_ERR(tmp);
> > +               goto out;
> >         }
> > +       create = true;
> > +
> > +       btrfs_tree_read_lock(tmp);
> > +       lock = true;
>
> Same here, set 'lock' to true before the call to btrfs_tree_read_lock();
>
> > +       btrfs_release_path(p);
> > +
> > +       /* now we're allowed to do a blocking uptodate check */
>
> Please capitalize the first word of a sentence and end the sentence
> with punctuation.
> This is the preferred style and we strive to do that for new comments
> or code that moves comments around.
>
> > +       err = btrfs_read_extent_buffer(tmp, &check);
>
> This can block, so if p->nowait at this point we should instead exit
> with -EAGAIN and not call this function.
>
> > +       if (err) {
> > +               ret = err;
> > +               goto out;
> > +       }
>
> Same here, no need to use extra variable 'err', can just use 'ret'.
>
> > +
> >         /*
> >          * If the read above didn't mark this buffer up to date,
> >          * it will never end up being up to date.  Set ret to EIO now
> > @@ -1607,11 +1621,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
> >                 ret = -EIO;
> >
> >  out:
> > -       if (ret == 0) {
> > -               *eb_ret = tmp;
> > -       } else {
> > -               free_extent_buffer(tmp);
> > -               btrfs_release_path(p);
> > +       if (tmp) {
> > +               if (lock)
> > +                       btrfs_tree_read_unlock(tmp);
> > +               if (create && ret && ret != -EAGAIN)
> > +                       free_extent_buffer_stale(tmp);
> > +               else
> > +                       free_extent_buffer(tmp);
>
> So in the success case we no longer set *eb_ret to tmp. Why? The
> callers expect that.

Ok, it's because of the -EAGAIN due to having the unconditional unlock now.

And btw, have you observed any case where this improved anything? Any
benchmarks?

I can't see how this can make anything better because this function is
never called for a root node, and therefore level + 1 <
BTRFS_MAX_LEVEL.

But there is one case where this patch causes unnecessary path
releases and makes the caller retry a search:
when none of the levels above were locked. That's why we had the
following expression before:

unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);

So we wouldn't make the caller retry the search if upper levels aren't
locked or p->skip_locking == 1.
But now after this patch we make the caller retry the search in that case.

Thanks.

>
> Thanks.
>
> >         }
> >
> >         return ret;
> > @@ -2198,7 +2214,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
> >                 }
> >
> >                 err = read_block_for_search(root, p, &b, level, slot, key);
> > -               if (err == -EAGAIN)
> > +               if (err == -EAGAIN && !p->nowait)
> >                         goto again;
> >                 if (err) {
> >                         ret = err;
> > @@ -2325,7 +2341,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
> >                 }
> >
> >                 err = read_block_for_search(root, p, &b, level, slot, key);
> > -               if (err == -EAGAIN)
> > +               if (err == -EAGAIN && !p->nowait)
> >                         goto again;
> >                 if (err) {
> >                         ret = err;
> > --
> > 2.17.1
> >
> >
robbieko Oct. 11, 2024, 2:40 a.m. UTC | #4
Filipe Manana 於 2024/10/9 下午11:09 寫道:
> On Wed, Oct 9, 2024 at 3:18 PM Filipe Manana <fdmanana@kernel.org> wrote:
>> On Wed, Oct 9, 2024 at 3:09 AM robbieko <robbieko@synology.com> wrote:
>>> From: Robbie Ko <robbieko@synology.com>
>>>
>>> When crawling btree, if an eb cache miss occurs, we change to use
>>> the eb read lock and release all previous locks to reduce lock contention.
>>>
>>> Because we have prepared the check parameters and the read lock
>>> of eb we hold, we can ensure that no race will occur during the check
>>> and cause unexpected errors.
>>>
>>> Signed-off-by: Robbie Ko <robbieko@synology.com>
>>> ---
>>>   fs/btrfs/ctree.c | 88 ++++++++++++++++++++++++++++--------------------
>>>   1 file changed, 52 insertions(+), 36 deletions(-)
>>>
>>> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
>>> index 0cc919d15b14..0efbe61497f3 100644
>>> --- a/fs/btrfs/ctree.c
>>> +++ b/fs/btrfs/ctree.c
>>> @@ -1515,12 +1515,12 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
>>>          struct btrfs_tree_parent_check check = { 0 };
>>>          u64 blocknr;
>>>          u64 gen;
>>> -       struct extent_buffer *tmp;
>>> -       int ret;
>>> +       struct extent_buffer *tmp = NULL;
>>> +       int ret, err;
>> Please avoid declaring two (or more) variables in the same line. 1 per
>> line is prefered for readability and be consistent with this
>> function's code.

Okay, I'll modify it.

>>
>>>          int parent_level;
>>> -       bool unlock_up;
>>> +       bool create = false;
>>> +       bool lock = false;
>>>
>>> -       unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
>>>          blocknr = btrfs_node_blockptr(*eb_ret, slot);
>>>          gen = btrfs_node_ptr_generation(*eb_ret, slot);
>>>          parent_level = btrfs_header_level(*eb_ret);
>>> @@ -1551,52 +1551,66 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
>>>                           */
>>>                          if (btrfs_verify_level_key(tmp,
>>>                                          parent_level - 1, &check.first_key, gen)) {
>>> -                               free_extent_buffer(tmp);
>>> -                               return -EUCLEAN;
>>> +                               ret = -EUCLEAN;
>>> +                               goto out;
>>>                          }
>>>                          *eb_ret = tmp;
>>> -                       return 0;
>>> +                       tmp = NULL;
>>> +                       ret = 0;
>>> +                       goto out;
>> By setting tmp to NULL and jumping to the out label, we leak a
>> reference on the tmp extent buffer.
We assign tmp to eb_ret, so we don't leak.
>>
>>>                  }
>>>
>>>                  if (p->nowait) {
>>> -                       free_extent_buffer(tmp);
>>> -                       return -EAGAIN;
>>> +                       ret = -EAGAIN;
>>> +                       btrfs_release_path(p);
>> To reduce the critical section sizes, please set ret to -EAGAIN after
>> releasing the path.
Okay, I'll modify it.
>>
>>> +                       goto out;
>>>                  }
>>>
>>> -               if (unlock_up)
>>> -                       btrfs_unlock_up_safe(p, level + 1);
>>> +               ret = -EAGAIN;
>>> +               btrfs_unlock_up_safe(p, level + 1);
>> Same here, set ret after the unlocking.
>>
>> But why set ret to -EAGAIN? Here we know p->nowait is false.
> Nevermind this is because of the unconditional unlock now.
Yes, because we unlock unconditionally, the ret value of this function 
must be -EAGAIN.
>
>>> +               btrfs_tree_read_lock(tmp);
>>> +               lock = true;
>> And here, with the same reasoning, set 'lock' to true before calling
>> btrfs_tree_read_lock().
>>
>>> +               btrfs_release_path(p);
>>>
>>>                  /* now we're allowed to do a blocking uptodate check */
>>> -               ret = btrfs_read_extent_buffer(tmp, &check);
>>> -               if (ret) {
>>> -                       free_extent_buffer(tmp);
>>> -                       btrfs_release_path(p);
>>> -                       return ret;
>>> +               err = btrfs_read_extent_buffer(tmp, &check);
>>> +               if (err) {
>>> +                       ret = err;
>>> +                       goto out;
>> Why do we need to have this 'err' variable?
>> Why not use 'ret' and simplify?
>>
>>>                  }
>>> -
>>> -               if (unlock_up)
>>> -                       ret = -EAGAIN;
>>> -
>>>                  goto out;
>>>          } else if (p->nowait) {
>>> -               return -EAGAIN;
>>> -       }
>>> -
>>> -       if (unlock_up) {
>>> -               btrfs_unlock_up_safe(p, level + 1);
>>>                  ret = -EAGAIN;
>>> -       } else {
>>> -               ret = 0;
>>> +               btrfs_release_path(p);
>> Same here, set 'ret' to -EAGAIN after releasing the path.
>>
>>> +               goto out;
>>>          }
>>>
>>> +       ret = -EAGAIN;
>>> +       btrfs_unlock_up_safe(p, level + 1);
>> Same here.
>>
>> But why set ret to -EAGAIN? Here we know p->nowait is false.
> Nevermind this is because of the unconditional unlock now.
>
> In both cases we still don't need the 'err' variable.
> Just set 'ret' to -EAGAIN if btrfs_find_create_tree_block() below
> doesn't return an error and btrfs_read_extent_buffer() below and above
> don't return an error.

Indeed, we can only use ret value to receive the result of 
btrfs_find_create_tree_block or btrfs_read_extent_buffer.
If the function returns correctly (return 0), we must change ret to 
-EAGAIN again and again,
which is undoubtedly an increase in the function complexity.
Therefore, I use the err variable. If other err occurs, the err can be 
set to ret,
so that the caller can immediately stop and continue searching.

>
>>> +
>>>          if (p->reada != READA_NONE)
>>>                  reada_for_search(fs_info, p, level, slot, key->objectid);
>>>
>>> -       tmp = read_tree_block(fs_info, blocknr, &check);
>>> +       tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
>>>          if (IS_ERR(tmp)) {
>>> +               ret = PTR_ERR(tmp);
>>> +               tmp = NULL;
>>>                  btrfs_release_path(p);
>>> -               return PTR_ERR(tmp);
>>> +               goto out;
>>>          }
>>> +       create = true;
>>> +
>>> +       btrfs_tree_read_lock(tmp);
>>> +       lock = true;
>> Same here, set 'lock' to true before the call to btrfs_tree_read_lock();
>>
>>> +       btrfs_release_path(p);
>>> +
>>> +       /* now we're allowed to do a blocking uptodate check */
>> Please capitalize the first word of a sentence and end the sentence
>> with punctuation.
>> This is the preferred style and we strive to do that for new comments
>> or code that moves comments around.
Okay, I'll modify it.
>>
>>> +       err = btrfs_read_extent_buffer(tmp, &check);
>> This can block, so if p->nowait at this point we should instead exit
>> with -EAGAIN and not call this function.
>>
>>> +       if (err) {
>>> +               ret = err;
>>> +               goto out;
>>> +       }
>> Same here, no need to use extra variable 'err', can just use 'ret'.
>>
>>> +
>>>          /*
>>>           * If the read above didn't mark this buffer up to date,
>>>           * it will never end up being up to date.  Set ret to EIO now
>>> @@ -1607,11 +1621,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
>>>                  ret = -EIO;
>>>
>>>   out:
>>> -       if (ret == 0) {
>>> -               *eb_ret = tmp;
>>> -       } else {
>>> -               free_extent_buffer(tmp);
>>> -               btrfs_release_path(p);
>>> +       if (tmp) {
>>> +               if (lock)
>>> +                       btrfs_tree_read_unlock(tmp);
>>> +               if (create && ret && ret != -EAGAIN)
>>> +                       free_extent_buffer_stale(tmp);
>>> +               else
>>> +                       free_extent_buffer(tmp);
>> So in the success case we no longer set *eb_ret to tmp. Why? The
>> callers expect that.
> Ok, it's because of the -EAGAIN due to having the unconditional unlock now.
>
> And btw, have you observed any case where this improved anything? Any
> benchmarks?

In the case of very high-speed 4k random write (cow), using nvme will 
improve.

But I can't give it a clear data improvement, because there are still many

different bottlenecks in the context of 4k random write,

and other patches need to be used to achieve significant improvements.

>
> I can't see how this can make anything better because this function is
> never called for a root node, and therefore level + 1 <
> BTRFS_MAX_LEVEL.

The main improvement point of this patch is that if a eb cache miss 
occurs in a leaf and needs to execute IO,
our original approach is to release level 2~MAX, but this patch is 
changed to release level 1~MAX.
We even Level 1 has also been released, so before releasing level 1, we 
changed to take the lock of level 0 to avoid race.
This change reduces the lock contention of level 1.

>
> But there is one case where this patch causes unnecessary path
> releases and makes the caller retry a search:
> when none of the levels above were locked. That's why we had the
> following expression before:
>
> unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
>
> So we wouldn't make the caller retry the search if upper levels aren't
> locked or p->skip_locking == 1.
> But now after this patch we make the caller retry the search in that case.

Indeed, I missed something about skip_locking. In this case, there is no 
need to search again, I will correct it.

Thank you for your review.

>
> Thanks.
>
>> Thanks.
>>
>>>          }
>>>
>>>          return ret;
>>> @@ -2198,7 +2214,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>>>                  }
>>>
>>>                  err = read_block_for_search(root, p, &b, level, slot, key);
>>> -               if (err == -EAGAIN)
>>> +               if (err == -EAGAIN && !p->nowait)
>>>                          goto again;
>>>                  if (err) {
>>>                          ret = err;
>>> @@ -2325,7 +2341,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
>>>                  }
>>>
>>>                  err = read_block_for_search(root, p, &b, level, slot, key);
>>> -               if (err == -EAGAIN)
>>> +               if (err == -EAGAIN && !p->nowait)
>>>                          goto again;
>>>                  if (err) {
>>>                          ret = err;
>>> --
>>> 2.17.1
>>>
>>>
diff mbox series

Patch

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0cc919d15b14..0efbe61497f3 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1515,12 +1515,12 @@  read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 	struct btrfs_tree_parent_check check = { 0 };
 	u64 blocknr;
 	u64 gen;
-	struct extent_buffer *tmp;
-	int ret;
+	struct extent_buffer *tmp = NULL;
+	int ret, err;
 	int parent_level;
-	bool unlock_up;
+	bool create = false;
+	bool lock = false;
 
-	unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
 	blocknr = btrfs_node_blockptr(*eb_ret, slot);
 	gen = btrfs_node_ptr_generation(*eb_ret, slot);
 	parent_level = btrfs_header_level(*eb_ret);
@@ -1551,52 +1551,66 @@  read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 			 */
 			if (btrfs_verify_level_key(tmp,
 					parent_level - 1, &check.first_key, gen)) {
-				free_extent_buffer(tmp);
-				return -EUCLEAN;
+				ret = -EUCLEAN;
+				goto out;
 			}
 			*eb_ret = tmp;
-			return 0;
+			tmp = NULL;
+			ret = 0;
+			goto out;
 		}
 
 		if (p->nowait) {
-			free_extent_buffer(tmp);
-			return -EAGAIN;
+			ret = -EAGAIN;
+			btrfs_release_path(p);
+			goto out;
 		}
 
-		if (unlock_up)
-			btrfs_unlock_up_safe(p, level + 1);
+		ret = -EAGAIN;
+		btrfs_unlock_up_safe(p, level + 1);
+		btrfs_tree_read_lock(tmp);
+		lock = true;
+		btrfs_release_path(p);
 
 		/* now we're allowed to do a blocking uptodate check */
-		ret = btrfs_read_extent_buffer(tmp, &check);
-		if (ret) {
-			free_extent_buffer(tmp);
-			btrfs_release_path(p);
-			return ret;
+		err = btrfs_read_extent_buffer(tmp, &check);
+		if (err) {
+			ret = err;
+			goto out;
 		}
-
-		if (unlock_up)
-			ret = -EAGAIN;
-
 		goto out;
 	} else if (p->nowait) {
-		return -EAGAIN;
-	}
-
-	if (unlock_up) {
-		btrfs_unlock_up_safe(p, level + 1);
 		ret = -EAGAIN;
-	} else {
-		ret = 0;
+		btrfs_release_path(p);
+		goto out;
 	}
 
+	ret = -EAGAIN;
+	btrfs_unlock_up_safe(p, level + 1);
+
 	if (p->reada != READA_NONE)
 		reada_for_search(fs_info, p, level, slot, key->objectid);
 
-	tmp = read_tree_block(fs_info, blocknr, &check);
+	tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
 	if (IS_ERR(tmp)) {
+		ret = PTR_ERR(tmp);
+		tmp = NULL;
 		btrfs_release_path(p);
-		return PTR_ERR(tmp);
+		goto out;
 	}
+	create = true;
+
+	btrfs_tree_read_lock(tmp);
+	lock = true;
+	btrfs_release_path(p);
+
+	/* now we're allowed to do a blocking uptodate check */
+	err = btrfs_read_extent_buffer(tmp, &check);
+	if (err) {
+		ret = err;
+		goto out;
+	}
+
 	/*
 	 * If the read above didn't mark this buffer up to date,
 	 * it will never end up being up to date.  Set ret to EIO now
@@ -1607,11 +1621,13 @@  read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 		ret = -EIO;
 
 out:
-	if (ret == 0) {
-		*eb_ret = tmp;
-	} else {
-		free_extent_buffer(tmp);
-		btrfs_release_path(p);
+	if (tmp) {
+		if (lock)
+			btrfs_tree_read_unlock(tmp);
+		if (create && ret && ret != -EAGAIN)
+			free_extent_buffer_stale(tmp);
+		else
+			free_extent_buffer(tmp);
 	}
 
 	return ret;
@@ -2198,7 +2214,7 @@  int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		}
 
 		err = read_block_for_search(root, p, &b, level, slot, key);
-		if (err == -EAGAIN)
+		if (err == -EAGAIN && !p->nowait)
 			goto again;
 		if (err) {
 			ret = err;
@@ -2325,7 +2341,7 @@  int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
 		}
 
 		err = read_block_for_search(root, p, &b, level, slot, key);
-		if (err == -EAGAIN)
+		if (err == -EAGAIN && !p->nowait)
 			goto again;
 		if (err) {
 			ret = err;