diff mbox series

[10/10] btrfs: move extent_buffer::lock_owner to debug section

Message ID 3f96e9c3d06ed846b19b63cd9001cb0c66bd8a00.1694126893.git.dsterba@suse.com (mailing list archive)
State New, archived
Headers show
Series Cleanups and struct packing | expand

Commit Message

David Sterba Sept. 7, 2023, 11:09 p.m. UTC
The lock_owner is used for a rare corruption case and we haven't seen
any reports in years. Move it to the debugging section of eb.  To close
the holes also move log_index so the final layout looks like:

struct extent_buffer {
        u64                        start;                /*     0     8 */
        long unsigned int          len;                  /*     8     8 */
        long unsigned int          bflags;               /*    16     8 */
        struct btrfs_fs_info *     fs_info;              /*    24     8 */
        spinlock_t                 refs_lock;            /*    32     4 */
        atomic_t                   refs;                 /*    36     4 */
        int                        read_mirror;          /*    40     4 */
        s8                         log_index;            /*    44     1 */

        /* XXX 3 bytes hole, try to pack */

        struct callback_head       callback_head __attribute__((__aligned__(8))); /*    48    16 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        struct rw_semaphore        lock;                 /*    64    40 */
        struct page *              pages[16];            /*   104   128 */

        /* size: 232, cachelines: 4, members: 11 */
        /* sum members: 229, holes: 1, sum holes: 3 */
        /* forced alignments: 1, forced holes: 1, sum forced holes: 3 */
        /* last cacheline: 40 bytes */
} __attribute__((__aligned__(8)));

This saves 8 bytes in total and still keeps the lock on a separate cacheline.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 32 +++++++++++++++++++++++---------
 fs/btrfs/extent_io.h   |  4 ++--
 fs/btrfs/locking.c     | 15 ++++++++++++---
 3 files changed, 37 insertions(+), 14 deletions(-)

Comments

Qu Wenruo Sept. 8, 2023, 12:11 a.m. UTC | #1
On 2023/9/8 07:09, David Sterba wrote:
> The lock_owner is used for a rare corruption case and we haven't seen
> any reports in years. Move it to the debugging section of eb.  To close
> the holes also move log_index so the final layout looks like:

Just found some extra space we can squish out:

> struct extent_buffer {
>          u64                        start;                /*     0     8 */
>          long unsigned int          len;                  /*     8     8 */

u32 is definitely enough.
(u16 is not enough for 64K nodesize, unfortunately)

>          long unsigned int          bflags;               /*    16     8 */

For now we don't need 64bit for bflags, 32bit is enough, but
unfortunately that's what the existing test/set/clear_bit() requires...

>          struct btrfs_fs_info *     fs_info;              /*    24     8 */
>          spinlock_t                 refs_lock;            /*    32     4 */
>          atomic_t                   refs;                 /*    36     4 */
>          int                        read_mirror;          /*    40     4 */

We don't really need int for read_mirror, but that would be another
patch(set) to change them.

>          s8                         log_index;            /*    44     1 */
>
>          /* XXX 3 bytes hole, try to pack */
>
>          struct callback_head       callback_head __attribute__((__aligned__(8))); /*    48    16 */
>          /* --- cacheline 1 boundary (64 bytes) --- */
>          struct rw_semaphore        lock;                 /*    64    40 */
>          struct page *              pages[16];            /*   104   128 */
>
>          /* size: 232, cachelines: 4, members: 11 */
>          /* sum members: 229, holes: 1, sum holes: 3 */
>          /* forced alignments: 1, forced holes: 1, sum forced holes: 3 */
>          /* last cacheline: 40 bytes */
> } __attribute__((__aligned__(8)));
>
> This saves 8 bytes in total and still keeps the lock on a separate cacheline.
>
> Signed-off-by: David Sterba <dsterba@suse.com>

Reviewed-by: Qu Wenruo <wqu@suse.com>

Thanks,
Qu

> ---
>   fs/btrfs/extent-tree.c | 32 +++++++++++++++++++++++---------
>   fs/btrfs/extent_io.h   |  4 ++--
>   fs/btrfs/locking.c     | 15 ++++++++++++---
>   3 files changed, 37 insertions(+), 14 deletions(-)
>
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 3e46bb4cc957..6f6838226fe7 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4801,6 +4801,28 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
>   	return ret;
>   }
>
> +#ifdef CONFIG_BTRFS_DEBUG
> +/*
> + * Extra safety check in case the extent tree is corrupted and extent allocator
> + * chooses to use a tree block which is already used and locked.
> + */
> +static bool check_eb_lock_owner(const struct extent_buffer *eb)
> +{
> +	if (eb->lock_owner == current->pid) {
> +		btrfs_err_rl(eb->fs_info,
> +"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
> +			     eb->start, btrfs_header_owner(eb), current->pid);
> +		return true;
> +	}
> +	return false;
> +}
> +#else
> +static bool check_eb_lock_owner(struct extent_buffer *eb)
> +{
> +	return false;
> +}
> +#endif
> +
>   static struct extent_buffer *
>   btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>   		      u64 bytenr, int level, u64 owner,
> @@ -4814,15 +4836,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>   	if (IS_ERR(buf))
>   		return buf;
>
> -	/*
> -	 * Extra safety check in case the extent tree is corrupted and extent
> -	 * allocator chooses to use a tree block which is already used and
> -	 * locked.
> -	 */
> -	if (buf->lock_owner == current->pid) {
> -		btrfs_err_rl(fs_info,
> -"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
> -			buf->start, btrfs_header_owner(buf), current->pid);
> +	if (check_eb_lock_owner(buf)) {
>   		free_extent_buffer(buf);
>   		return ERR_PTR(-EUCLEAN);
>   	}
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index 68368ba99321..2171057a4477 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -80,16 +80,16 @@ struct extent_buffer {
>   	spinlock_t refs_lock;
>   	atomic_t refs;
>   	int read_mirror;
> -	struct rcu_head rcu_head;
> -	pid_t lock_owner;
>   	/* >= 0 if eb belongs to a log tree, -1 otherwise */
>   	s8 log_index;
> +	struct rcu_head rcu_head;
>
>   	struct rw_semaphore lock;
>
>   	struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
>   #ifdef CONFIG_BTRFS_DEBUG
>   	struct list_head leak_list;
> +	pid_t lock_owner;
>   #endif
>   };
>
> diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
> index c3128cdf1177..6ac4fd8cc8dc 100644
> --- a/fs/btrfs/locking.c
> +++ b/fs/btrfs/locking.c
> @@ -103,6 +103,15 @@ void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buff
>
>   #endif
>
> +#ifdef CONFIG_BTRFS_DEBUG
> +static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner)
> +{
> +	eb->lock_owner = owner;
> +}
> +#else
> +static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { }
> +#endif
> +
>   /*
>    * Extent buffer locking
>    * =====================
> @@ -165,7 +174,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
>   int btrfs_try_tree_write_lock(struct extent_buffer *eb)
>   {
>   	if (down_write_trylock(&eb->lock)) {
> -		eb->lock_owner = current->pid;
> +		btrfs_set_eb_lock_owner(eb, current->pid);
>   		trace_btrfs_try_tree_write_lock(eb);
>   		return 1;
>   	}
> @@ -198,7 +207,7 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
>   		start_ns = ktime_get_ns();
>
>   	down_write_nested(&eb->lock, nest);
> -	eb->lock_owner = current->pid;
> +	btrfs_set_eb_lock_owner(eb, current->pid);
>   	trace_btrfs_tree_lock(eb, start_ns);
>   }
>
> @@ -213,7 +222,7 @@ void btrfs_tree_lock(struct extent_buffer *eb)
>   void btrfs_tree_unlock(struct extent_buffer *eb)
>   {
>   	trace_btrfs_tree_unlock(eb);
> -	eb->lock_owner = 0;
> +	btrfs_set_eb_lock_owner(eb, 0);
>   	up_write(&eb->lock);
>   }
>
David Sterba Sept. 8, 2023, 12:56 a.m. UTC | #2
On Fri, Sep 08, 2023 at 08:11:43AM +0800, Qu Wenruo wrote:
> 
> 
> On 2023/9/8 07:09, David Sterba wrote:
> > The lock_owner is used for a rare corruption case and we haven't seen
> > any reports in years. Move it to the debugging section of eb.  To close
> > the holes also move log_index so the final layout looks like:
> 
> Just found some extra space we can squish out:

Yeah I have a few more patches to reduce extent buffer size.

> > struct extent_buffer {
> >          u64                        start;                /*     0     8 */
> >          long unsigned int          len;                  /*     8     8 */
> 
> u32 is definitely enough.
> (u16 is not enough for 64K nodesize, unfortunately)

One idea is to store the bit shift and then calculate the length on each
use but then we can simply not store the length in extent buffer at all
because it's always fs_info->nodesize. In some functions it can be
directly replaced by that, elsewhere it's needed to do
eb->fs_info->nodesize. One more pointer dereference but from likely a
cached memory, gaining 8 bytes.

> 
> >          long unsigned int          bflags;               /*    16     8 */
> 
> For now we don't need 64bit for bflags, 32bit is enough, but
> unfortunately that's what the existing test/set/clear_bit() requires...

Indeed, and it's not just here, in the inode or some other structures
that store a few flags an u32 would suffice. I have a prototype to add
atomic bit manipulations for a u32 type, but this requires manually
written assembly so for now it's just x86. This could be useful outside
of btrfs so I'd need to make it a proper API and get some feedback from
wider audience.

> >          struct btrfs_fs_info *     fs_info;              /*    24     8 */
> >          spinlock_t                 refs_lock;            /*    32     4 */
> >          atomic_t                   refs;                 /*    36     4 */
> >          int                        read_mirror;          /*    40     4 */
> 
> We don't really need int for read_mirror, but that would be another
> patch(set) to change them.

Yeah that's another potential space saved, I tried to track down all the
use of mirror values but it's passed around to bios and back.

> >          s8                         log_index;            /*    44     1 */

And log_index takes 3 values, this can be encoded into the flags, also a
prototype but the code becomes ugly.
Qu Wenruo Sept. 8, 2023, 1:29 a.m. UTC | #3
On 2023/9/8 08:56, David Sterba wrote:
> On Fri, Sep 08, 2023 at 08:11:43AM +0800, Qu Wenruo wrote:
>>
>>
>> On 2023/9/8 07:09, David Sterba wrote:
>>> The lock_owner is used for a rare corruption case and we haven't seen
>>> any reports in years. Move it to the debugging section of eb.  To close
>>> the holes also move log_index so the final layout looks like:
>>
>> Just found some extra space we can squish out:
>
> Yeah I have a few more patches to reduce extent buffer size.
>
>>> struct extent_buffer {
>>>           u64                        start;                /*     0     8 */
>>>           long unsigned int          len;                  /*     8     8 */
>>
>> u32 is definitely enough.
>> (u16 is not enough for 64K nodesize, unfortunately)
>
> One idea is to store the bit shift and then calculate the length on each
> use but then we can simply not store the length in extent buffer at all
> because it's always fs_info->nodesize. In some functions it can be
> directly replaced by that, elsewhere it's needed to do
> eb->fs_info->nodesize. One more pointer dereference but from likely a
> cached memory, gaining 8 bytes.

Oh, that's way better. We used to use dummy extent buffer with fixed
size (4K) for sys chunk array read, but since commit e959d3c1df3a
("btrfs: use dummy extent buffer for super block sys chunk array read")
we just use nodesize and leave the extra space untouched.

But it may be better to dig deeper, as I'm not 100% sure if every
(including dummy ones) extent buffer goes nodesize.

>
>>
>>>           long unsigned int          bflags;               /*    16     8 */
>>
>> For now we don't need 64bit for bflags, 32bit is enough, but
>> unfortunately that's what the existing test/set/clear_bit() requires...
>
> Indeed, and it's not just here, in the inode or some other structures
> that store a few flags an u32 would suffice. I have a prototype to add
> atomic bit manipulations for a u32 type, but this requires manually
> written assembly so for now it's just x86. This could be useful outside
> of btrfs so I'd need to make it a proper API and get some feedback from
> wider audience.
>
>>>           struct btrfs_fs_info *     fs_info;              /*    24     8 */
>>>           spinlock_t                 refs_lock;            /*    32     4 */
>>>           atomic_t                   refs;                 /*    36     4 */
>>>           int                        read_mirror;          /*    40     4 */
>>
>> We don't really need int for read_mirror, but that would be another
>> patch(set) to change them.
>
> Yeah that's another potential space saved, I tried to track down all the
> use of mirror values but it's passed around to bios and back.

My current plan is to go s8, which still gives us 127 mirrors.
Which should be enough for RAID6 rebuilds.

Although we may need extra attention to prevent incorrect width
truncation/padding, as we're going to support minus number for
scrub_logical ioctl to grab P/Q stripes directly.

Thanks,
Qu
>
>>>           s8                         log_index;            /*    44     1 */
>
> And log_index takes 3 values, this can be encoded into the flags, also a
> prototype but the code becomes ugly.
diff mbox series

Patch

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3e46bb4cc957..6f6838226fe7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4801,6 +4801,28 @@  int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Extra safety check in case the extent tree is corrupted and extent allocator
+ * chooses to use a tree block which is already used and locked.
+ */
+static bool check_eb_lock_owner(const struct extent_buffer *eb)
+{
+	if (eb->lock_owner == current->pid) {
+		btrfs_err_rl(eb->fs_info,
+"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
+			     eb->start, btrfs_header_owner(eb), current->pid);
+		return true;
+	}
+	return false;
+}
+#else
+static bool check_eb_lock_owner(struct extent_buffer *eb)
+{
+	return false;
+}
+#endif
+
 static struct extent_buffer *
 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		      u64 bytenr, int level, u64 owner,
@@ -4814,15 +4836,7 @@  btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	if (IS_ERR(buf))
 		return buf;
 
-	/*
-	 * Extra safety check in case the extent tree is corrupted and extent
-	 * allocator chooses to use a tree block which is already used and
-	 * locked.
-	 */
-	if (buf->lock_owner == current->pid) {
-		btrfs_err_rl(fs_info,
-"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
-			buf->start, btrfs_header_owner(buf), current->pid);
+	if (check_eb_lock_owner(buf)) {
 		free_extent_buffer(buf);
 		return ERR_PTR(-EUCLEAN);
 	}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 68368ba99321..2171057a4477 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -80,16 +80,16 @@  struct extent_buffer {
 	spinlock_t refs_lock;
 	atomic_t refs;
 	int read_mirror;
-	struct rcu_head rcu_head;
-	pid_t lock_owner;
 	/* >= 0 if eb belongs to a log tree, -1 otherwise */
 	s8 log_index;
+	struct rcu_head rcu_head;
 
 	struct rw_semaphore lock;
 
 	struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
 #ifdef CONFIG_BTRFS_DEBUG
 	struct list_head leak_list;
+	pid_t lock_owner;
 #endif
 };
 
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index c3128cdf1177..6ac4fd8cc8dc 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -103,6 +103,15 @@  void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buff
 
 #endif
 
+#ifdef CONFIG_BTRFS_DEBUG
+static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner)
+{
+	eb->lock_owner = owner;
+}
+#else
+static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { }
+#endif
+
 /*
  * Extent buffer locking
  * =====================
@@ -165,7 +174,7 @@  int btrfs_try_tree_read_lock(struct extent_buffer *eb)
 int btrfs_try_tree_write_lock(struct extent_buffer *eb)
 {
 	if (down_write_trylock(&eb->lock)) {
-		eb->lock_owner = current->pid;
+		btrfs_set_eb_lock_owner(eb, current->pid);
 		trace_btrfs_try_tree_write_lock(eb);
 		return 1;
 	}
@@ -198,7 +207,7 @@  void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
 		start_ns = ktime_get_ns();
 
 	down_write_nested(&eb->lock, nest);
-	eb->lock_owner = current->pid;
+	btrfs_set_eb_lock_owner(eb, current->pid);
 	trace_btrfs_tree_lock(eb, start_ns);
 }
 
@@ -213,7 +222,7 @@  void btrfs_tree_lock(struct extent_buffer *eb)
 void btrfs_tree_unlock(struct extent_buffer *eb)
 {
 	trace_btrfs_tree_unlock(eb);
-	eb->lock_owner = 0;
+	btrfs_set_eb_lock_owner(eb, 0);
 	up_write(&eb->lock);
 }