Message ID | 3f96e9c3d06ed846b19b63cd9001cb0c66bd8a00.1694126893.git.dsterba@suse.com (mailing list archive) |
---|---|
State | New, archived |
Series | Cleanups and struct packing |
On 2023/9/8 07:09, David Sterba wrote:
> The lock_owner is used for a rare corruption case and we haven't seen
> any reports in years. Move it to the debugging section of eb. To close
> the holes also move log_index so the final layout looks like:

Just found some extra space we can squish out:

> struct extent_buffer {
>         u64                        start;                /*     0     8 */
>         long unsigned int          len;                  /*     8     8 */

u32 is definitely enough.
(u16 is not enough for 64K nodesize, unfortunately)

>         long unsigned int          bflags;               /*    16     8 */

For now we don't need 64bit for bflags, 32bit is enough, but
unfortunately that's what the existing test/set/clear_bit() requires...

>         struct btrfs_fs_info *     fs_info;              /*    24     8 */
>         spinlock_t                 refs_lock;            /*    32     4 */
>         atomic_t                   refs;                 /*    36     4 */
>         int                        read_mirror;          /*    40     4 */

We don't really need int for read_mirror, but that would be another
patch(set) to change them.

>         s8                         log_index;            /*    44     1 */
>
>         /* XXX 3 bytes hole, try to pack */
>
>         struct callback_head       callback_head __attribute__((__aligned__(8))); /*    48    16 */
>         /* --- cacheline 1 boundary (64 bytes) --- */
>         struct rw_semaphore        lock;                 /*    64    40 */
>         struct page *              pages[16];            /*   104   128 */
>
>         /* size: 232, cachelines: 4, members: 11 */
>         /* sum members: 229, holes: 1, sum holes: 3 */
>         /* forced alignments: 1, forced holes: 1, sum forced holes: 3 */
>         /* last cacheline: 40 bytes */
> } __attribute__((__aligned__(8)));
>
> This saves 8 bytes in total and still keeps the lock on a separate
> cacheline.
>
> Signed-off-by: David Sterba <dsterba@suse.com>

Reviewed-by: Qu Wenruo <wqu@suse.com>

Thanks,
Qu

> ---
>  fs/btrfs/extent-tree.c | 32 +++++++++++++++++++++++---------
>  fs/btrfs/extent_io.h   |  4 ++--
>  fs/btrfs/locking.c     | 15 ++++++++++++---
>  3 files changed, 37 insertions(+), 14 deletions(-)
>
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 3e46bb4cc957..6f6838226fe7 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4801,6 +4801,28 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
>  	return ret;
>  }
>
> +#ifdef CONFIG_BTRFS_DEBUG
> +/*
> + * Extra safety check in case the extent tree is corrupted and extent allocator
> + * chooses to use a tree block which is already used and locked.
> + */
> +static bool check_eb_lock_owner(const struct extent_buffer *eb)
> +{
> +	if (eb->lock_owner == current->pid) {
> +		btrfs_err_rl(eb->fs_info,
> +"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
> +			     eb->start, btrfs_header_owner(eb), current->pid);
> +		return true;
> +	}
> +	return false;
> +}
> +#else
> +static bool check_eb_lock_owner(struct extent_buffer *eb)
> +{
> +	return false;
> +}
> +#endif
> +
>  static struct extent_buffer *
>  btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>  		      u64 bytenr, int level, u64 owner,
> @@ -4814,15 +4836,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>  	if (IS_ERR(buf))
>  		return buf;
>
> -	/*
> -	 * Extra safety check in case the extent tree is corrupted and extent
> -	 * allocator chooses to use a tree block which is already used and
> -	 * locked.
> -	 */
> -	if (buf->lock_owner == current->pid) {
> -		btrfs_err_rl(fs_info,
> -"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
> -			     buf->start, btrfs_header_owner(buf), current->pid);
> +	if (check_eb_lock_owner(buf)) {
>  		free_extent_buffer(buf);
>  		return ERR_PTR(-EUCLEAN);
>  	}
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index 68368ba99321..2171057a4477 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -80,16 +80,16 @@ struct extent_buffer {
>  	spinlock_t refs_lock;
>  	atomic_t refs;
>  	int read_mirror;
> -	struct rcu_head rcu_head;
> -	pid_t lock_owner;
>  	/* >= 0 if eb belongs to a log tree, -1 otherwise */
>  	s8 log_index;
> +	struct rcu_head rcu_head;
>
>  	struct rw_semaphore lock;
>
>  	struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
>  #ifdef CONFIG_BTRFS_DEBUG
>  	struct list_head leak_list;
> +	pid_t lock_owner;
>  #endif
>  };
>
> diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
> index c3128cdf1177..6ac4fd8cc8dc 100644
> --- a/fs/btrfs/locking.c
> +++ b/fs/btrfs/locking.c
> @@ -103,6 +103,15 @@ void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buff
>
>  #endif
>
> +#ifdef CONFIG_BTRFS_DEBUG
> +static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner)
> +{
> +	eb->lock_owner = owner;
> +}
> +#else
> +static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { }
> +#endif
> +
>  /*
>   * Extent buffer locking
>   * =====================
> @@ -165,7 +174,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
>  int btrfs_try_tree_write_lock(struct extent_buffer *eb)
>  {
>  	if (down_write_trylock(&eb->lock)) {
> -		eb->lock_owner = current->pid;
> +		btrfs_set_eb_lock_owner(eb, current->pid);
>  		trace_btrfs_try_tree_write_lock(eb);
>  		return 1;
>  	}
> @@ -198,7 +207,7 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
>  		start_ns = ktime_get_ns();
>
>  	down_write_nested(&eb->lock, nest);
> -	eb->lock_owner = current->pid;
> +	btrfs_set_eb_lock_owner(eb, current->pid);
>  	trace_btrfs_tree_lock(eb, start_ns);
>  }
>
> @@ -213,7 +222,7 @@ void btrfs_tree_lock(struct extent_buffer *eb)
>  void btrfs_tree_unlock(struct extent_buffer *eb)
>  {
>  	trace_btrfs_tree_unlock(eb);
> -	eb->lock_owner = 0;
> +	btrfs_set_eb_lock_owner(eb, 0);
>  	up_write(&eb->lock);
>  }
>
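The annotated struct layouts quoted throughout this thread are pahole dumps; presumably something like pahole -C extent_buffer run against the compiled btrfs objects reproduces them, though the thread itself does not say how they were generated.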
On Fri, Sep 08, 2023 at 08:11:43AM +0800, Qu Wenruo wrote:
>
> On 2023/9/8 07:09, David Sterba wrote:
> > The lock_owner is used for a rare corruption case and we haven't seen
> > any reports in years. Move it to the debugging section of eb. To close
> > the holes also move log_index so the final layout looks like:
>
> Just found some extra space we can squish out:

Yeah, I have a few more patches to reduce the extent buffer size.

> > struct extent_buffer {
> >         u64                        start;                /*     0     8 */
> >         long unsigned int          len;                  /*     8     8 */
>
> u32 is definitely enough.
> (u16 is not enough for 64K nodesize, unfortunately)

One idea is to store the bit shift and then calculate the length on each
use, but then we can simply not store the length in the extent buffer at
all, because it's always fs_info->nodesize. In some functions it can be
directly replaced by that; elsewhere it's needed to do
eb->fs_info->nodesize. That's one more pointer dereference, but likely
from cached memory, and it gains 8 bytes.

> >         long unsigned int          bflags;               /*    16     8 */
>
> For now we don't need 64bit for bflags, 32bit is enough, but
> unfortunately that's what the existing test/set/clear_bit() requires...

Indeed, and it's not just here: in the inode and some other structures
that store a few flags, a u32 would suffice. I have a prototype that adds
atomic bit manipulations for a u32 type, but this requires manually
written assembly, so for now it's x86 only. This could be useful outside
of btrfs, so I'd need to make it a proper API and get feedback from a
wider audience.

> >         struct btrfs_fs_info *     fs_info;              /*    24     8 */
> >         spinlock_t                 refs_lock;            /*    32     4 */
> >         atomic_t                   refs;                 /*    36     4 */
> >         int                        read_mirror;          /*    40     4 */
>
> We don't really need int for read_mirror, but that would be another
> patch(set) to change them.

Yeah, that's another potential space saving. I tried to track down all
the uses of the mirror values, but they're passed around to bios and
back.

> >         s8                         log_index;            /*    44     1 */

And log_index takes only 3 values, so it could be encoded into the
flags; also a prototype, but the code becomes ugly.
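The u32 bit-manipulation prototype is only described in words above. Purely as an illustrative, standalone sketch of the idea (the helper name, the typedef, and the exact asm are assumptions for this example, not an existing kernel API), an x86-only atomic set_bit on a 32-bit word could look like:

    #include <stdint.h>

    typedef uint32_t u32;

    /*
     * Hypothetical 32-bit atomic set_bit, x86 only. btsl operates on a
     * 32-bit operand, so the caller must keep nr < 32 or the instruction
     * would touch adjacent memory; the lock prefix makes the
     * read-modify-write atomic.
     */
    static inline void set_bit_u32(unsigned int nr, volatile u32 *addr)
    {
            asm volatile("lock; btsl %1, %0"
                         : "+m" (*addr)
                         : "Ir" (nr)
                         : "memory", "cc");
    }

The log_index-in-flags idea is likewise only sketched in prose. One hypothetical two-bit encoding (bit positions and helper names are invented here; real code would need genuinely unused bflags bits, and the plain read-modify-write below would race with the atomic bit helpers that also touch bflags, which is part of why the result gets ugly):

    #define EB_LOG_INDEX_SHIFT      30
    #define EB_LOG_INDEX_MASK       (3UL << EB_LOG_INDEX_SHIFT)

    /* Store log_index biased by +1 so the values -1, 0 and 1 fit in two bits. */
    static inline void eb_set_log_index(struct extent_buffer *eb, int index)
    {
            eb->bflags &= ~EB_LOG_INDEX_MASK;
            eb->bflags |= (unsigned long)(index + 1) << EB_LOG_INDEX_SHIFT;
    }

    static inline int eb_log_index(const struct extent_buffer *eb)
    {
            return (int)((eb->bflags & EB_LOG_INDEX_MASK) >> EB_LOG_INDEX_SHIFT) - 1;
    }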
On 2023/9/8 08:56, David Sterba wrote:
> On Fri, Sep 08, 2023 at 08:11:43AM +0800, Qu Wenruo wrote:
>> On 2023/9/8 07:09, David Sterba wrote:
>>> The lock_owner is used for a rare corruption case and we haven't seen
>>> any reports in years. Move it to the debugging section of eb. To close
>>> the holes also move log_index so the final layout looks like:
>>
>> Just found some extra space we can squish out:
>
> Yeah, I have a few more patches to reduce the extent buffer size.
>
>>> struct extent_buffer {
>>>         u64                        start;                /*     0     8 */
>>>         long unsigned int          len;                  /*     8     8 */
>>
>> u32 is definitely enough.
>> (u16 is not enough for 64K nodesize, unfortunately)
>
> One idea is to store the bit shift and then calculate the length on
> each use, but then we can simply not store the length in the extent
> buffer at all, because it's always fs_info->nodesize. In some functions
> it can be directly replaced by that; elsewhere it's needed to do
> eb->fs_info->nodesize. That's one more pointer dereference, but likely
> from cached memory, and it gains 8 bytes.

Oh, that's way better.

We used to use a dummy extent buffer with a fixed size (4K) for the sys
chunk array read, but since commit e959d3c1df3a ("btrfs: use dummy
extent buffer for super block sys chunk array read") we just use
nodesize and leave the extra space untouched.

But it may be better to dig deeper, as I'm not 100% sure whether every
extent buffer (including the dummy ones) uses nodesize.

>>>         long unsigned int          bflags;               /*    16     8 */
>>
>> For now we don't need 64bit for bflags, 32bit is enough, but
>> unfortunately that's what the existing test/set/clear_bit() requires...
>
> Indeed, and it's not just here: in the inode and some other structures
> that store a few flags, a u32 would suffice. I have a prototype that
> adds atomic bit manipulations for a u32 type, but this requires
> manually written assembly, so for now it's x86 only. This could be
> useful outside of btrfs, so I'd need to make it a proper API and get
> feedback from a wider audience.
>
>>>         struct btrfs_fs_info *     fs_info;              /*    24     8 */
>>>         spinlock_t                 refs_lock;            /*    32     4 */
>>>         atomic_t                   refs;                 /*    36     4 */
>>>         int                        read_mirror;          /*    40     4 */
>>
>> We don't really need int for read_mirror, but that would be another
>> patch(set) to change them.
>
> Yeah, that's another potential space saving. I tried to track down all
> the uses of the mirror values, but they're passed around to bios and
> back.

My current plan is to go with s8, which still gives us 127 mirrors; that
should be enough for RAID6 rebuilds. Although we may need extra
attention to prevent incorrect width truncation/padding, as we're going
to support negative numbers in the scrub_logical ioctl to grab P/Q
stripes directly.

Thanks,
Qu

>
>>>         s8                         log_index;            /*    44     1 */
>
> And log_index takes only 3 values, so it could be encoded into the
> flags; also a prototype, but the code becomes ugly.
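If len were dropped as discussed above, call sites could go through a trivial accessor instead. A minimal sketch of such a helper, assuming (as Qu cautions, still to be verified) that every extent buffer, including the dummy ones, spans exactly one nodesize; the helper name is invented for illustration:

    /*
     * Hypothetical replacement for eb->len: derive the length from
     * fs_info instead of storing it in every extent_buffer. One extra
     * pointer dereference, but fs_info is likely cache-hot, and it
     * frees 8 bytes in the eb.
     */
    static inline u32 extent_buffer_len(const struct extent_buffer *eb)
    {
            return eb->fs_info->nodesize;
    }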
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3e46bb4cc957..6f6838226fe7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4801,6 +4801,28 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }

+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Extra safety check in case the extent tree is corrupted and extent allocator
+ * chooses to use a tree block which is already used and locked.
+ */
+static bool check_eb_lock_owner(const struct extent_buffer *eb)
+{
+	if (eb->lock_owner == current->pid) {
+		btrfs_err_rl(eb->fs_info,
+"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
+			     eb->start, btrfs_header_owner(eb), current->pid);
+		return true;
+	}
+	return false;
+}
+#else
+static bool check_eb_lock_owner(struct extent_buffer *eb)
+{
+	return false;
+}
+#endif
+
 static struct extent_buffer *
 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		      u64 bytenr, int level, u64 owner,
@@ -4814,15 +4836,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	if (IS_ERR(buf))
 		return buf;

-	/*
-	 * Extra safety check in case the extent tree is corrupted and extent
-	 * allocator chooses to use a tree block which is already used and
-	 * locked.
-	 */
-	if (buf->lock_owner == current->pid) {
-		btrfs_err_rl(fs_info,
-"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
-			     buf->start, btrfs_header_owner(buf), current->pid);
+	if (check_eb_lock_owner(buf)) {
 		free_extent_buffer(buf);
 		return ERR_PTR(-EUCLEAN);
 	}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 68368ba99321..2171057a4477 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -80,16 +80,16 @@ struct extent_buffer {
 	spinlock_t refs_lock;
 	atomic_t refs;
 	int read_mirror;
-	struct rcu_head rcu_head;
-	pid_t lock_owner;
 	/* >= 0 if eb belongs to a log tree, -1 otherwise */
 	s8 log_index;
+	struct rcu_head rcu_head;

 	struct rw_semaphore lock;

 	struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
 #ifdef CONFIG_BTRFS_DEBUG
 	struct list_head leak_list;
+	pid_t lock_owner;
 #endif
 };

diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index c3128cdf1177..6ac4fd8cc8dc 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -103,6 +103,15 @@ void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buff

 #endif

+#ifdef CONFIG_BTRFS_DEBUG
+static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner)
+{
+	eb->lock_owner = owner;
+}
+#else
+static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { }
+#endif
+
 /*
  * Extent buffer locking
  * =====================
@@ -165,7 +174,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
 int btrfs_try_tree_write_lock(struct extent_buffer *eb)
 {
 	if (down_write_trylock(&eb->lock)) {
-		eb->lock_owner = current->pid;
+		btrfs_set_eb_lock_owner(eb, current->pid);
 		trace_btrfs_try_tree_write_lock(eb);
 		return 1;
 	}
@@ -198,7 +207,7 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
 		start_ns = ktime_get_ns();

 	down_write_nested(&eb->lock, nest);
-	eb->lock_owner = current->pid;
+	btrfs_set_eb_lock_owner(eb, current->pid);
 	trace_btrfs_tree_lock(eb, start_ns);
 }

@@ -213,7 +222,7 @@ void btrfs_tree_lock(struct extent_buffer *eb)
 void btrfs_tree_unlock(struct extent_buffer *eb)
 {
 	trace_btrfs_tree_unlock(eb);
-	eb->lock_owner = 0;
+	btrfs_set_eb_lock_owner(eb, 0);
 	up_write(&eb->lock);
 }
The lock_owner is used for a rare corruption case and we haven't seen
any reports in years. Move it to the debugging section of eb. To close
the holes also move log_index so the final layout looks like:

struct extent_buffer {
        u64                        start;                /*     0     8 */
        long unsigned int          len;                  /*     8     8 */
        long unsigned int          bflags;               /*    16     8 */
        struct btrfs_fs_info *     fs_info;              /*    24     8 */
        spinlock_t                 refs_lock;            /*    32     4 */
        atomic_t                   refs;                 /*    36     4 */
        int                        read_mirror;          /*    40     4 */
        s8                         log_index;            /*    44     1 */

        /* XXX 3 bytes hole, try to pack */

        struct callback_head       callback_head __attribute__((__aligned__(8))); /*    48    16 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        struct rw_semaphore        lock;                 /*    64    40 */
        struct page *              pages[16];            /*   104   128 */

        /* size: 232, cachelines: 4, members: 11 */
        /* sum members: 229, holes: 1, sum holes: 3 */
        /* forced alignments: 1, forced holes: 1, sum forced holes: 3 */
        /* last cacheline: 40 bytes */
} __attribute__((__aligned__(8)));

This saves 8 bytes in total and still keeps the lock on a separate
cacheline.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 32 +++++++++++++++++++++++---------
 fs/btrfs/extent_io.h   |  4 ++--
 fs/btrfs/locking.c     | 15 ++++++++++++---
 3 files changed, 37 insertions(+), 14 deletions(-)