diff mbox series

[POC/RFE] : Avoid silently growing struct file due to SLAB_TYPESAFE_BY_RCU

Message ID 20240826-benahm-sorgenfrei-245a41a9acb4@brauner (mailing list archive)
State New
Headers show
Series [POC/RFE] : Avoid silently growing struct file due to SLAB_TYPESAFE_BY_RCU | expand

Commit Message

Christian Brauner Aug. 26, 2024, 9:23 a.m. UTC
I've managed to shrink struct file to 192 bytes aka three cachelines.
What I wasn't aware of is that SLAB_TYPESAFE_BY_RCU silently grows the
struct by another cacheline to accomodate the freelist pointer as
pointed out by Vlastimil.

It's not too bad because we mostly care about "runtime" size but it's
still unfortunate that we're wasting a whole additional cacheline for no
good reason.

So I would like to find a way to avoid that. Below you see a hack
because adding a parameter to kmem_cache_create() seemed to require me
to fiddle with mm/slub.c and I'd rather not unless I'm told how.

For the POC I'm using the offset into the union at the end of struct
file so that SLUB places the freelist pointer into the union during
kmem_cache_free().

I'm just looking for ideas from mm to figure out an API where we can say
"use that as the freelist pointer in the struct" or at least a reason
why that's just not possible.

Sketched-by: Christian Brauner <brauner@kernel.org>
---
 fs/file_table.c    | 3 ++-
 include/linux/fs.h | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

Comments

Christian Brauner Aug. 26, 2024, 9:25 a.m. UTC | #1
[+ Jann]

On Mon, Aug 26, 2024 at 10:41:07AM GMT, Christian Brauner wrote:
> I've managed to shrink struct file to 192 bytes aka three cachelines.
> What I wasn't aware of is that SLAB_TYPESAFE_BY_RCU silently grows the
> struct by another cacheline to accomodate the freelist pointer as
> pointed out by Vlastimil.
> 
> It's not too bad because we mostly care about "runtime" size but it's
> still unfortunate that we're wasting a whole additional cacheline for no
> good reason.
> 
> So I would like to find a way to avoid that. Below you see a hack
> because adding a parameter to kmem_cache_create() seemed to require me
> to fiddle with mm/slub.c and I'd rather not unless I'm told how.
> 
> For the POC I'm using the offset into the union at the end of struct
> file so that SLUB places the freelist pointer into the union during
> kmem_cache_free().
> 
> I'm just looking for ideas from mm to figure out an API where we can say
> "use that as the freelist pointer in the struct" or at least a reason
> why that's just not possible.
> 
> Sketched-by: Christian Brauner <brauner@kernel.org>
> ---
>  fs/file_table.c    | 3 ++-
>  include/linux/fs.h | 2 ++
>  2 files changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/file_table.c b/fs/file_table.c
> index 694199a1a966..7a5fac7d4bbd 100644
> --- a/fs/file_table.c
> +++ b/fs/file_table.c
> @@ -514,7 +514,8 @@ EXPORT_SYMBOL(__fput_sync);
>  
>  void __init files_init(void)
>  {
> -	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
> +	filp_cachep = kmem_cache_create("filp",
> +				offsetof(struct file, __f_slab_rcu), 0,
>  				SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
>  				SLAB_PANIC | SLAB_ACCOUNT, NULL);
>  	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 36bb8456eb38..6c3664d624a0 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1010,6 +1010,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
>   * @f_task_work: task work entry point
>   * @f_llist: work queue entrypoint
>   * @f_ra: file's readahead state
> + * @f_slab_rcu: freelist pointer for SLAB_TYPESAFE_BY_RCU
>   */
>  struct file {
>  	atomic_long_t			f_count;
> @@ -1041,6 +1042,7 @@ struct file {
>  		struct callback_head	f_task_work;
>  		struct llist_node	f_llist;
>  		struct file_ra_state	f_ra;
> +		void			*__f_slab_rcu;
>  	};
>  	/* --- cacheline 3 boundary (192 bytes) --- */
>  } __randomize_layout
> -- 
> 2.45.2
diff mbox series

Patch

diff --git a/fs/file_table.c b/fs/file_table.c
index 694199a1a966..7a5fac7d4bbd 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -514,7 +514,8 @@  EXPORT_SYMBOL(__fput_sync);
 
 void __init files_init(void)
 {
-	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
+	filp_cachep = kmem_cache_create("filp",
+				offsetof(struct file, __f_slab_rcu), 0,
 				SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
 				SLAB_PANIC | SLAB_ACCOUNT, NULL);
 	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 36bb8456eb38..6c3664d624a0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1010,6 +1010,7 @@  static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
  * @f_task_work: task work entry point
  * @f_llist: work queue entrypoint
  * @f_ra: file's readahead state
+ * @f_slab_rcu: freelist pointer for SLAB_TYPESAFE_BY_RCU
  */
 struct file {
 	atomic_long_t			f_count;
@@ -1041,6 +1042,7 @@  struct file {
 		struct callback_head	f_task_work;
 		struct llist_node	f_llist;
 		struct file_ra_state	f_ra;
+		void			*__f_slab_rcu;
 	};
 	/* --- cacheline 3 boundary (192 bytes) --- */
 } __randomize_layout