diff mbox series

[03/62] mm: Split slab into its own type

Message ID 20211004134650.4031813-4-willy@infradead.org (mailing list archive)
State New
Headers show
Series Separate struct slab from struct page | expand

Commit Message

Matthew Wilcox (Oracle) Oct. 4, 2021, 1:45 p.m. UTC
Make struct slab independent of struct page.  It still uses the
underlying memory in struct page for storing slab-specific data,
but slab and slub can now be weaned off using struct page directly.
Some of the wrapper functions (slab_address() and slab_order())
still need to cast to struct page, but this is a significant
disentanglement.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/mm_types.h   | 56 +++++++++++++++++++++++++++++
 include/linux/page-flags.h | 29 +++++++++++++++
 mm/slab.h                  | 73 ++++++++++++++++++++++++++++++++++++++
 mm/slub.c                  |  8 ++---
 4 files changed, 162 insertions(+), 4 deletions(-)

Comments

David Hildenbrand Oct. 5, 2021, 4:10 p.m. UTC | #1
On 04.10.21 15:45, Matthew Wilcox (Oracle) wrote:
> Make struct slab independent of struct page.  It still uses the
> underlying memory in struct page for storing slab-specific data,
> but slab and slub can now be weaned off using struct page directly.
> Some of the wrapper functions (slab_address() and slab_order())
> still need to cast to struct page, but this is a significant
> disentanglement.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>   include/linux/mm_types.h   | 56 +++++++++++++++++++++++++++++
>   include/linux/page-flags.h | 29 +++++++++++++++
>   mm/slab.h                  | 73 ++++++++++++++++++++++++++++++++++++++
>   mm/slub.c                  |  8 ++---
>   4 files changed, 162 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 7f8ee09c711f..c2ea71aba84e 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -239,6 +239,62 @@ struct page {
>   #endif
>   } _struct_page_alignment;
>   
> +/* Reuses the bits in struct page */
> +struct slab {
> +	unsigned long flags;
> +	union {
> +		struct list_head slab_list;
> +		struct {	/* Partial pages */
> +			struct slab *next;
> +#ifdef CONFIG_64BIT
> +			int slabs;	/* Nr of slabs left */
> +			int pobjects;	/* Approximate count */
> +#else
> +			short int slabs;
> +			short int pobjects;
> +#endif
> +		};
> +		struct rcu_head rcu_head;
> +	};
> +	struct kmem_cache *slab_cache; /* not slob */
> +	/* Double-word boundary */
> +	void *freelist;		/* first free object */
> +	union {
> +		void *s_mem;	/* slab: first object */
> +		unsigned long counters;		/* SLUB */
> +		struct {			/* SLUB */
> +			unsigned inuse:16;
> +			unsigned objects:15;
> +			unsigned frozen:1;
> +		};
> +	};
> +
> +	union {
> +		unsigned int active;		/* SLAB */
> +		int units;			/* SLOB */
> +	};
> +	atomic_t _refcount;
> +#ifdef CONFIG_MEMCG
> +	unsigned long memcg_data;
> +#endif
> +};

My 2 cents just from reading the first 3 mails:

I'm not particularly happy about the "/* Reuses the bits in struct page 
*/" part of thingy here, essentially really having to pay attention what 
whenever we change something in "struct page" to not mess up all the 
other special types we have. And I wasn't particularly happy scanning 
patch #1 and #2 for the same reason. Can't we avoid that?

What I can see is that we want (and must right now for generic 
infrastructure) keep some members of the the struct page" (e.g., flags, 
_refcount) at the very same place, because generic infrastructure relies 
on them.

Maybe that has already been discussed somewhere deep down in folio mail 
threads, but I would have expected that we keep struct-page generic 
inside struct-page and only have inside "struct slab" what's special for 
"struct slab".

I would have thought that we want something like this (but absolutely 
not this):

struct page_header {
	unsigned long flags;
}

struct page_footer {
	atomic_t _refcount;
#ifdef CONFIG_MEMCG
	unsigned long memcg_data;
#endif
}

struct page {
	struct page_header header;
	uint8_t reserved[$DO_THE_MATH]
	struct page_footer footer;
};

struct slab {
	...
};

struct slab_page {
	struct page_header header;
	struct slab;
	struct page_footer footer;
};

Instead of providing helpers for struct slab_page, simply cast to struct 
page and replace the structs in struct slab_page by simple placeholders 
with the same size.

That would to me look like a nice cleanup itself, ignoring all the other 
parallel discussions that are going on. But I imagine the problem is 
more involved, and a simple header/footer might not be sufficient.
Matthew Wilcox (Oracle) Oct. 5, 2021, 6:48 p.m. UTC | #2
On Tue, Oct 05, 2021 at 06:10:24PM +0200, David Hildenbrand wrote:
> My 2 cents just from reading the first 3 mails:
> 
> I'm not particularly happy about the "/* Reuses the bits in struct page */"
> part of thingy here, essentially really having to pay attention what
> whenever we change something in "struct page" to not mess up all the other
> special types we have. And I wasn't particularly happy scanning patch #1 and
> #2 for the same reason. Can't we avoid that?

I've tried to mitigate that with the compile-time assertions.  They're
actually a bit stronger than what we have now.

> What I can see is that we want (and must right now for generic
> infrastructure) keep some members of the the struct page" (e.g., flags,
> _refcount) at the very same place, because generic infrastructure relies on
> them.
> 
> Maybe that has already been discussed somewhere deep down in folio mail
> threads, but I would have expected that we keep struct-page generic inside
> struct-page and only have inside "struct slab" what's special for "struct
> slab".
> 
> I would have thought that we want something like this (but absolutely not
> this):
> 
> struct page_header {
> 	unsigned long flags;
> }
> 
> struct page_footer {
> 	atomic_t _refcount;
> #ifdef CONFIG_MEMCG
> 	unsigned long memcg_data;
> #endif
> }
> 
> struct page {
> 	struct page_header header;
> 	uint8_t reserved[$DO_THE_MATH]
> 	struct page_footer footer;
> };

The problem with this definition is the number of places which refer
to page->flags and must now be churned to page->header.flags.
_refcount is rather better encapsulated, and I'm not entirely sure
how much we'd have to do for memcg_data.  Maybe that was what you meant
by "this but absolutely not this"?  I don't quite understand what that
was supposed to mean.

> struct slab {
> 	...
> };
> 
> struct slab_page {
> 	struct page_header header;
> 	struct slab;
> 	struct page_footer footer;
> };
> 
> Instead of providing helpers for struct slab_page, simply cast to struct
> page and replace the structs in struct slab_page by simple placeholders with
> the same size.
> 
> That would to me look like a nice cleanup itself, ignoring all the other
> parallel discussions that are going on. But I imagine the problem is more
> involved, and a simple header/footer might not be sufficient.

Yes, exactly, the problems are more involved.  The location/contents of
page->mapping are special, the contents of bit zero of the second word
are special (determines compound_head or not) and slub particularly
relies on the 128-bit alignment of the { freelist, counters } pair.

The ultimate destination (and I think Kent/Johannes/I all agree
on this) is to dynamically allocate struct slab.  At _that_ point,
we can actually drop _refcount from struct slab and even change how
struct slab is defined based on CONFIG_SLUB / CONFIG_SLOB / CONFIG_SLAB.
I think we'll still need ->flags to be the first element of struct slab,
but bit 0 of the second word stops being special, we won't need to care
about where page->mapping aliases, and the 128-bit alignment becomes
solely the concern of the slub allocator instead of affecting everyone
who uses struct page.
David Hildenbrand Oct. 12, 2021, 7:25 a.m. UTC | #3
On 05.10.21 20:48, Matthew Wilcox wrote:
> On Tue, Oct 05, 2021 at 06:10:24PM +0200, David Hildenbrand wrote:
>> My 2 cents just from reading the first 3 mails:
>>
>> I'm not particularly happy about the "/* Reuses the bits in struct page */"
>> part of thingy here, essentially really having to pay attention what
>> whenever we change something in "struct page" to not mess up all the other
>> special types we have. And I wasn't particularly happy scanning patch #1 and
>> #2 for the same reason. Can't we avoid that?
> 
> I've tried to mitigate that with the compile-time assertions.  They're
> actually a bit stronger than what we have now.

Sorry for the late reply.

Okay, that's at least something :)

> 
>> What I can see is that we want (and must right now for generic
>> infrastructure) keep some members of the the struct page" (e.g., flags,
>> _refcount) at the very same place, because generic infrastructure relies on
>> them.
>>
>> Maybe that has already been discussed somewhere deep down in folio mail
>> threads, but I would have expected that we keep struct-page generic inside
>> struct-page and only have inside "struct slab" what's special for "struct
>> slab".
>>
>> I would have thought that we want something like this (but absolutely not
>> this):
>>
>> struct page_header {
>> 	unsigned long flags;
>> }
>>
>> struct page_footer {
>> 	atomic_t _refcount;
>> #ifdef CONFIG_MEMCG
>> 	unsigned long memcg_data;
>> #endif
>> }
>>
>> struct page {
>> 	struct page_header header;
>> 	uint8_t reserved[$DO_THE_MATH]
>> 	struct page_footer footer;
>> };
> 
> The problem with this definition is the number of places which refer
> to page->flags and must now be churned to page->header.flags.
> _refcount is rather better encapsulated, and I'm not entirely sure
> how much we'd have to do for memcg_data.  Maybe that was what you meant
> by "this but absolutely not this"?  I don't quite understand what that
> was supposed to mean.

Exactly what you mentioned above (changing all callers) is what I didn't 
want :)

I was thinking of a way to have these "fixed fields" be defined only 
once, and ideally, perform any access to these fields via the "struct 
page" instead of via the "struct slab".

Like, when wanting to set a page flag with a slab, access them

((struct page *)slab)->flags

instead of using slab->flags. Essentially not duplicating these fields 
and accessing them via the "refined" page types, but only putting a 
"reserved" placeholder in. That would mean that we wouldn't need patch 
#1 and #2, because we wouldn't be passing around pgflags (using whatever 
fancy type we will decide on), all such accesses would continue going 
via the "struct page" -- because that's where these common fields 
actually reside in right now at places we cannot simply change -- 
because common infrastructure (PFN walkers, ...) heavily relyies on the 
flags, the refcount and even the mapcount (page types) being set 
accordingly.

> 
>> struct slab {
>> 	...
>> };
>>
>> struct slab_page {
>> 	struct page_header header;
>> 	struct slab;
>> 	struct page_footer footer;
>> };
>>
>> Instead of providing helpers for struct slab_page, simply cast to struct
>> page and replace the structs in struct slab_page by simple placeholders with
>> the same size.
>>
>> That would to me look like a nice cleanup itself, ignoring all the other
>> parallel discussions that are going on. But I imagine the problem is more
>> involved, and a simple header/footer might not be sufficient.
> 
> Yes, exactly, the problems are more involved.  The location/contents of
> page->mapping are special, the contents of bit zero of the second word
> are special (determines compound_head or not) and slub particularly
> relies on the 128-bit alignment of the { freelist, counters } pair.
> 
> The ultimate destination (and I think Kent/Johannes/I all agree
> on this) is to dynamically allocate struct slab.  At _that_ point,
> we can actually drop _refcount from struct slab and even change how
> struct slab is defined based on CONFIG_SLUB / CONFIG_SLOB / CONFIG_SLAB.

I'm also sold on the idea of simplifying "struct page" (even when not 
shrinking it!) by moving out these "struct slab" parts that dictate the 
"struct page" layout heavily.

> I think we'll still need ->flags to be the first element of struct slab,
> but bit 0 of the second word stops being special, we won't need to care
> about where page->mapping aliases, and the 128-bit alignment becomes
> solely the concern of the slub allocator instead of affecting everyone
> who uses struct page.

Yes, that sounds good to me.
Matthew Wilcox (Oracle) Oct. 12, 2021, 2:13 p.m. UTC | #4
On Tue, Oct 12, 2021 at 09:25:02AM +0200, David Hildenbrand wrote:
> > > What I can see is that we want (and must right now for generic
> > > infrastructure) keep some members of the the struct page" (e.g., flags,
> > > _refcount) at the very same place, because generic infrastructure relies on
> > > them.
> > > 
> > > Maybe that has already been discussed somewhere deep down in folio mail
> > > threads, but I would have expected that we keep struct-page generic inside
> > > struct-page and only have inside "struct slab" what's special for "struct
> > > slab".
> > > 
> > > I would have thought that we want something like this (but absolutely not
> > > this):
> > > 
> > > struct page_header {
> > > 	unsigned long flags;
> > > }
> > > 
> > > struct page_footer {
> > > 	atomic_t _refcount;
> > > #ifdef CONFIG_MEMCG
> > > 	unsigned long memcg_data;
> > > #endif
> > > }
> > > 
> > > struct page {
> > > 	struct page_header header;
> > > 	uint8_t reserved[$DO_THE_MATH]
> > > 	struct page_footer footer;
> > > };
> > 
> > The problem with this definition is the number of places which refer
> > to page->flags and must now be churned to page->header.flags.
> > _refcount is rather better encapsulated, and I'm not entirely sure
> > how much we'd have to do for memcg_data.  Maybe that was what you meant
> > by "this but absolutely not this"?  I don't quite understand what that
> > was supposed to mean.
> 
> Exactly what you mentioned above (changing all callers) is what I didn't
> want :)
> 
> I was thinking of a way to have these "fixed fields" be defined only once,
> and ideally, perform any access to these fields via the "struct page"
> instead of via the "struct slab".
> 
> Like, when wanting to set a page flag with a slab, access them
> 
> ((struct page *)slab)->flags
> 
> instead of using slab->flags. Essentially not duplicating these fields and
> accessing them via the "refined" page types, but only putting a "reserved"
> placeholder in. That would mean that we wouldn't need patch #1 and #2,
> because we wouldn't be passing around pgflags (using whatever fancy type we
> will decide on), all such accesses would continue going via the "struct
> page" -- because that's where these common fields actually reside in right
> now at places we cannot simply change -- because common infrastructure (PFN
> walkers, ...) heavily relyies on the flags, the refcount and even the
> mapcount (page types) being set accordingly.

OK, I think I see what you want:

struct slab {
	struct page_header _1;
	... slab specific stuff ...
	struct page_footer _2;
};

and then cast struct slab to struct page inside slab_nid(),
slab_address(), slab_pgdat(), slab_order() and so on.

To a certain extent, I think that's just an implementation detail; the
important part is the use of struct slab, and then calling slab_nid()
instead of page_nid().  A wrinkle here is that slab does use some of
the bits in page->flags for its own purposes, eg PG_active becomes
PG_pfmemalloc for PageSlab pages.

One of the things I did in the folio patches that I'm not too fond of
now is:

struct folio {
	union {
		struct {
			...
		};
		struct page page;
	};
};

so that I could do &folio->page instead of casting to struct page.
But maybe both of these approaches are just bad ideas, and I should do:

static inline void slab_clear_pfmemalloc(struct slab *slab)
{
	PageClearActive(slab_page(slab));
}

instead of the current:
	clear_bit(PG_pfmemalloc, &slab->flags);
David Hildenbrand Oct. 12, 2021, 2:17 p.m. UTC | #5
On 12.10.21 16:13, Matthew Wilcox wrote:
> On Tue, Oct 12, 2021 at 09:25:02AM +0200, David Hildenbrand wrote:
>>>> What I can see is that we want (and must right now for generic
>>>> infrastructure) keep some members of the the struct page" (e.g., flags,
>>>> _refcount) at the very same place, because generic infrastructure relies on
>>>> them.
>>>>
>>>> Maybe that has already been discussed somewhere deep down in folio mail
>>>> threads, but I would have expected that we keep struct-page generic inside
>>>> struct-page and only have inside "struct slab" what's special for "struct
>>>> slab".
>>>>
>>>> I would have thought that we want something like this (but absolutely not
>>>> this):
>>>>
>>>> struct page_header {
>>>> 	unsigned long flags;
>>>> }
>>>>
>>>> struct page_footer {
>>>> 	atomic_t _refcount;
>>>> #ifdef CONFIG_MEMCG
>>>> 	unsigned long memcg_data;
>>>> #endif
>>>> }
>>>>
>>>> struct page {
>>>> 	struct page_header header;
>>>> 	uint8_t reserved[$DO_THE_MATH]
>>>> 	struct page_footer footer;
>>>> };
>>>
>>> The problem with this definition is the number of places which refer
>>> to page->flags and must now be churned to page->header.flags.
>>> _refcount is rather better encapsulated, and I'm not entirely sure
>>> how much we'd have to do for memcg_data.  Maybe that was what you meant
>>> by "this but absolutely not this"?  I don't quite understand what that
>>> was supposed to mean.
>>
>> Exactly what you mentioned above (changing all callers) is what I didn't
>> want :)
>>
>> I was thinking of a way to have these "fixed fields" be defined only once,
>> and ideally, perform any access to these fields via the "struct page"
>> instead of via the "struct slab".
>>
>> Like, when wanting to set a page flag with a slab, access them
>>
>> ((struct page *)slab)->flags
>>
>> instead of using slab->flags. Essentially not duplicating these fields and
>> accessing them via the "refined" page types, but only putting a "reserved"
>> placeholder in. That would mean that we wouldn't need patch #1 and #2,
>> because we wouldn't be passing around pgflags (using whatever fancy type we
>> will decide on), all such accesses would continue going via the "struct
>> page" -- because that's where these common fields actually reside in right
>> now at places we cannot simply change -- because common infrastructure (PFN
>> walkers, ...) heavily relyies on the flags, the refcount and even the
>> mapcount (page types) being set accordingly.
> 
> OK, I think I see what you want:
> 
> struct slab {
> 	struct page_header _1;
> 	... slab specific stuff ...
> 	struct page_footer _2;
> };
> 
> and then cast struct slab to struct page inside slab_nid(),
> slab_address(), slab_pgdat(), slab_order() and so on.
> 
> To a certain extent, I think that's just an implementation detail; the
> important part is the use of struct slab, and then calling slab_nid()
> instead of page_nid().  A wrinkle here is that slab does use some of
> the bits in page->flags for its own purposes, eg PG_active becomes
> PG_pfmemalloc for PageSlab pages.
> 
> One of the things I did in the folio patches that I'm not too fond of
> now is:
> 
> struct folio {
> 	union {
> 		struct {
> 			...
> 		};
> 		struct page page;
> 	};
> };
> 
> so that I could do &folio->page instead of casting to struct page.
> But maybe both of these approaches are just bad ideas, and I should do:
> 
> static inline void slab_clear_pfmemalloc(struct slab *slab)
> {
> 	PageClearActive(slab_page(slab));
> }


Yes, that's what I meant.
Johannes Weiner Oct. 13, 2021, 6:08 p.m. UTC | #6
On Tue, Oct 12, 2021 at 04:17:29PM +0200, David Hildenbrand wrote:
> On 12.10.21 16:13, Matthew Wilcox wrote:
> > One of the things I did in the folio patches that I'm not too fond of
> > now is:
> > 
> > struct folio {
> > 	union {
> > 		struct {
> > 			...
> > 		};
> > 		struct page page;
> > 	};
> > };
> > 
> > so that I could do &folio->page instead of casting to struct page.
> > But maybe both of these approaches are just bad ideas, and I should do:
> > 
> > static inline void slab_clear_pfmemalloc(struct slab *slab)
> > {
> > 	PageClearActive(slab_page(slab));
> > }
> 
> 
> Yes, that's what I meant.

That looks great to me. It abstracts a slab attribute, but is very
clear in how it implements that. The dependency between the data
structures is more obvious this way than with unions, which I think
will be really useful in further refactoring iterations.

Btw, I think slab_nid() is an interesting thing when it comes to page
polymorphy. We want to know the nid for all sorts of memory types:
slab, file, anon, buddy etc. In the goal of distilling page down to
the fewest number of bytes, this is probably something that should
remain in the page rather than be replicated in all subtypes.
Matthew Wilcox (Oracle) Oct. 13, 2021, 6:31 p.m. UTC | #7
On Wed, Oct 13, 2021 at 02:08:57PM -0400, Johannes Weiner wrote:
> Btw, I think slab_nid() is an interesting thing when it comes to page
> polymorphy. We want to know the nid for all sorts of memory types:
> slab, file, anon, buddy etc. In the goal of distilling page down to
> the fewest number of bytes, this is probably something that should
> remain in the page rather than be replicated in all subtypes.

Oh, this is a really interesting point.

Node ID is typically 10 bits (I checked Debian & Oracle configs for
various architectures).  That's far more than we can store in the bottom
bits of a single word, and it's even a good chunk of a second word.

I was assuming that, for the page allocator's memory descriptor and for
that of many allocators (such as slab), it would be stored *somewhere*
in the memory descriptor.  It wouldn't necessarily have to be the same
place for all memory descriptors, and maybe (if it's accessed rarely),
we delegate finding it to the page allocator's knowledge.

But not all memory descriptors want/need/can know this.  For example,
vmalloc() might well spread its memory across multiple nodes.  As long
as we can restore the node assignment again once the pages are vfree(),
there's no particular need for the vmalloc memory descriptor to know
what node an individual page came from (and the concept of asking
vmalloc what node a particular allocation came from is potentially
nonsense, unless somebody used vmalloc_node() or one of the variants).

Not sure there's an obviously right answer here.  I was assuming that at
first we'd enforce memdesc->flags being the first word of every memory
descriptor and so we could keep passing page->flags around.  That could
then change later, but it'd be a good first step?
David Hildenbrand Oct. 14, 2021, 7:22 a.m. UTC | #8
On 13.10.21 20:31, Matthew Wilcox wrote:
> On Wed, Oct 13, 2021 at 02:08:57PM -0400, Johannes Weiner wrote:
>> Btw, I think slab_nid() is an interesting thing when it comes to page
>> polymorphy. We want to know the nid for all sorts of memory types:
>> slab, file, anon, buddy etc. In the goal of distilling page down to
>> the fewest number of bytes, this is probably something that should
>> remain in the page rather than be replicated in all subtypes.
> 
> Oh, this is a really interesting point.
> 
> Node ID is typically 10 bits (I checked Debian & Oracle configs for
> various architectures).  That's far more than we can store in the bottom
> bits of a single word, and it's even a good chunk of a second word.
> 
> I was assuming that, for the page allocator's memory descriptor and for
> that of many allocators (such as slab), it would be stored *somewhere*
> in the memory descriptor.  It wouldn't necessarily have to be the same
> place for all memory descriptors, and maybe (if it's accessed rarely),
> we delegate finding it to the page allocator's knowledge.
> 
> But not all memory descriptors want/need/can know this.  For example,
> vmalloc() might well spread its memory across multiple nodes.  As long
> as we can restore the node assignment again once the pages are vfree(),
> there's no particular need for the vmalloc memory descriptor to know
> what node an individual page came from (and the concept of asking
> vmalloc what node a particular allocation came from is potentially
> nonsense, unless somebody used vmalloc_node() or one of the variants).
> 
> Not sure there's an obviously right answer here.  I was assuming that at
> first we'd enforce memdesc->flags being the first word of every memory
> descriptor and so we could keep passing page->flags around.  That could
> then change later, but it'd be a good first step?
> 

<offtopic>
It's really hard to make an educated guess here without having a full
design proposal of what we actually want to achieve and especially how
we're going to treat all the corner cases (as raised already in
different context).

I'm all for simplifying struct page and *eventually* being able to
shrink it, even if we end up only shrinking by a little. However, I'm
not sold on doing that by any means (e.g., I cannot agree to any
fundamental page allocator rewrite without an idea what it does to
performance but also complexity). We might always have a space vs.
performance cost and saving space by sacrificing performance isn't
necessarily always a good idea. But again, it's really hard to make an
educated guess.

Again, I'm all for cleanups and simplifications as long as they really
make things cleaner. So I'm going to comment on the current state and
how the cleanups make sense with the current state.
</offtopic>

Node/zone is a property of a base page and belongs into struct page OR
has to be very easily accessible without any kind of heavy locking. The
node/zone is determined once memory gets exposed to the system (e.g., to
the buddy during boot or during memory onlining) and is stable until
memory is offlined again (as of right now, one could imagine changing
zones at runtime).

For example, node/zone information is required for (almost) lockless PFN
walkers in memory offlining context, to figure out if all pages we're
dealing with belong to one node/zone, but also to properly shrink
zones+nodes to eventually be able to offline complete nodes. I recall
that there are other PFN walkers (page compaction) that need this
information easily accessible.
Johannes Weiner Oct. 14, 2021, 12:44 p.m. UTC | #9
On Thu, Oct 14, 2021 at 09:22:13AM +0200, David Hildenbrand wrote:
> On 13.10.21 20:31, Matthew Wilcox wrote:
> > On Wed, Oct 13, 2021 at 02:08:57PM -0400, Johannes Weiner wrote:
> >> Btw, I think slab_nid() is an interesting thing when it comes to page
> >> polymorphy. We want to know the nid for all sorts of memory types:
> >> slab, file, anon, buddy etc. In the goal of distilling page down to
> >> the fewest number of bytes, this is probably something that should
> >> remain in the page rather than be replicated in all subtypes.
> > 
> > Oh, this is a really interesting point.
> > 
> > Node ID is typically 10 bits (I checked Debian & Oracle configs for
> > various architectures).  That's far more than we can store in the bottom
> > bits of a single word, and it's even a good chunk of a second word.
> > 
> > I was assuming that, for the page allocator's memory descriptor and for
> > that of many allocators (such as slab), it would be stored *somewhere*
> > in the memory descriptor.  It wouldn't necessarily have to be the same
> > place for all memory descriptors, and maybe (if it's accessed rarely),
> > we delegate finding it to the page allocator's knowledge.
> > 
> > But not all memory descriptors want/need/can know this.  For example,
> > vmalloc() might well spread its memory across multiple nodes.  As long
> > as we can restore the node assignment again once the pages are vfree(),
> > there's no particular need for the vmalloc memory descriptor to know
> > what node an individual page came from (and the concept of asking
> > vmalloc what node a particular allocation came from is potentially
> > nonsense, unless somebody used vmalloc_node() or one of the variants).
> > 
> > Not sure there's an obviously right answer here.  I was assuming that at
> > first we'd enforce memdesc->flags being the first word of every memory
> > descriptor and so we could keep passing page->flags around.  That could
> > then change later, but it'd be a good first step?
> > 
> 
> <offtopic>
> It's really hard to make an educated guess here without having a full
> design proposal of what we actually want to achieve and especially how
> we're going to treat all the corner cases (as raised already in
> different context).
> 
> I'm all for simplifying struct page and *eventually* being able to
> shrink it, even if we end up only shrinking by a little. However, I'm
> not sold on doing that by any means (e.g., I cannot agree to any
> fundamental page allocator rewrite without an idea what it does to
> performance but also complexity). We might always have a space vs.
> performance cost and saving space by sacrificing performance isn't
> necessarily always a good idea. But again, it's really hard to make an
> educated guess.
> 
> Again, I'm all for cleanups and simplifications as long as they really
> make things cleaner. So I'm going to comment on the current state and
> how the cleanups make sense with the current state.
> </offtopic>

Very well put, and not off-topic at all.

A clearer overarching design proposal that exists in more than one
head, that people agree on, and that is concrete enough to allow
others to make educated guesses on how random snippets of code would
or should look like in the new world, would help immensely.

(This applies here, but to a certain degree to folio as well.)

HOWEVER, given the scope of struct page, I also think this is a very
difficult problem. There are a LOT of nooks and crannies that throw
curveballs at these refactors. We're finding new ones daily.

I think smaller iterations, with a manageable amount of novelty, that
can be reviewed and judged by everybody involved - against the current
code base, rather than against diverging future speculation - make
more sense. These can still push the paradigm in the direction we
want, but we can work out kinks in the overarching ideas as we go.

I think what isn't going to work is committing vast amounts of code to
open-ended projects and indefinite transitory chaos that makes the
codebase go through every conceptual dead end along the way. There are
just too many users and too much development work against each release
of the kernel nowadays to operate like this.

> Node/zone is a property of a base page and belongs into struct page OR
> has to be very easily accessible without any kind of heavy locking. The
> node/zone is determined once memory gets exposed to the system (e.g., to
> the buddy during boot or during memory onlining) and is stable until
> memory is offlined again (as of right now, one could imagine changing
> zones at runtime).

Yes, it's a property of the page frame, for which struct page is the
descriptor. Even if we could get away with stuffing them into the
higher-level memory descriptors, it's inevitable that this will lead
to duplication, make adding new types more cumbersome, and get in the
way of writing generic code that works on those shared attributes.

struct page really is the right place for it.

> For example, node/zone information is required for (almost) lockless PFN
> walkers in memory offlining context, to figure out if all pages we're
> dealing with belong to one node/zone, but also to properly shrink
> zones+nodes to eventually be able to offline complete nodes. I recall
> that there are other PFN walkers (page compaction) that need this
> information easily accessible.

kmemleak is another one of them.

page_ext is another lower-level plumbing thing. (Although this one we
*may* be able to incorporate into dynamically allocated higher-level
descriptors. Another future idea that is worth keeping on the map, but
need to be careful not to make assumptions on today.)

Then there is generic code that doesn't care about higher-level types:
vmstats comes to mind, the generic list_lru code, page table walkers
(gup, numa balancing, per-task numa stats), ...
Matthew Wilcox (Oracle) Oct. 14, 2021, 1:08 p.m. UTC | #10
On Thu, Oct 14, 2021 at 08:44:39AM -0400, Johannes Weiner wrote:
> A clearer overarching design proposal that exists in more than one
> head, that people agree on, and that is concrete enough to allow
> others to make educated guesses on how random snippets of code would
> or should look like in the new world, would help immensely.
> 
> (This applies here, but to a certain degree to folio as well.)

Folios really are a simple design proposal: They're a page which is not
a tail page.  I didn't want to do any of this work on slabs and, and,
and, and.  I want to get back to working on large pages in the page cache.

Yes, the same principles can also be used to split new types (such
as slab) out of struct page, but I don't want to be working on any of
that!  I don't even want to be working on separately allocated memory
descriptors.

I agree it's a lot less churn to split slab out of page first, then
convert the remaining page users to folios than it is to convert sl[aou]b
to folios, then later split it apart into its own type.
diff mbox series

Patch

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7f8ee09c711f..c2ea71aba84e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -239,6 +239,62 @@  struct page {
 #endif
 } _struct_page_alignment;
 
+/* Reuses the bits in struct page */
+struct slab {
+	unsigned long flags;
+	union {
+		struct list_head slab_list;
+		struct {	/* Partial pages */
+			struct slab *next;
+#ifdef CONFIG_64BIT
+			int slabs;	/* Nr of slabs left */
+			int pobjects;	/* Approximate count */
+#else
+			short int slabs;
+			short int pobjects;
+#endif
+		};
+		struct rcu_head rcu_head;
+	};
+	struct kmem_cache *slab_cache; /* not slob */
+	/* Double-word boundary */
+	void *freelist;		/* first free object */
+	union {
+		void *s_mem;	/* slab: first object */
+		unsigned long counters;		/* SLUB */
+		struct {			/* SLUB */
+			unsigned inuse:16;
+			unsigned objects:15;
+			unsigned frozen:1;
+		};
+	};
+
+	union {
+		unsigned int active;		/* SLAB */
+		int units;			/* SLOB */
+	};
+	atomic_t _refcount;
+#ifdef CONFIG_MEMCG
+	unsigned long memcg_data;
+#endif
+};
+
+#define SLAB_MATCH(pg, sl)						\
+	static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
+SLAB_MATCH(flags, flags);
+SLAB_MATCH(compound_head, slab_list);	/* Ensure bit 0 is clear */
+SLAB_MATCH(slab_list, slab_list);
+SLAB_MATCH(rcu_head, rcu_head);
+SLAB_MATCH(slab_cache, slab_cache);
+SLAB_MATCH(s_mem, s_mem);
+SLAB_MATCH(active, active);
+SLAB_MATCH(_refcount, _refcount);
+#ifdef CONFIG_MEMCG
+SLAB_MATCH(memcg_data, memcg_data);
+#endif
+#undef SLAB_MATCH
+static_assert(sizeof(struct slab) <= sizeof(struct page));
+
 static inline atomic_t *compound_mapcount_ptr(struct page *page)
 {
 	return &page[1].compound_mapcount;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a558d67ee86f..57bdb1eb2f29 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -165,6 +165,8 @@  enum pageflags {
 	/* Remapped by swiotlb-xen. */
 	PG_xen_remapped = PG_owner_priv_1,
 
+	/* SLAB / SLUB / SLOB */
+	PG_pfmemalloc = PG_active,
 	/* SLOB */
 	PG_slob_free = PG_private,
 
@@ -193,6 +195,33 @@  static inline unsigned long _compound_head(const struct page *page)
 
 #define compound_head(page)	((typeof(page))_compound_head(page))
 
+/**
+ * page_slab - Converts from page to slab.
+ * @p: The page.
+ *
+ * This function cannot be called on a NULL pointer.  It can be called
+ * on a non-slab page; the caller should check is_slab() to be sure
+ * that the slab really is a slab.
+ *
+ * Return: The slab which contains this page.
+ */
+#define page_slab(p)		(_Generic((p),				\
+	const struct page *:	(const struct slab *)_compound_head(p),	\
+	struct page *:		(struct slab *)_compound_head(p)))
+
+/**
+ * slab_page - The first struct page allocated for a slab
+ * @slab: The slab.
+ *
+ * Slabs are allocated as one-or-more pages.  It is occasionally necessary
+ * to convert back to a struct page in order to communicate with the rest
+ * of the mm.  Please use this helper function instead of casting yourself,
+ * as the implementation may change in the future.
+ */
+#define slab_page(s)		(_Generic((s),				\
+	const struct slab *:	(const struct page *)s,			\
+	struct slab *:		(struct page *)s))
+
 static __always_inline int PageTail(struct page *page)
 {
 	return READ_ONCE(page->compound_head) & 1;
diff --git a/mm/slab.h b/mm/slab.h
index 58c01a34e5b8..54b05f4d9eb5 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -5,6 +5,79 @@ 
  * Internal slab definitions
  */
 
+/*
+ * Does this memory belong to a slab cache?  Slub can return page allocator
+ * memory for certain size allocations.
+ */
+static inline bool slab_test_cache(const struct slab *slab)
+{
+	return test_bit(PG_slab, &slab->flags);
+}
+
+static inline bool slab_test_multi_page(const struct slab *slab)
+{
+	return test_bit(PG_head, &slab->flags);
+}
+
+/*
+ * If network-based swap is enabled, sl*b must keep track of whether pages
+ * were allocated from pfmemalloc reserves.
+ */
+static inline bool slab_test_pfmemalloc(const struct slab *slab)
+{
+	return test_bit(PG_pfmemalloc, &slab->flags);
+}
+
+static inline void slab_set_pfmemalloc(struct slab *slab)
+{
+	set_bit(PG_pfmemalloc, &slab->flags);
+}
+
+static inline void slab_clear_pfmemalloc(struct slab *slab)
+{
+	clear_bit(PG_pfmemalloc, &slab->flags);
+}
+
+static inline void __slab_clear_pfmemalloc(struct slab *slab)
+{
+	__clear_bit(PG_pfmemalloc, &slab->flags);
+}
+
+static inline void *slab_address(const struct slab *slab)
+{
+	return page_address(slab_page(slab));
+}
+
+static inline int slab_nid(const struct slab *slab)
+{
+	return pgflags_nid(slab->flags);
+}
+
+static inline pg_data_t *slab_pgdat(const struct slab *slab)
+{
+	return NODE_DATA(slab_nid(slab));
+}
+
+static inline struct slab *virt_to_slab(const void *addr)
+{
+	struct page *page = virt_to_page(addr);
+
+	return page_slab(page);
+}
+
+static inline int slab_order(const struct slab *slab)
+{
+	if (!slab_test_multi_page(slab))
+		return 0;
+	return ((struct page *)slab)[1].compound_order;
+}
+
+static inline size_t slab_size(const struct slab *slab)
+{
+	return PAGE_SIZE << slab_order(slab);
+}
+
+
 #ifdef CONFIG_SLOB
 /*
  * Common fields provided in kmem_cache by all slab allocators
diff --git a/mm/slub.c b/mm/slub.c
index 3d2025f7163b..7e429a31b326 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3755,7 +3755,7 @@  static unsigned int slub_min_objects;
  * requested a higher minimum order then we start with that one instead of
  * the smallest order which will fit the object.
  */
-static inline unsigned int slab_order(unsigned int size,
+static inline unsigned int calc_slab_order(unsigned int size,
 		unsigned int min_objects, unsigned int max_order,
 		unsigned int fract_leftover)
 {
@@ -3819,7 +3819,7 @@  static inline int calculate_order(unsigned int size)
 
 		fraction = 16;
 		while (fraction >= 4) {
-			order = slab_order(size, min_objects,
+			order = calc_slab_order(size, min_objects,
 					slub_max_order, fraction);
 			if (order <= slub_max_order)
 				return order;
@@ -3832,14 +3832,14 @@  static inline int calculate_order(unsigned int size)
 	 * We were unable to place multiple objects in a slab. Now
 	 * lets see if we can place a single object there.
 	 */
-	order = slab_order(size, 1, slub_max_order, 1);
+	order = calc_slab_order(size, 1, slub_max_order, 1);
 	if (order <= slub_max_order)
 		return order;
 
 	/*
 	 * Doh this slab cannot be placed using slub_max_order.
 	 */
-	order = slab_order(size, 1, MAX_ORDER, 1);
+	order = calc_slab_order(size, 1, MAX_ORDER, 1);
 	if (order < MAX_ORDER)
 		return order;
 	return -ENOSYS;