diff mbox series

[net-next,v13,07/14] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'

Message ID 20240808123714.462740-8-linyunsheng@huawei.com (mailing list archive)
State Deferred
Delegated to: Netdev Maintainers
Headers show
Series Replace page_frag with page_frag_cache for sk_page_frag() | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 616 this patch: 616
netdev/build_tools success Errors and warnings before: 9 this patch: 9
netdev/cc_maintainers warning 1 maintainers not CCed: chuck.lever@oracle.com
netdev/build_clang success Errors and warnings before: 1106 this patch: 1106
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 15220 this patch: 15220
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 203 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-08-09--09-00 (tests: 705)

Commit Message

Yunsheng Lin Aug. 8, 2024, 12:37 p.m. UTC
Currently there is one 'struct page_frag' for every 'struct
sock' and 'struct task_struct', we are about to replace the
'struct page_frag' with 'struct page_frag_cache' for them.
Before beginning the replacement, we need to ensure the size of
'struct page_frag_cache' is not bigger than the size of
'struct page_frag', as there may be tens of thousands of
'struct sock' and 'struct task_struct' instances in the
system.

By or'ing the page order & pfmemalloc with the lower bits of
'va' instead of using 'u16' or 'u32' for page size and 'u8'
for pfmemalloc, we are able to avoid 3 or 5 bytes of space waste.
And since the page address & pfmemalloc & order are unchanged for
the same page in the same 'page_frag_cache' instance, it makes
sense to fit them together.

After this patch, the size of 'struct page_frag_cache' should be
the same as the size of 'struct page_frag'.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/mm_types_task.h   | 16 +++++-----
 include/linux/page_frag_cache.h | 52 +++++++++++++++++++++++++++++++--
 mm/page_frag_cache.c            | 49 +++++++++++++++++--------------
 3 files changed, 85 insertions(+), 32 deletions(-)

Comments

Alexander Duyck Aug. 14, 2024, 4:13 p.m. UTC | #1
On Thu, 2024-08-08 at 20:37 +0800, Yunsheng Lin wrote:
> Currently there is one 'struct page_frag' for every 'struct
> sock' and 'struct task_struct', we are about to replace the
> 'struct page_frag' with 'struct page_frag_cache' for them.
> Before begin the replacing, we need to ensure the size of
> 'struct page_frag_cache' is not bigger than the size of
> 'struct page_frag', as there may be tens of thousands of
> 'struct sock' and 'struct task_struct' instances in the
> system.
> 
> By or'ing the page order & pfmemalloc with lower bits of
> 'va' instead of using 'u16' or 'u32' for page size and 'u8'
> for pfmemalloc, we are able to avoid 3 or 5 bytes space waste.
> And page address & pfmemalloc & order is unchanged for the
> same page in the same 'page_frag_cache' instance, it makes
> sense to fit them together.
> 
> After this patch, the size of 'struct page_frag_cache' should be
> the same as the size of 'struct page_frag'.
> 
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/linux/mm_types_task.h   | 16 +++++-----
>  include/linux/page_frag_cache.h | 52 +++++++++++++++++++++++++++++++--
>  mm/page_frag_cache.c            | 49 +++++++++++++++++--------------
>  3 files changed, 85 insertions(+), 32 deletions(-)
> 
> diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
> index b1c54b2b9308..f2610112a642 100644
> --- a/include/linux/mm_types_task.h
> +++ b/include/linux/mm_types_task.h
> @@ -50,18 +50,18 @@ struct page_frag {
>  #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
>  #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
>  struct page_frag_cache {
> -	void *va;
> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +	/* encoded_va consists of the virtual address, pfmemalloc bit and order
> +	 * of a page.
> +	 */
> +	unsigned long encoded_va;
> +

Rather than calling this an "encoded_va" we might want to call this an
"encoded_page" as that would be closer to what we are actually working
with. We are just using the virtual address as the page pointer instead
of the page struct itself since we need quicker access to the virtual
address than we do the page struct.

> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>  	__u16 remaining;
> -	__u16 size;
> +	__u16 pagecnt_bias;
>  #else
>  	__u32 remaining;
> +	__u32 pagecnt_bias;
>  #endif
> -	/* we maintain a pagecount bias, so that we dont dirty cache line
> -	 * containing page->_refcount every time we allocate a fragment.
> -	 */
> -	unsigned int		pagecnt_bias;
> -	bool pfmemalloc;
>  };
>  
>  /* Track pages that require TLB flushes */
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> index 7c9125a9aed3..4ce924eaf1b1 100644
> --- a/include/linux/page_frag_cache.h
> +++ b/include/linux/page_frag_cache.h
> @@ -3,18 +3,66 @@
>  #ifndef _LINUX_PAGE_FRAG_CACHE_H
>  #define _LINUX_PAGE_FRAG_CACHE_H
>  
> +#include <linux/bits.h>
> +#include <linux/build_bug.h>
>  #include <linux/log2.h>
>  #include <linux/types.h>
>  #include <linux/mm_types_task.h>
>  
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +/* Use a full byte here to enable assembler optimization as the shift
> + * operation is usually expecting a byte.
> + */
> +#define PAGE_FRAG_CACHE_ORDER_MASK		GENMASK(7, 0)
> +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT		BIT(8)
> +#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT	8
> +#else
> +/* Compiler should be able to figure out we don't read things as any value
> + * ANDed with 0 is 0.
> + */
> +#define PAGE_FRAG_CACHE_ORDER_MASK		0
> +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT		BIT(0)
> +#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT	0
> +#endif
> +

You should probably pull out PAGE_FRAG_CACHE_PFMEMALLOC_BIT and just
define it as:
#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT \
	BIT(PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT)

That way there is no risk of the bit and the shift somehow getting
split up and being different values.

> +static inline unsigned long encode_aligned_va(void *va, unsigned int order,
> +					      bool pfmemalloc)

Rather than passing the virtual address it might make more sense to
pass the page. With that you know it should be PAGE_SIZE aligned versus
just being passed some random virtual address.

> +{
> +	BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_FRAG_CACHE_ORDER_MASK);
> +	BUILD_BUG_ON(PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT >= PAGE_SHIFT);

Rather than test the shift I would test the bit versus PAGE_SIZE.

> +
> +	return (unsigned long)va | (order & PAGE_FRAG_CACHE_ORDER_MASK) |
> +		((unsigned long)pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
> +}
> +
> +static inline unsigned long encoded_page_order(unsigned long encoded_va)
> +{
> +	return encoded_va & PAGE_FRAG_CACHE_ORDER_MASK;
> +}
> +
> +static inline bool encoded_page_pfmemalloc(unsigned long encoded_va)
> +{
> +	return !!(encoded_va & PAGE_FRAG_CACHE_PFMEMALLOC_BIT);
> +}
> +
> +static inline void *encoded_page_address(unsigned long encoded_va)
> +{
> +	return (void *)(encoded_va & PAGE_MASK);
> +}
> +

This is one of the reasons why I am thinking "encoded_page" might be a
better name for it. The 3 functions above all have their equivalent for
a page struct but we pulled that data out and packed it all into the
encoded_page.
Yunsheng Lin Aug. 15, 2024, 3:10 a.m. UTC | #2
On 2024/8/15 0:13, Alexander H Duyck wrote:
> On Thu, 2024-08-08 at 20:37 +0800, Yunsheng Lin wrote:
>> Currently there is one 'struct page_frag' for every 'struct
>> sock' and 'struct task_struct', we are about to replace the
>> 'struct page_frag' with 'struct page_frag_cache' for them.
>> Before begin the replacing, we need to ensure the size of
>> 'struct page_frag_cache' is not bigger than the size of
>> 'struct page_frag', as there may be tens of thousands of
>> 'struct sock' and 'struct task_struct' instances in the
>> system.
>>
>> By or'ing the page order & pfmemalloc with lower bits of
>> 'va' instead of using 'u16' or 'u32' for page size and 'u8'
>> for pfmemalloc, we are able to avoid 3 or 5 bytes space waste.
>> And page address & pfmemalloc & order is unchanged for the
>> same page in the same 'page_frag_cache' instance, it makes
>> sense to fit them together.
>>
>> After this patch, the size of 'struct page_frag_cache' should be
>> the same as the size of 'struct page_frag'.
>>
>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>> ---
>>  include/linux/mm_types_task.h   | 16 +++++-----
>>  include/linux/page_frag_cache.h | 52 +++++++++++++++++++++++++++++++--
>>  mm/page_frag_cache.c            | 49 +++++++++++++++++--------------
>>  3 files changed, 85 insertions(+), 32 deletions(-)
>>
>> diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
>> index b1c54b2b9308..f2610112a642 100644
>> --- a/include/linux/mm_types_task.h
>> +++ b/include/linux/mm_types_task.h
>> @@ -50,18 +50,18 @@ struct page_frag {
>>  #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
>>  #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
>>  struct page_frag_cache {
>> -	void *va;
>> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> +	/* encoded_va consists of the virtual address, pfmemalloc bit and order
>> +	 * of a page.
>> +	 */
>> +	unsigned long encoded_va;
>> +
> 
> Rather than calling this an "encoded_va" we might want to call this an
> "encoded_page" as that would be closer to what we are actually working
> with. We are just using the virtual address as the page pointer instead
> of the page struct itself since we need quicker access to the virtual
> address than we do the page struct.

Calling it "encoded_page" seems confusing enough when calling virt_to_page()
with "encoded_page" when virt_to_page() is expecting a 'va', no?

>
Alexander Duyck Aug. 15, 2024, 3:03 p.m. UTC | #3
On Wed, Aug 14, 2024 at 8:10 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/8/15 0:13, Alexander H Duyck wrote:
> > On Thu, 2024-08-08 at 20:37 +0800, Yunsheng Lin wrote:
> >> Currently there is one 'struct page_frag' for every 'struct
> >> sock' and 'struct task_struct', we are about to replace the
> >> 'struct page_frag' with 'struct page_frag_cache' for them.
> >> Before begin the replacing, we need to ensure the size of
> >> 'struct page_frag_cache' is not bigger than the size of
> >> 'struct page_frag', as there may be tens of thousands of
> >> 'struct sock' and 'struct task_struct' instances in the
> >> system.
> >>
> >> By or'ing the page order & pfmemalloc with lower bits of
> >> 'va' instead of using 'u16' or 'u32' for page size and 'u8'
> >> for pfmemalloc, we are able to avoid 3 or 5 bytes space waste.
> >> And page address & pfmemalloc & order is unchanged for the
> >> same page in the same 'page_frag_cache' instance, it makes
> >> sense to fit them together.
> >>
> >> After this patch, the size of 'struct page_frag_cache' should be
> >> the same as the size of 'struct page_frag'.
> >>
> >> CC: Alexander Duyck <alexander.duyck@gmail.com>
> >> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> >> ---
> >>  include/linux/mm_types_task.h   | 16 +++++-----
> >>  include/linux/page_frag_cache.h | 52 +++++++++++++++++++++++++++++++--
> >>  mm/page_frag_cache.c            | 49 +++++++++++++++++--------------
> >>  3 files changed, 85 insertions(+), 32 deletions(-)
> >>
> >> diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
> >> index b1c54b2b9308..f2610112a642 100644
> >> --- a/include/linux/mm_types_task.h
> >> +++ b/include/linux/mm_types_task.h
> >> @@ -50,18 +50,18 @@ struct page_frag {
> >>  #define PAGE_FRAG_CACHE_MAX_SIZE    __ALIGN_MASK(32768, ~PAGE_MASK)
> >>  #define PAGE_FRAG_CACHE_MAX_ORDER   get_order(PAGE_FRAG_CACHE_MAX_SIZE)
> >>  struct page_frag_cache {
> >> -    void *va;
> >> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> >> +    /* encoded_va consists of the virtual address, pfmemalloc bit and order
> >> +     * of a page.
> >> +     */
> >> +    unsigned long encoded_va;
> >> +
> >
> > Rather than calling this an "encoded_va" we might want to call this an
> > "encoded_page" as that would be closer to what we are actually working
> > with. We are just using the virtual address as the page pointer instead
> > of the page struct itself since we need quicker access to the virtual
> > address than we do the page struct.
>
> Calling it "encoded_page" seems confusing enough when calling virt_to_page()
> with "encoded_page" when virt_to_page() is expecting a 'va', no?

It makes about as much sense as calling it an "encoded_va". What you
have is essentially a packed page struct that contains the virtual
address, pfmemalloc flag, and order. So if you want you could call it
"packed_page" too I suppose. Basically this isn't a valid virtual
address it is a page pointer with some extra metadata packed in.
Yunsheng Lin Aug. 16, 2024, 11:55 a.m. UTC | #4
On 2024/8/15 23:03, Alexander Duyck wrote:
> On Wed, Aug 14, 2024 at 8:10 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2024/8/15 0:13, Alexander H Duyck wrote:
>>> On Thu, 2024-08-08 at 20:37 +0800, Yunsheng Lin wrote:
>>>> Currently there is one 'struct page_frag' for every 'struct
>>>> sock' and 'struct task_struct', we are about to replace the
>>>> 'struct page_frag' with 'struct page_frag_cache' for them.
>>>> Before begin the replacing, we need to ensure the size of
>>>> 'struct page_frag_cache' is not bigger than the size of
>>>> 'struct page_frag', as there may be tens of thousands of
>>>> 'struct sock' and 'struct task_struct' instances in the
>>>> system.
>>>>
>>>> By or'ing the page order & pfmemalloc with lower bits of
>>>> 'va' instead of using 'u16' or 'u32' for page size and 'u8'
>>>> for pfmemalloc, we are able to avoid 3 or 5 bytes space waste.
>>>> And page address & pfmemalloc & order is unchanged for the
>>>> same page in the same 'page_frag_cache' instance, it makes
>>>> sense to fit them together.
>>>>
>>>> After this patch, the size of 'struct page_frag_cache' should be
>>>> the same as the size of 'struct page_frag'.
>>>>
>>>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>>>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>>>> ---
>>>>  include/linux/mm_types_task.h   | 16 +++++-----
>>>>  include/linux/page_frag_cache.h | 52 +++++++++++++++++++++++++++++++--
>>>>  mm/page_frag_cache.c            | 49 +++++++++++++++++--------------
>>>>  3 files changed, 85 insertions(+), 32 deletions(-)
>>>>
>>>> diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
>>>> index b1c54b2b9308..f2610112a642 100644
>>>> --- a/include/linux/mm_types_task.h
>>>> +++ b/include/linux/mm_types_task.h
>>>> @@ -50,18 +50,18 @@ struct page_frag {
>>>>  #define PAGE_FRAG_CACHE_MAX_SIZE    __ALIGN_MASK(32768, ~PAGE_MASK)
>>>>  #define PAGE_FRAG_CACHE_MAX_ORDER   get_order(PAGE_FRAG_CACHE_MAX_SIZE)
>>>>  struct page_frag_cache {
>>>> -    void *va;
>>>> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>>> +    /* encoded_va consists of the virtual address, pfmemalloc bit and order
>>>> +     * of a page.
>>>> +     */
>>>> +    unsigned long encoded_va;
>>>> +
>>>
>>> Rather than calling this an "encoded_va" we might want to call this an
>>> "encoded_page" as that would be closer to what we are actually working
>>> with. We are just using the virtual address as the page pointer instead
>>> of the page struct itself since we need quicker access to the virtual
>>> address than we do the page struct.
>>
>> Calling it "encoded_page" seems confusing enough when calling virt_to_page()
>> with "encoded_page" when virt_to_page() is expecting a 'va', no?
> 
> It makes about as much sense as calling it an "encoded_va". What you
> have is essentially a packed page struct that contains the virtual
> address, pfmemalloc flag, and order. So if you want you could call it
> "packed_page" too I suppose. Basically this isn't a valid virtual
> address it is a page pointer with some extra metadata packed in.

I think we all agreed that it is not a valid virtual address by adding
the 'encoded_' part.
I am not really sure if "encoded_page" or "packed_page" is better than
'encoded_va' here, as there is no 'page pointer' that is implied by
"encoded_page" or "packed_page" here. For 'encoded_va', at least there
is 'virtual address' that is implied by 'encoded_va', and that 'virtual
address' just happen to be page pointer.

Yes, you may say the 'pfmemalloc flag and order' part is about the page, not
about 'va'; I guess there is a trade-off we need to make here if there is
not a perfect name for it, and 'va' does occupy most bits of 'encoded_va'.
Alexander Duyck Aug. 19, 2024, 4 p.m. UTC | #5
On Fri, Aug 16, 2024 at 4:56 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/8/15 23:03, Alexander Duyck wrote:
> > On Wed, Aug 14, 2024 at 8:10 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:
> >>
> >> On 2024/8/15 0:13, Alexander H Duyck wrote:
> >>> On Thu, 2024-08-08 at 20:37 +0800, Yunsheng Lin wrote:
> >>>> Currently there is one 'struct page_frag' for every 'struct
> >>>> sock' and 'struct task_struct', we are about to replace the
> >>>> 'struct page_frag' with 'struct page_frag_cache' for them.
> >>>> Before begin the replacing, we need to ensure the size of
> >>>> 'struct page_frag_cache' is not bigger than the size of
> >>>> 'struct page_frag', as there may be tens of thousands of
> >>>> 'struct sock' and 'struct task_struct' instances in the
> >>>> system.
> >>>>
> >>>> By or'ing the page order & pfmemalloc with lower bits of
> >>>> 'va' instead of using 'u16' or 'u32' for page size and 'u8'
> >>>> for pfmemalloc, we are able to avoid 3 or 5 bytes space waste.
> >>>> And page address & pfmemalloc & order is unchanged for the
> >>>> same page in the same 'page_frag_cache' instance, it makes
> >>>> sense to fit them together.
> >>>>
> >>>> After this patch, the size of 'struct page_frag_cache' should be
> >>>> the same as the size of 'struct page_frag'.
> >>>>
> >>>> CC: Alexander Duyck <alexander.duyck@gmail.com>
> >>>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> >>>> ---
> >>>>  include/linux/mm_types_task.h   | 16 +++++-----
> >>>>  include/linux/page_frag_cache.h | 52 +++++++++++++++++++++++++++++++--
> >>>>  mm/page_frag_cache.c            | 49 +++++++++++++++++--------------
> >>>>  3 files changed, 85 insertions(+), 32 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
> >>>> index b1c54b2b9308..f2610112a642 100644
> >>>> --- a/include/linux/mm_types_task.h
> >>>> +++ b/include/linux/mm_types_task.h
> >>>> @@ -50,18 +50,18 @@ struct page_frag {
> >>>>  #define PAGE_FRAG_CACHE_MAX_SIZE    __ALIGN_MASK(32768, ~PAGE_MASK)
> >>>>  #define PAGE_FRAG_CACHE_MAX_ORDER   get_order(PAGE_FRAG_CACHE_MAX_SIZE)
> >>>>  struct page_frag_cache {
> >>>> -    void *va;
> >>>> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> >>>> +    /* encoded_va consists of the virtual address, pfmemalloc bit and order
> >>>> +     * of a page.
> >>>> +     */
> >>>> +    unsigned long encoded_va;
> >>>> +
> >>>
> >>> Rather than calling this an "encoded_va" we might want to call this an
> >>> "encoded_page" as that would be closer to what we are actually working
> >>> with. We are just using the virtual address as the page pointer instead
> >>> of the page struct itself since we need quicker access to the virtual
> >>> address than we do the page struct.
> >>
> >> Calling it "encoded_page" seems confusing enough when calling virt_to_page()
> >> with "encoded_page" when virt_to_page() is expecting a 'va', no?
> >
> > It makes about as much sense as calling it an "encoded_va". What you
> > have is essentially a packed page struct that contains the virtual
> > address, pfmemalloc flag, and order. So if you want you could call it
> > "packed_page" too I suppose. Basically this isn't a valid virtual
> > address it is a page pointer with some extra metadata packed in.
>
> I think we are all argeed that is not a valid virtual address by adding
> the 'encoded_' part.
> I am not really sure if "encoded_page" or "packed_page" is better than
> 'encoded_va' here, as there is no 'page pointer' that is implied by
> "encoded_page" or "packed_page" here. For 'encoded_va', at least there
> is 'virtual address' that is implied by 'encoded_va', and that 'virtual
> address' just happen to be page pointer.

Basically we are using the page's virtual address to encode the page
into the struct. If you look, "virtual" is a pointer stored in the
page to provide the virtual address on some architectures. It also
happens that we have virt_to_page which provides an easy way to get
back and forth between the values.

> Yes, you may say the 'pfmemalloc flag and order' part is about page, not
> about 'va', I guess there is trade-off we need to make here if there is
> not a perfect name for it and 'va' does occupy most bits of 'encoded_va'.

The naming isn't really a show stopper one way or another. It was more
the fact that you had several functions accessing it that were using
the name "encoded_page" as I recall. That is why I thought it might
make sense to rename it to that. Why have functions called
"encoded_page_order" work with an "encoded_va" versus an
"encoded_page". It makes it easier to logically lump them all
together.
diff mbox series

Patch

diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index b1c54b2b9308..f2610112a642 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -50,18 +50,18 @@  struct page_frag {
 #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
 #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
 struct page_frag_cache {
-	void *va;
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	/* encoded_va consists of the virtual address, pfmemalloc bit and order
+	 * of a page.
+	 */
+	unsigned long encoded_va;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
 	__u16 remaining;
-	__u16 size;
+	__u16 pagecnt_bias;
 #else
 	__u32 remaining;
+	__u32 pagecnt_bias;
 #endif
-	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_refcount every time we allocate a fragment.
-	 */
-	unsigned int		pagecnt_bias;
-	bool pfmemalloc;
 };
 
 /* Track pages that require TLB flushes */
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 7c9125a9aed3..4ce924eaf1b1 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -3,18 +3,66 @@ 
 #ifndef _LINUX_PAGE_FRAG_CACHE_H
 #define _LINUX_PAGE_FRAG_CACHE_H
 
+#include <linux/bits.h>
+#include <linux/build_bug.h>
 #include <linux/log2.h>
 #include <linux/types.h>
 #include <linux/mm_types_task.h>
 
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+/* Use a full byte here to enable assembler optimization as the shift
+ * operation is usually expecting a byte.
+ */
+#define PAGE_FRAG_CACHE_ORDER_MASK		GENMASK(7, 0)
+#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT		BIT(8)
+#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT	8
+#else
+/* Compiler should be able to figure out we don't read things as any value
+ * ANDed with 0 is 0.
+ */
+#define PAGE_FRAG_CACHE_ORDER_MASK		0
+#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT		BIT(0)
+#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT	0
+#endif
+
+static inline unsigned long encode_aligned_va(void *va, unsigned int order,
+					      bool pfmemalloc)
+{
+	BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_FRAG_CACHE_ORDER_MASK);
+	BUILD_BUG_ON(PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT >= PAGE_SHIFT);
+
+	return (unsigned long)va | (order & PAGE_FRAG_CACHE_ORDER_MASK) |
+		((unsigned long)pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
+}
+
+static inline unsigned long encoded_page_order(unsigned long encoded_va)
+{
+	return encoded_va & PAGE_FRAG_CACHE_ORDER_MASK;
+}
+
+static inline bool encoded_page_pfmemalloc(unsigned long encoded_va)
+{
+	return !!(encoded_va & PAGE_FRAG_CACHE_PFMEMALLOC_BIT);
+}
+
+static inline void *encoded_page_address(unsigned long encoded_va)
+{
+	return (void *)(encoded_va & PAGE_MASK);
+}
+
 static inline void page_frag_cache_init(struct page_frag_cache *nc)
 {
-	nc->va = NULL;
+	nc->encoded_va = 0;
 }
 
 static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
 {
-	return !!nc->pfmemalloc;
+	return encoded_page_pfmemalloc(nc->encoded_va);
+}
+
+static inline unsigned int page_frag_cache_page_size(unsigned long encoded_va)
+{
+	return PAGE_SIZE << encoded_page_order(encoded_va);
 }
 
 void page_frag_cache_drain(struct page_frag_cache *nc);
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 70fb6dead624..2544b292375a 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -22,6 +22,7 @@ 
 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 					     gfp_t gfp_mask)
 {
+	unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER;
 	struct page *page = NULL;
 	gfp_t gfp = gfp_mask;
 
@@ -30,23 +31,31 @@  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
 	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
 				PAGE_FRAG_CACHE_MAX_ORDER);
-	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
 #endif
-	if (unlikely(!page))
+	if (unlikely(!page)) {
 		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+		if (unlikely(!page)) {
+			nc->encoded_va = 0;
+			return NULL;
+		}
 
-	nc->va = page ? page_address(page) : NULL;
+		order = 0;
+	}
+
+	nc->encoded_va = encode_aligned_va(page_address(page), order,
+					   page_is_pfmemalloc(page));
 
 	return page;
 }
 
 void page_frag_cache_drain(struct page_frag_cache *nc)
 {
-	if (!nc->va)
+	if (!nc->encoded_va)
 		return;
 
-	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
-	nc->va = NULL;
+	__page_frag_cache_drain(virt_to_head_page((void *)nc->encoded_va),
+				nc->pagecnt_bias);
+	nc->encoded_va = 0;
 }
 EXPORT_SYMBOL(page_frag_cache_drain);
 
@@ -63,33 +72,29 @@  void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 				 unsigned int fragsz, gfp_t gfp_mask,
 				 unsigned int align_mask)
 {
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	unsigned int size = nc->size;
-#else
-	unsigned int size = PAGE_SIZE;
-#endif
-	unsigned int remaining;
+	unsigned long encoded_va = nc->encoded_va;
+	unsigned int size, remaining;
 	struct page *page;
 
-	if (unlikely(!nc->va)) {
+	if (unlikely(!encoded_va)) {
 refill:
 		page = __page_frag_cache_refill(nc, gfp_mask);
 		if (!page)
 			return NULL;
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
+		encoded_va = nc->encoded_va;
+		size = page_frag_cache_page_size(encoded_va);
+
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
 		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
 
 		/* reset page count bias and remaining to start of new frag */
-		nc->pfmemalloc = page_is_pfmemalloc(page);
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
 		nc->remaining = size;
+	} else {
+		size = page_frag_cache_page_size(encoded_va);
 	}
 
 	remaining = nc->remaining & align_mask;
@@ -107,13 +112,13 @@  void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 			return NULL;
 		}
 
-		page = virt_to_page(nc->va);
+		page = virt_to_page((void *)encoded_va);
 
 		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
 			goto refill;
 
-		if (unlikely(nc->pfmemalloc)) {
-			free_unref_page(page, compound_order(page));
+		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
+			free_unref_page(page, encoded_page_order(encoded_va));
 			goto refill;
 		}
 
@@ -128,7 +133,7 @@  void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 	nc->pagecnt_bias--;
 	nc->remaining = remaining - fragsz;
 
-	return nc->va + (size - remaining);
+	return encoded_page_address(encoded_va) + (size - remaining);
 }
 EXPORT_SYMBOL(__page_frag_alloc_va_align);