[net-next,v22,10/14] mm: page_frag: introduce prepare/probe/commit API

Message ID 20241018105351.1960345-11-linyunsheng@huawei.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Series Replace page_frag with page_frag_cache for sk_page_frag()

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 44 this patch: 44
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers success CCed 4 of 4 maintainers
netdev/build_clang success Errors and warnings before: 90 this patch: 90
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 5017 this patch: 5017
netdev/checkpatch warning WARNING: Do not crash the kernel unless it is absolutely unavoidable--use WARN_ON_ONCE() plus recovery code (if feasible) instead of BUG() or variants WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-10-19--00-00 (tests: 777)

Commit Message

Yunsheng Lin Oct. 18, 2024, 10:53 a.m. UTC
There are many use cases that need a minimum amount of memory in order
to make forward progress, but perform better if more memory is
available, or that need to probe the cache info so that any available
memory can be used for frag coalescing.

Currently the skb_page_frag_refill() API is used to handle the above
use cases, but the caller needs to know about the internal details and
access the data fields of 'struct page_frag' directly to meet those
requirements, and its implementation is similar to the one in the mm
subsystem.

To unify those two page_frag implementations, introduce a prepare API
that ensures the minimum amount of memory is available and returns how
much memory is actually available to the caller, and a probe API that
reports the currently available memory to the caller without refilling
the cache. The caller then either calls the commit API to report how
much memory it actually used, or skips the commit if it decides not to
use any memory.
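A minimal usage sketch of the refill prepare & commit flow (illustrative
only; 'nc', 'iter', 'bytes' and 'min_sz' are placeholders, and the
iov_iter copy is just an example consumer):

    struct page_frag pfrag;
    unsigned int used;
    void *va;

    /* make sure at least min_sz bytes are available, refilling if needed */
    va = page_frag_alloc_refill_prepare(nc, min_sz, &pfrag, GFP_KERNEL);
    if (!va)
            return -ENOMEM;

    /* up to pfrag.size bytes (>= min_sz) can be used starting at va */
    used = min_t(unsigned int, pfrag.size, bytes);
    if (!copy_from_iter_full(va, used, iter))
            return -EFAULT; /* nothing committed, the cache is unchanged */

    /* report how much of the prepared area was actually consumed */
    page_frag_commit(nc, &pfrag, used);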

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/page_frag_cache.h | 130 ++++++++++++++++++++++++++++++++
 mm/page_frag_cache.c            |  21 ++++++
 2 files changed, 151 insertions(+)

Comments

Alexander Duyck Oct. 18, 2024, 6:03 p.m. UTC | #1
On Fri, Oct 18, 2024 at 4:00 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> There are many use cases that need a minimum amount of memory in order
> to make forward progress, but perform better if more memory is
> available, or that need to probe the cache info so that any available
> memory can be used for frag coalescing.
>
> Currently the skb_page_frag_refill() API is used to handle the above
> use cases, but the caller needs to know about the internal details and
> access the data fields of 'struct page_frag' directly to meet those
> requirements, and its implementation is similar to the one in the mm
> subsystem.
>
> To unify those two page_frag implementations, introduce a prepare API
> that ensures the minimum amount of memory is available and returns how
> much memory is actually available to the caller, and a probe API that
> reports the currently available memory to the caller without refilling
> the cache. The caller then either calls the commit API to report how
> much memory it actually used, or skips the commit if it decides not to
> use any memory.
>
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/linux/page_frag_cache.h | 130 ++++++++++++++++++++++++++++++++
>  mm/page_frag_cache.c            |  21 ++++++
>  2 files changed, 151 insertions(+)
>
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> index feed99d0cddb..1c0c11250b66 100644
> --- a/include/linux/page_frag_cache.h
> +++ b/include/linux/page_frag_cache.h
> @@ -46,6 +46,10 @@ void *__page_frag_cache_prepare(struct page_frag_cache *nc, unsigned int fragsz,
>  unsigned int __page_frag_cache_commit_noref(struct page_frag_cache *nc,
>                                             struct page_frag *pfrag,
>                                             unsigned int used_sz);
> +void *__page_frag_alloc_refill_probe_align(struct page_frag_cache *nc,
> +                                          unsigned int fragsz,
> +                                          struct page_frag *pfrag,
> +                                          unsigned int align_mask);
>
>  static inline unsigned int __page_frag_cache_commit(struct page_frag_cache *nc,
>                                                     struct page_frag *pfrag,
> @@ -88,6 +92,132 @@ static inline void *page_frag_alloc(struct page_frag_cache *nc,
>         return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
>  }
>
> +static inline bool __page_frag_refill_align(struct page_frag_cache *nc,
> +                                           unsigned int fragsz,
> +                                           struct page_frag *pfrag,
> +                                           gfp_t gfp_mask,
> +                                           unsigned int align_mask)
> +{
> +       if (unlikely(!__page_frag_cache_prepare(nc, fragsz, pfrag, gfp_mask,
> +                                               align_mask)))
> +               return false;
> +
> +       __page_frag_cache_commit(nc, pfrag, fragsz);
> +       return true;
> +}
> +
> +static inline bool page_frag_refill_align(struct page_frag_cache *nc,
> +                                         unsigned int fragsz,
> +                                         struct page_frag *pfrag,
> +                                         gfp_t gfp_mask, unsigned int align)
> +{
> +       WARN_ON_ONCE(!is_power_of_2(align));
> +       return __page_frag_refill_align(nc, fragsz, pfrag, gfp_mask, -align);
> +}
> +
> +static inline bool page_frag_refill(struct page_frag_cache *nc,
> +                                   unsigned int fragsz,
> +                                   struct page_frag *pfrag, gfp_t gfp_mask)
> +{
> +       return __page_frag_refill_align(nc, fragsz, pfrag, gfp_mask, ~0u);
> +}
> +
> +static inline bool __page_frag_refill_prepare_align(struct page_frag_cache *nc,
> +                                                   unsigned int fragsz,
> +                                                   struct page_frag *pfrag,
> +                                                   gfp_t gfp_mask,
> +                                                   unsigned int align_mask)
> +{
> +       return !!__page_frag_cache_prepare(nc, fragsz, pfrag, gfp_mask,
> +                                          align_mask);
> +}
> +
> +static inline bool page_frag_refill_prepare_align(struct page_frag_cache *nc,
> +                                                 unsigned int fragsz,
> +                                                 struct page_frag *pfrag,
> +                                                 gfp_t gfp_mask,
> +                                                 unsigned int align)
> +{
> +       WARN_ON_ONCE(!is_power_of_2(align));
> +       return __page_frag_refill_prepare_align(nc, fragsz, pfrag, gfp_mask,
> +                                               -align);
> +}
> +
> +static inline bool page_frag_refill_prepare(struct page_frag_cache *nc,
> +                                           unsigned int fragsz,
> +                                           struct page_frag *pfrag,
> +                                           gfp_t gfp_mask)
> +{
> +       return __page_frag_refill_prepare_align(nc, fragsz, pfrag, gfp_mask,
> +                                               ~0u);
> +}
> +
> +static inline void *__page_frag_alloc_refill_prepare_align(struct page_frag_cache *nc,
> +                                                          unsigned int fragsz,
> +                                                          struct page_frag *pfrag,
> +                                                          gfp_t gfp_mask,
> +                                                          unsigned int align_mask)
> +{
> +       return __page_frag_cache_prepare(nc, fragsz, pfrag, gfp_mask, align_mask);
> +}
> +
> +static inline void *page_frag_alloc_refill_prepare_align(struct page_frag_cache *nc,
> +                                                        unsigned int fragsz,
> +                                                        struct page_frag *pfrag,
> +                                                        gfp_t gfp_mask,
> +                                                        unsigned int align)
> +{
> +       WARN_ON_ONCE(!is_power_of_2(align));
> +       return __page_frag_alloc_refill_prepare_align(nc, fragsz, pfrag,
> +                                                     gfp_mask, -align);
> +}
> +
> +static inline void *page_frag_alloc_refill_prepare(struct page_frag_cache *nc,
> +                                                  unsigned int fragsz,
> +                                                  struct page_frag *pfrag,
> +                                                  gfp_t gfp_mask)
> +{
> +       return __page_frag_alloc_refill_prepare_align(nc, fragsz, pfrag,
> +                                                     gfp_mask, ~0u);
> +}
> +
> +static inline void *page_frag_alloc_refill_probe(struct page_frag_cache *nc,
> +                                                unsigned int fragsz,
> +                                                struct page_frag *pfrag)
> +{
> +       return __page_frag_alloc_refill_probe_align(nc, fragsz, pfrag, ~0u);
> +}
> +
> +static inline bool page_frag_refill_probe(struct page_frag_cache *nc,
> +                                         unsigned int fragsz,
> +                                         struct page_frag *pfrag)
> +{
> +       return !!page_frag_alloc_refill_probe(nc, fragsz, pfrag);
> +}
> +
> +static inline void page_frag_commit(struct page_frag_cache *nc,
> +                                   struct page_frag *pfrag,
> +                                   unsigned int used_sz)
> +{
> +       __page_frag_cache_commit(nc, pfrag, used_sz);
> +}
> +
> +static inline void page_frag_commit_noref(struct page_frag_cache *nc,
> +                                         struct page_frag *pfrag,
> +                                         unsigned int used_sz)
> +{
> +       __page_frag_cache_commit_noref(nc, pfrag, used_sz);
> +}
> +

Not a huge fan of introducing a ton of new API calls and then having
to have them all applied at once in the follow-on patches. Ideally the
functions and the header documentation for them would be introduced in
the same patch as well as examples on how it would be used.

I really think we should break these up as some are used in one case,
and others in another and it is a pain to have a pile of abstractions
that are all using these functions in different ways.

> +static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
> +                                        unsigned int fragsz)
> +{
> +       VM_BUG_ON(fragsz > nc->offset);
> +
> +       nc->pagecnt_bias++;
> +       nc->offset -= fragsz;
> +}
> +

We should probably have the same checks here you had on the earlier
commit. We should not be allowing blind changes. If we are using the
commit or abort interfaces we should be verifying a page frag with
them to verify that the request to modify this is legitimate.

>  void page_frag_free(void *addr);
>
>  #endif
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index f55d34cf7d43..5ea4b663ab8e 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -112,6 +112,27 @@ unsigned int __page_frag_cache_commit_noref(struct page_frag_cache *nc,
>  }
>  EXPORT_SYMBOL(__page_frag_cache_commit_noref);
>
> +void *__page_frag_alloc_refill_probe_align(struct page_frag_cache *nc,
> +                                          unsigned int fragsz,
> +                                          struct page_frag *pfrag,
> +                                          unsigned int align_mask)
> +{
> +       unsigned long encoded_page = nc->encoded_page;
> +       unsigned int size, offset;
> +
> +       size = PAGE_SIZE << encoded_page_decode_order(encoded_page);
> +       offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
> +       if (unlikely(!encoded_page || offset + fragsz > size))
> +               return NULL;
> +
> +       pfrag->page = encoded_page_decode_page(encoded_page);
> +       pfrag->size = size - offset;
> +       pfrag->offset = offset;
> +
> +       return encoded_page_decode_virt(encoded_page) + offset;
> +}
> +EXPORT_SYMBOL(__page_frag_alloc_refill_probe_align);
> +

If I am not mistaken this would be the equivalent of allocating a size
0 fragment right? The only difference is that you are copying out the
"remaining" size, but we could get that from the offset if we knew the
size couldn't we? Would it maybe make sense to look at limiting this
to PAGE_SIZE instead of passing the size of the actual fragment?

>  void *__page_frag_cache_prepare(struct page_frag_cache *nc, unsigned int fragsz,
>                                 struct page_frag *pfrag, gfp_t gfp_mask,
>                                 unsigned int align_mask)
> --
> 2.33.0
>
Yunsheng Lin Oct. 19, 2024, 8:33 a.m. UTC | #2
On 10/19/2024 2:03 AM, Alexander Duyck wrote:

> 
> Not a huge fan of introducing a ton of new API calls and then having
> to have them all applied at once in the follow-on patches. Ideally the
> functions and the header documentation for them would be introduced in
> the same patch as well as examples on how it would be used.
> 
> I really think we should break these up as some are used in one case,
> and others in another and it is a pain to have a pile of abstractions
> that are all using these functions in different ways.

I am guessing this patch may be split into three parts to make it more
reviewable and easier to discuss here:
1. Prepare & commit related API, which is still the large one.
2. Probe related API.
3. Abort API.

And it is worth mentioning that even if this patch is split into more
patches, it seems impossible to break patch 12 up, as almost everything
related to changing "page_frag" to "page_frag_cache" needs to be in one
patch to avoid compile errors.

> 
>> +static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
>> +                                        unsigned int fragsz)
>> +{
>> +       VM_BUG_ON(fragsz > nc->offset);
>> +
>> +       nc->pagecnt_bias++;
>> +       nc->offset -= fragsz;
>> +}
>> +
> 
> We should probably have the same checks here you had on the earlier
> commit. We should not be allowing blind changes. If we are using the
> commit or abort interfaces we should be verifying a page frag with
> them to verify that the request to modify this is legitimate.

As shown in the 'Preparation & committing API' section of patch 13, the
abort API is used to abort the operation of the page_frag_alloc_*()
APIs, so a 'page_frag' is not available for doing the kind of checking
the commit API does. For cases that do not need the more complicated
prepare & commit API, like tun_build_skb(), the abort API can be used
to undo the page_frag_alloc_*() operation when bpf_prog_run_xdp()
returns XDP_DROP, knowing that no one else has taken an extra reference
on the just-allocated fragment.

+Allocation & freeing API
+------------------------
+
+.. code-block:: c
+
+    void *va;
+
+    va = page_frag_alloc_align(nc, size, gfp, align);
+    if (!va)
+        goto do_error;
+
+    err = do_something(va, size);
+    if (err) {
+        page_frag_alloc_abort(nc, size);
+        goto do_error;
+    }
+
+    ...
+
+    page_frag_free(va);


If there is a need to abort a commit API operation, we could probably
call it something like page_frag_commit_abort()?

> 
>>   void page_frag_free(void *addr);
>>
>>   #endif
>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>> index f55d34cf7d43..5ea4b663ab8e 100644
>> --- a/mm/page_frag_cache.c
>> +++ b/mm/page_frag_cache.c
>> @@ -112,6 +112,27 @@ unsigned int __page_frag_cache_commit_noref(struct page_frag_cache *nc,
>>   }
>>   EXPORT_SYMBOL(__page_frag_cache_commit_noref);
>>
>> +void *__page_frag_alloc_refill_probe_align(struct page_frag_cache *nc,
>> +                                          unsigned int fragsz,
>> +                                          struct page_frag *pfrag,
>> +                                          unsigned int align_mask)
>> +{
>> +       unsigned long encoded_page = nc->encoded_page;
>> +       unsigned int size, offset;
>> +
>> +       size = PAGE_SIZE << encoded_page_decode_order(encoded_page);
>> +       offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
>> +       if (unlikely(!encoded_page || offset + fragsz > size))
>> +               return NULL;
>> +
>> +       pfrag->page = encoded_page_decode_page(encoded_page);
>> +       pfrag->size = size - offset;
>> +       pfrag->offset = offset;
>> +
>> +       return encoded_page_decode_virt(encoded_page) + offset;
>> +}
>> +EXPORT_SYMBOL(__page_frag_alloc_refill_probe_align);
>> +
> 
> If I am not mistaken this would be the equivalent of allocating a size
> 0 fragment right? The only difference is that you are copying out the
> "remaining" size, but we could get that from the offset if we knew the
> size couldn't we? Would it maybe make sense to look at limiting this
> to PAGE_SIZE instead of passing the size of the actual fragment?

I am not sure I understand what "limiting this to PAGE_SIZE" means
here.

I probably should mention the use case of the probe API here. For the
mptcp_sendmsg() use case, the minimum size of a fragment can be smaller
when the new fragment can be coalesced into the previous fragment, as
extra memory is needed for some header when the fragment cannot be
coalesced into the previous fragment. The probe API is mainly used to
see if there is any memory left in the 'page_frag_cache' that can be
coalesced into the previous fragment.
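
A rough sketch of that coalescing pattern (illustrative only, not the
actual mptcp_sendmsg() code; 'nc', 'skb', 'i', 'msg', 'min_sz' and
'hdr_len' are assumed to exist in the surrounding code):

    struct page_frag pfrag;
    bool coalesce = false;
    size_t copy;
    void *va;

    /* probe for any leftover room in the cache without refilling it */
    va = page_frag_alloc_refill_probe(nc, 1, &pfrag);
    if (va && skb_can_coalesce(skb, i, pfrag.page, pfrag.offset)) {
            /* new data can be appended to the previous fragment */
            coalesce = true;
    } else {
            /* need room for a new fragment plus some header */
            va = page_frag_alloc_refill_prepare(nc, min_sz + hdr_len,
                                                &pfrag, sk->sk_allocation);
            if (!va)
                    goto wait_for_memory;
    }

    copy = min_t(size_t, pfrag.size, msg_data_left(msg));
    /* ... copy 'copy' bytes of payload into va ... */

    if (coalesce)
            /* the previous fragment already holds the page reference */
            page_frag_commit_noref(nc, &pfrag, copy);
    else
            page_frag_commit(nc, &pfrag, copy);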

> 
>>   void *__page_frag_cache_prepare(struct page_frag_cache *nc, unsigned int fragsz,
>>                                  struct page_frag *pfrag, gfp_t gfp_mask,
>>                                  unsigned int align_mask)
>> --
>> 2.33.0
>>
>
Alexander Duyck Oct. 20, 2024, 4:04 p.m. UTC | #3
On Sat, Oct 19, 2024 at 1:33 AM Yunsheng Lin <yunshenglin0825@gmail.com> wrote:
>
> On 10/19/2024 2:03 AM, Alexander Duyck wrote:
>
> >
> > Not a huge fan of introducing a ton of new API calls and then having
> > to have them all applied at once in the follow-on patches. Ideally the
> > functions and the header documentation for them would be introduced in
> > the same patch as well as examples on how it would be used.
> >
> > I really think we should break these up as some are used in one case,
> > and others in another and it is a pain to have a pile of abstractions
> > that are all using these functions in different ways.
>
> I am guessing this patch may be split into three parts to make it more
> reviewable and easier to discuss here:
> 1. Prepare & commit related API, which is still the large one.
> 2. Probe related API.

In my mind the first two listed here are much more related to each
other than this abort api.

> 3. Abort API.

I wonder if we couldn't look at introducing this first as it is
actually closer to the existing API in terms of how you might use it.
The only spot of commonality I can think of in terms of all these is
that we would need to be able to verify the VA, offset, and size. I
partly wonder if for our page frag API we couldn't get away with
passing a virtual address instead of a page for the page frag. It
would save us having to do the virt_to_page or page_to_virt when
trying to verify a commit or a revert.


> And it is worth mentioning that even if this patch is split into more
> patches, it seems impossible to break patch 12 up, as almost everything
> related to changing "page_frag" to "page_frag_cache" needs to be in one
> patch to avoid compile errors.

That is partly true. One issue is that there are more changes there
than just changing out the page APIs. It seems like you went in
performing optimizations as soon as you were changing out the page
allocation method used. For example, one thing that jumps out at me is
the removal of linear_to_page and its replacement with
spd_fill_linear_page, which seems to take on other pieces of the
function, and you also made it a return path of its own when that
section wasn't one previously.

Ideally changing out the APIs used should be more about doing just
that and avoiding additional optimization or deviations from the
original coded path if possible.

> >
> >> +static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
> >> +                                        unsigned int fragsz)
> >> +{
> >> +       VM_BUG_ON(fragsz > nc->offset);
> >> +
> >> +       nc->pagecnt_bias++;
> >> +       nc->offset -= fragsz;
> >> +}
> >> +
> >
> > We should probably have the same checks here you had on the earlier
> > commit. We should not be allowing blind changes. If we are using the
> > commit or abort interfaces we should be verifying a page frag with
> > them to verify that the request to modify this is legitimate.
>
> As shown in the 'Preparation & committing API' section of patch 13, the
> abort API is used to abort the operation of the page_frag_alloc_*()
> APIs, so a 'page_frag' is not available for doing the kind of checking
> the commit API does. For cases that do not need the more complicated
> prepare & commit API, like tun_build_skb(), the abort API can be used
> to undo the page_frag_alloc_*() operation when bpf_prog_run_xdp()
> returns XDP_DROP, knowing that no one else has taken an extra reference
> on the just-allocated fragment.
>
> +Allocation & freeing API
> +------------------------
> +
> +.. code-block:: c
> +
> +    void *va;
> +
> +    va = page_frag_alloc_align(nc, size, gfp, align);
> +    if (!va)
> +        goto do_error;
> +
> +    err = do_something(va, size);
> +    if (err) {
> +        page_frag_alloc_abort(nc, size);
> +        goto do_error;
> +    }
> +
> +    ...
> +
> +    page_frag_free(va);
>
>
> If there is a need to abort a commit API operation, we could probably
> call it something like page_frag_commit_abort()?

I would argue that using an abort API in such a case is likely not
valid then. What we most likely need to be doing is passing the va as
a part of the abort request. With that we should be able to work our
way backwards to get back to verifying the fragment came from the
correct page before we allow stuffing it back on the page.

> >
> >>   void page_frag_free(void *addr);
> >>
> >>   #endif
> >> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> >> index f55d34cf7d43..5ea4b663ab8e 100644
> >> --- a/mm/page_frag_cache.c
> >> +++ b/mm/page_frag_cache.c
> >> @@ -112,6 +112,27 @@ unsigned int __page_frag_cache_commit_noref(struct page_frag_cache *nc,
> >>   }
> >>   EXPORT_SYMBOL(__page_frag_cache_commit_noref);
> >>
> >> +void *__page_frag_alloc_refill_probe_align(struct page_frag_cache *nc,
> >> +                                          unsigned int fragsz,
> >> +                                          struct page_frag *pfrag,
> >> +                                          unsigned int align_mask)
> >> +{
> >> +       unsigned long encoded_page = nc->encoded_page;
> >> +       unsigned int size, offset;
> >> +
> >> +       size = PAGE_SIZE << encoded_page_decode_order(encoded_page);
> >> +       offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
> >> +       if (unlikely(!encoded_page || offset + fragsz > size))
> >> +               return NULL;
> >> +
> >> +       pfrag->page = encoded_page_decode_page(encoded_page);
> >> +       pfrag->size = size - offset;
> >> +       pfrag->offset = offset;
> >> +
> >> +       return encoded_page_decode_virt(encoded_page) + offset;
> >> +}
> >> +EXPORT_SYMBOL(__page_frag_alloc_refill_probe_align);
> >> +
> >
> > If I am not mistaken this would be the equivalent of allocating a size
> > 0 fragment right? The only difference is that you are copying out the
> > "remaining" size, but we could get that from the offset if we knew the
> > size couldn't we? Would it maybe make sense to look at limiting this
> > to PAGE_SIZE instead of passing the size of the actual fragment?
>
> I am not sure I understand what "limiting this to PAGE_SIZE" means
> here.

Right now you are returning pfrag->size = size - offset. I am
wondering if we should be returning something more like "pfrag->size =
PAGE_SIZE - (offset % PAGE_SIZE)".
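
For concreteness, a small sketch of the difference with hypothetical
numbers, assuming a 4K PAGE_SIZE and an order-3 cache page:

    unsigned int size   = PAGE_SIZE << 3;                    /* 32768 */
    unsigned int offset = 5000;                              /* arbitrary example */
    unsigned int whole  = size - offset;                     /* 27768: rest of the compound page */
    unsigned int capped = PAGE_SIZE - (offset % PAGE_SIZE);  /*  3192: rest of the current 4K block */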

> I probably should mention the use case of the probe API here. For the
> mptcp_sendmsg() use case, the minimum size of a fragment can be smaller
> when the new fragment can be coalesced into the previous fragment, as
> extra memory is needed for some header when the fragment cannot be
> coalesced into the previous fragment. The probe API is mainly used to
> see if there is any memory left in the 'page_frag_cache' that can be
> coalesced into the previous fragment.

What is the fragment size we are talking about? In my example above we
would basically be looking at rounding the page off to the nearest
PAGE_SIZE block before we would have to repeat the call to grab the
next PAGE_SIZE block. Since the request size for the page frag alloc
API is supposed to be limited to 4K or less it would make sense to
limit the probe API similarly.
Yunsheng Lin Oct. 21, 2024, 9:36 a.m. UTC | #4
On 2024/10/21 0:04, Alexander Duyck wrote:
> On Sat, Oct 19, 2024 at 1:33 AM Yunsheng Lin <yunshenglin0825@gmail.com> wrote:
>>
>> On 10/19/2024 2:03 AM, Alexander Duyck wrote:
>>
>>>
>>> Not a huge fan of introducing a ton of new API calls and then having
>>> to have them all applied at once in the follow-on patches. Ideally the
>>> functions and the header documentation for them would be introduced in
>>> the same patch as well as examples on how it would be used.
>>>
>>> I really think we should break these up as some are used in one case,
>>> and others in another and it is a pain to have a pile of abstractions
>>> that are all using these functions in different ways.
>>
>> I am guessing this patch may be split into three parts to make it more
>> reviewable and easier to discuss here:
>> 1. Prepare & commit related API, which is still the large one.
>> 2. Probe related API.
> 
> In my mind the first two listed here are much more related to each
> other than this abort api.
> 
>> 3. Abort API.
> 
> I wonder if we couldn't look at introducing this first as it is
> actually closer to the existing API in terms of how you might use it.
> The only spot of commonality I can think of in terms of all these is
> that we would need to be able to verify the VA, offset, and size. I
> partly wonder if for our page frag API we couldn't get away with
> passing a virtual address instead of a page for the page frag. It
> would save us having to do the virt_to_page or page_to_virt when
> trying to verify a commit or a revert.

Perhaps break this patch into more patches, in the order below?
mm: page_frag: introduce page_frag_alloc_abort() API
mm: page_frag: introduce refill prepare & commit API
mm: page_frag: introduce alloc_refill prepare & commit API
mm: page_frag: introduce probe related API

> 
> 
>> And it is worth mentioning that even if this patch is split into more
>> patches, it seems impossible to break patch 12 up, as almost everything
>> related to changing "page_frag" to "page_frag_cache" needs to be in one
>> patch to avoid compile errors.
> 
> That is partly true. One issue is that there are more changes there
> than just changing out the page APIs. It seems like you went in
> performing optimizations as soon as you were changing out the page
> allocation method used. For example, one thing that jumps out at me is
> the removal of linear_to_page and its replacement with
> spd_fill_linear_page, which seems to take on other pieces of the
> function, and you also made it a return path of its own when that
> section wasn't one previously.

The reason for the new spd_fill_linear_page() is that the reference
counting in spd_fill_page() is no longer reusable for the new API,
which uses page_frag_commit() and page_frag_commit_noref() instead of
the get_page() in spd_fill_page().

> 
> Ideally changing out the APIs used should be more about doing just
> that and avoiding additional optimization or deviations from the
> original coded path if possible.

Yes, we can always do better, I am just not sure it is worth it.

> 
>>>
>>>> +static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
>>>> +                                        unsigned int fragsz)
>>>> +{
>>>> +       VM_BUG_ON(fragsz > nc->offset);
>>>> +
>>>> +       nc->pagecnt_bias++;
>>>> +       nc->offset -= fragsz;
>>>> +}
>>>> +
>>>
>>> We should probably have the same checks here you had on the earlier
>>> commit. We should not be allowing blind changes. If we are using the
>>> commit or abort interfaces we should be verifying a page frag with
>>> them to verify that the request to modify this is legitimate.
>>
>> As shown in the 'Preparation & committing API' section of patch 13, the
>> abort API is used to abort the operation of the page_frag_alloc_*()
>> APIs, so a 'page_frag' is not available for doing the kind of checking
>> the commit API does. For cases that do not need the more complicated
>> prepare & commit API, like tun_build_skb(), the abort API can be used
>> to undo the page_frag_alloc_*() operation when bpf_prog_run_xdp()
>> returns XDP_DROP, knowing that no one else has taken an extra reference
>> on the just-allocated fragment.
>>
>> +Allocation & freeing API
>> +------------------------
>> +
>> +.. code-block:: c
>> +
>> +    void *va;
>> +
>> +    va = page_frag_alloc_align(nc, size, gfp, align);
>> +    if (!va)
>> +        goto do_error;
>> +
>> +    err = do_something(va, size);
>> +    if (err) {
>> +        page_frag_alloc_abort(nc, size);
>> +        goto do_error;
>> +    }
>> +
>> +    ...
>> +
>> +    page_frag_free(va);
>>
>>
>> If there is a need to abort a commit API operation, we could probably
>> call it something like page_frag_commit_abort()?
> 
> I would argue that using an abort API in such a case is likely not
> valid then. What we most likely need to be doing is passing the va as
> a part of the abort request. With that we should be able to work our
> way backwards to get back to verifying the fragment came from the
> correct page before we allow stuffing it back on the page.

How about something like the below, as mentioned in the previous
comment:
page_frag_alloc_abort(nc, va, size);
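
A rough sketch of what such a VA-checking abort might look like,
reusing the encoded_page helpers already in page_frag_cache.c (the
exact checks are an assumption and still open for discussion):

    static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
                                             void *va, unsigned int fragsz)
    {
            unsigned long encoded_page = nc->encoded_page;

            /* the aborted fragment must be the last one handed out */
            VM_BUG_ON(fragsz > nc->offset);
            VM_BUG_ON(va != encoded_page_decode_virt(encoded_page) +
                            nc->offset - fragsz);

            nc->pagecnt_bias++;
            nc->offset -= fragsz;
    }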

> 
>>>
>>>>   void page_frag_free(void *addr);
>>>>
>>>>   #endif
>>>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>>>> index f55d34cf7d43..5ea4b663ab8e 100644
>>>> --- a/mm/page_frag_cache.c
>>>> +++ b/mm/page_frag_cache.c
>>>> @@ -112,6 +112,27 @@ unsigned int __page_frag_cache_commit_noref(struct page_frag_cache *nc,
>>>>   }
>>>>   EXPORT_SYMBOL(__page_frag_cache_commit_noref);
>>>>
>>>> +void *__page_frag_alloc_refill_probe_align(struct page_frag_cache *nc,
>>>> +                                          unsigned int fragsz,
>>>> +                                          struct page_frag *pfrag,
>>>> +                                          unsigned int align_mask)
>>>> +{
>>>> +       unsigned long encoded_page = nc->encoded_page;
>>>> +       unsigned int size, offset;
>>>> +
>>>> +       size = PAGE_SIZE << encoded_page_decode_order(encoded_page);
>>>> +       offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
>>>> +       if (unlikely(!encoded_page || offset + fragsz > size))
>>>> +               return NULL;
>>>> +
>>>> +       pfrag->page = encoded_page_decode_page(encoded_page);
>>>> +       pfrag->size = size - offset;
>>>> +       pfrag->offset = offset;
>>>> +
>>>> +       return encoded_page_decode_virt(encoded_page) + offset;
>>>> +}
>>>> +EXPORT_SYMBOL(__page_frag_alloc_refill_probe_align);
>>>> +
>>>
>>> If I am not mistaken this would be the equivalent of allocating a size
>>> 0 fragment right? The only difference is that you are copying out the
>>> "remaining" size, but we could get that from the offset if we knew the
>>> size couldn't we? Would it maybe make sense to look at limiting this
>>> to PAGE_SIZE instead of passing the size of the actual fragment?
>>
>> I am not sure I understand what "limiting this to PAGE_SIZE" means
>> here.
> 
> Right now you are returning pfrag->size = size - offset. I am
> wondering if we should be returning something more like "pfrag->size =
> PAGE_SIZE - (offset % PAGE_SIZE)".

Doesn't doing the above defeat the purpose of the 'performant' part
mentioned in the commit log? With the above, I would say the new
page_frag API would not provide the expected semantics of
skb_page_frag_refill(), where the caller can use up the whole order-3
page by accessing pfrag->size directly.

"There are many use cases that need minimum memory in order
for forward progress, but more performant if more memory is
available"

> 
>> I probably should mention the use case of the probe API here. For the
>> mptcp_sendmsg() use case, the minimum size of a fragment can be smaller
>> when the new fragment can be coalesced into the previous fragment, as
>> extra memory is needed for some header when the fragment cannot be
>> coalesced into the previous fragment. The probe API is mainly used to
>> see if there is any memory left in the 'page_frag_cache' that can be
>> coalesced into the previous fragment.
> 
> What is the fragment size we are talking about? In my example above we

I am talking about the minimum fragment size required by the caller.
If there is more space, the caller can decide how much of it to use via
the 'used_sz' parameter of the commit API.

We only need to prevent the caller from passing a fragsz larger than
PAGE_SIZE when calling a prepare API; there is no such limit for the
commit API.
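
A compact sketch of that asymmetry (illustrative only; 'min_sz', 'gfp'
and do_copy() are placeholders invented for this example):

    /* the prepared minimum is bounded: min_sz <= PAGE_SIZE */
    va = page_frag_alloc_refill_prepare(nc, min_sz, &pfrag, gfp);
    if (!va)
            return -ENOMEM;

    /* the cache may have prepared far more than min_sz ... */
    used_sz = do_copy(va, pfrag.size);

    /* ... and the commit may report any amount up to pfrag.size,
     * even if that exceeds PAGE_SIZE.
     */
    page_frag_commit(nc, &pfrag, used_sz);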

> would basically be looking at rounding the page off to the nearest
> PAGE_SIZE block before we would have to repeat the call to grab the
> next PAGE_SIZE block. Since the request size for the page frag alloc
> API is supposed to be limited to 4K or less it would make sense to
> limit the probe API similarly.
That is only partly true for the prepare/commit API: it holds for the
prepare API, but not for the commit API.

Patch

diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index feed99d0cddb..1c0c11250b66 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -46,6 +46,10 @@  void *__page_frag_cache_prepare(struct page_frag_cache *nc, unsigned int fragsz,
 unsigned int __page_frag_cache_commit_noref(struct page_frag_cache *nc,
 					    struct page_frag *pfrag,
 					    unsigned int used_sz);
+void *__page_frag_alloc_refill_probe_align(struct page_frag_cache *nc,
+					   unsigned int fragsz,
+					   struct page_frag *pfrag,
+					   unsigned int align_mask);
 
 static inline unsigned int __page_frag_cache_commit(struct page_frag_cache *nc,
 						    struct page_frag *pfrag,
@@ -88,6 +92,132 @@  static inline void *page_frag_alloc(struct page_frag_cache *nc,
 	return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
 }
 
+static inline bool __page_frag_refill_align(struct page_frag_cache *nc,
+					    unsigned int fragsz,
+					    struct page_frag *pfrag,
+					    gfp_t gfp_mask,
+					    unsigned int align_mask)
+{
+	if (unlikely(!__page_frag_cache_prepare(nc, fragsz, pfrag, gfp_mask,
+						align_mask)))
+		return false;
+
+	__page_frag_cache_commit(nc, pfrag, fragsz);
+	return true;
+}
+
+static inline bool page_frag_refill_align(struct page_frag_cache *nc,
+					  unsigned int fragsz,
+					  struct page_frag *pfrag,
+					  gfp_t gfp_mask, unsigned int align)
+{
+	WARN_ON_ONCE(!is_power_of_2(align));
+	return __page_frag_refill_align(nc, fragsz, pfrag, gfp_mask, -align);
+}
+
+static inline bool page_frag_refill(struct page_frag_cache *nc,
+				    unsigned int fragsz,
+				    struct page_frag *pfrag, gfp_t gfp_mask)
+{
+	return __page_frag_refill_align(nc, fragsz, pfrag, gfp_mask, ~0u);
+}
+
+static inline bool __page_frag_refill_prepare_align(struct page_frag_cache *nc,
+						    unsigned int fragsz,
+						    struct page_frag *pfrag,
+						    gfp_t gfp_mask,
+						    unsigned int align_mask)
+{
+	return !!__page_frag_cache_prepare(nc, fragsz, pfrag, gfp_mask,
+					   align_mask);
+}
+
+static inline bool page_frag_refill_prepare_align(struct page_frag_cache *nc,
+						  unsigned int fragsz,
+						  struct page_frag *pfrag,
+						  gfp_t gfp_mask,
+						  unsigned int align)
+{
+	WARN_ON_ONCE(!is_power_of_2(align));
+	return __page_frag_refill_prepare_align(nc, fragsz, pfrag, gfp_mask,
+						-align);
+}
+
+static inline bool page_frag_refill_prepare(struct page_frag_cache *nc,
+					    unsigned int fragsz,
+					    struct page_frag *pfrag,
+					    gfp_t gfp_mask)
+{
+	return __page_frag_refill_prepare_align(nc, fragsz, pfrag, gfp_mask,
+						~0u);
+}
+
+static inline void *__page_frag_alloc_refill_prepare_align(struct page_frag_cache *nc,
+							   unsigned int fragsz,
+							   struct page_frag *pfrag,
+							   gfp_t gfp_mask,
+							   unsigned int align_mask)
+{
+	return __page_frag_cache_prepare(nc, fragsz, pfrag, gfp_mask, align_mask);
+}
+
+static inline void *page_frag_alloc_refill_prepare_align(struct page_frag_cache *nc,
+							 unsigned int fragsz,
+							 struct page_frag *pfrag,
+							 gfp_t gfp_mask,
+							 unsigned int align)
+{
+	WARN_ON_ONCE(!is_power_of_2(align));
+	return __page_frag_alloc_refill_prepare_align(nc, fragsz, pfrag,
+						      gfp_mask, -align);
+}
+
+static inline void *page_frag_alloc_refill_prepare(struct page_frag_cache *nc,
+						   unsigned int fragsz,
+						   struct page_frag *pfrag,
+						   gfp_t gfp_mask)
+{
+	return __page_frag_alloc_refill_prepare_align(nc, fragsz, pfrag,
+						      gfp_mask, ~0u);
+}
+
+static inline void *page_frag_alloc_refill_probe(struct page_frag_cache *nc,
+						 unsigned int fragsz,
+						 struct page_frag *pfrag)
+{
+	return __page_frag_alloc_refill_probe_align(nc, fragsz, pfrag, ~0u);
+}
+
+static inline bool page_frag_refill_probe(struct page_frag_cache *nc,
+					  unsigned int fragsz,
+					  struct page_frag *pfrag)
+{
+	return !!page_frag_alloc_refill_probe(nc, fragsz, pfrag);
+}
+
+static inline void page_frag_commit(struct page_frag_cache *nc,
+				    struct page_frag *pfrag,
+				    unsigned int used_sz)
+{
+	__page_frag_cache_commit(nc, pfrag, used_sz);
+}
+
+static inline void page_frag_commit_noref(struct page_frag_cache *nc,
+					  struct page_frag *pfrag,
+					  unsigned int used_sz)
+{
+	__page_frag_cache_commit_noref(nc, pfrag, used_sz);
+}
+
+static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
+					 unsigned int fragsz)
+{
+	VM_BUG_ON(fragsz > nc->offset);
+
+	nc->pagecnt_bias++;
+	nc->offset -= fragsz;
+}
+
 void page_frag_free(void *addr);
 
 #endif
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index f55d34cf7d43..5ea4b663ab8e 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -112,6 +112,27 @@  unsigned int __page_frag_cache_commit_noref(struct page_frag_cache *nc,
 }
 EXPORT_SYMBOL(__page_frag_cache_commit_noref);
 
+void *__page_frag_alloc_refill_probe_align(struct page_frag_cache *nc,
+					   unsigned int fragsz,
+					   struct page_frag *pfrag,
+					   unsigned int align_mask)
+{
+	unsigned long encoded_page = nc->encoded_page;
+	unsigned int size, offset;
+
+	size = PAGE_SIZE << encoded_page_decode_order(encoded_page);
+	offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
+	if (unlikely(!encoded_page || offset + fragsz > size))
+		return NULL;
+
+	pfrag->page = encoded_page_decode_page(encoded_page);
+	pfrag->size = size - offset;
+	pfrag->offset = offset;
+
+	return encoded_page_decode_virt(encoded_page) + offset;
+}
+EXPORT_SYMBOL(__page_frag_alloc_refill_probe_align);
+
 void *__page_frag_cache_prepare(struct page_frag_cache *nc, unsigned int fragsz,
 				struct page_frag *pfrag, gfp_t gfp_mask,
 				unsigned int align_mask)