diff mbox series

[net-next,v22,07/14] mm: page_frag: some minor refactoring before adding new API

Message ID 20241018105351.1960345-8-linyunsheng@huawei.com (mailing list archive)
State New
Headers show
Series [net-next,v22,01/14] mm: page_frag: add a test module for page_frag | expand

Commit Message

Yunsheng Lin Oct. 18, 2024, 10:53 a.m. UTC
Refactor common codes from __page_frag_alloc_va_align() to
__page_frag_cache_prepare() and __page_frag_cache_commit(),
so that the new API can make use of them.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/page_frag_cache.h | 36 +++++++++++++++++++++++++++--
 mm/page_frag_cache.c            | 40 ++++++++++++++++++++++++++-------
 2 files changed, 66 insertions(+), 10 deletions(-)

Comments

Alexander Duyck Oct. 18, 2024, 5:26 p.m. UTC | #1
On Fri, Oct 18, 2024 at 4:00 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> Refactor common codes from __page_frag_alloc_va_align() to
> __page_frag_cache_prepare() and __page_frag_cache_commit(),
> so that the new API can make use of them.
>
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/linux/page_frag_cache.h | 36 +++++++++++++++++++++++++++--
>  mm/page_frag_cache.c            | 40 ++++++++++++++++++++++++++-------
>  2 files changed, 66 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> index 41a91df82631..feed99d0cddb 100644
> --- a/include/linux/page_frag_cache.h
> +++ b/include/linux/page_frag_cache.h
> @@ -5,6 +5,7 @@
>
>  #include <linux/bits.h>
>  #include <linux/log2.h>
> +#include <linux/mmdebug.h>
>  #include <linux/mm_types_task.h>
>  #include <linux/types.h>
>
> @@ -39,8 +40,39 @@ static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
>
>  void page_frag_cache_drain(struct page_frag_cache *nc);
>  void __page_frag_cache_drain(struct page *page, unsigned int count);
> -void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
> -                             gfp_t gfp_mask, unsigned int align_mask);
> +void *__page_frag_cache_prepare(struct page_frag_cache *nc, unsigned int fragsz,
> +                               struct page_frag *pfrag, gfp_t gfp_mask,
> +                               unsigned int align_mask);
> +unsigned int __page_frag_cache_commit_noref(struct page_frag_cache *nc,
> +                                           struct page_frag *pfrag,
> +                                           unsigned int used_sz);
> +
> +static inline unsigned int __page_frag_cache_commit(struct page_frag_cache *nc,
> +                                                   struct page_frag *pfrag,
> +                                                   unsigned int used_sz)
> +{
> +       VM_BUG_ON(!nc->pagecnt_bias);
> +       nc->pagecnt_bias--;
> +
> +       return __page_frag_cache_commit_noref(nc, pfrag, used_sz);
> +}
> +
> +static inline void *__page_frag_alloc_align(struct page_frag_cache *nc,
> +                                           unsigned int fragsz, gfp_t gfp_mask,
> +                                           unsigned int align_mask)
> +{
> +       struct page_frag page_frag;
> +       void *va;
> +
> +       va = __page_frag_cache_prepare(nc, fragsz, &page_frag, gfp_mask,
> +                                      align_mask);
> +       if (unlikely(!va))
> +               return NULL;
> +
> +       __page_frag_cache_commit(nc, &page_frag, fragsz);

Minor nit here. Rather than if (!va) return I think it might be better
to just go with if (likely(va)) __page_frag_cache_commit.

> +
> +       return va;
> +}
>
>  static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
>                                           unsigned int fragsz, gfp_t gfp_mask,
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index a36fd09bf275..a852523bc8ca 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -90,9 +90,31 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
>  }
>  EXPORT_SYMBOL(__page_frag_cache_drain);
>
> -void *__page_frag_alloc_align(struct page_frag_cache *nc,
> -                             unsigned int fragsz, gfp_t gfp_mask,
> -                             unsigned int align_mask)
> +unsigned int __page_frag_cache_commit_noref(struct page_frag_cache *nc,
> +                                           struct page_frag *pfrag,
> +                                           unsigned int used_sz)
> +{
> +       unsigned int orig_offset;
> +
> +       VM_BUG_ON(used_sz > pfrag->size);
> +       VM_BUG_ON(pfrag->page != encoded_page_decode_page(nc->encoded_page));
> +       VM_BUG_ON(pfrag->offset + pfrag->size >
> +                 (PAGE_SIZE << encoded_page_decode_order(nc->encoded_page)));
> +
> +       /* pfrag->offset might be bigger than the nc->offset due to alignment */
> +       VM_BUG_ON(nc->offset > pfrag->offset);
> +
> +       orig_offset = nc->offset;
> +       nc->offset = pfrag->offset + used_sz;
> +
> +       /* Return true size back to caller considering the offset alignment */
> +       return nc->offset - orig_offset;
> +}
> +EXPORT_SYMBOL(__page_frag_cache_commit_noref);
> +

I have a question. How often is it that we are committing versus just
dropping the fragment? It seems like this approach is designed around
optimizing for not committing the page as we are having to take an
extra function call to commit the change every time. Would it make
more sense to have an abort versus a commit?

> +void *__page_frag_cache_prepare(struct page_frag_cache *nc, unsigned int fragsz,
> +                               struct page_frag *pfrag, gfp_t gfp_mask,
> +                               unsigned int align_mask)
>  {
>         unsigned long encoded_page = nc->encoded_page;
>         unsigned int size, offset;
> @@ -114,6 +136,8 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>                 /* reset page count bias and offset to start of new frag */
>                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>                 nc->offset = 0;
> +       } else {
> +               page = encoded_page_decode_page(encoded_page);
>         }
>
>         size = PAGE_SIZE << encoded_page_decode_order(encoded_page);

This makes no sense to me. Seems like there are scenarios where you
are grabbing the page even if you aren't going to use it? Why?

I think you would be better off just waiting to the end and then
fetching it instead of trying to grab it and potentially throw it away
if there is no space left in the page. Otherwise what you might do is
something along the lines of:
pfrag->page = page ? : encoded_page_decode_page(encoded_page);


> @@ -132,8 +156,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>                         return NULL;
>                 }
>
> -               page = encoded_page_decode_page(encoded_page);
> -
>                 if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>                         goto refill;
>
> @@ -148,15 +170,17 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>
>                 /* reset page count bias and offset to start of new frag */
>                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> +               nc->offset = 0;
>                 offset = 0;
>         }
>
> -       nc->pagecnt_bias--;
> -       nc->offset = offset + fragsz;
> +       pfrag->page = page;
> +       pfrag->offset = offset;
> +       pfrag->size = size - offset;

I really think we should still be moving the nc->offset forward at
least with each allocation. It seems like you end up doing two flavors
of commit, one with and one without the decrement of the bias. So I
would be okay with that being pulled out into some separate logic to
avoid the extra increment in the case of merging the pages. However in
both cases you need to move the offset, so I would recommend keeping
that bit there as it would allow us to essentially call this multiple
times without having to do a commit in between to keep the offset
correct. With that your commit logic only has to verify nothing
changes out from underneath us and then update the pagecnt_bias if
needed.

>
>         return encoded_page_decode_virt(encoded_page) + offset;
>  }
> -EXPORT_SYMBOL(__page_frag_alloc_align);
> +EXPORT_SYMBOL(__page_frag_cache_prepare);
>
>  /*
>   * Frees a page fragment allocated out of either a compound or order 0 page.
> --
> 2.33.0
>
Yunsheng Lin Oct. 19, 2024, 8:29 a.m. UTC | #2
On 10/19/2024 1:26 AM, Alexander Duyck wrote:

...

>> +static inline void *__page_frag_alloc_align(struct page_frag_cache *nc,
>> +                                           unsigned int fragsz, gfp_t gfp_mask,
>> +                                           unsigned int align_mask)
>> +{
>> +       struct page_frag page_frag;
>> +       void *va;
>> +
>> +       va = __page_frag_cache_prepare(nc, fragsz, &page_frag, gfp_mask,
>> +                                      align_mask);
>> +       if (unlikely(!va))
>> +               return NULL;
>> +
>> +       __page_frag_cache_commit(nc, &page_frag, fragsz);
> 
> Minor nit here. Rather than if (!va) return I think it might be better
> to just go with if (likely(va)) __page_frag_cache_commit.

Ack.

> 
>> +
>> +       return va;
>> +}
>>
>>   static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
>>                                            unsigned int fragsz, gfp_t gfp_mask,
>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>> index a36fd09bf275..a852523bc8ca 100644
>> --- a/mm/page_frag_cache.c
>> +++ b/mm/page_frag_cache.c
>> @@ -90,9 +90,31 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
>>   }
>>   EXPORT_SYMBOL(__page_frag_cache_drain);
>>
>> -void *__page_frag_alloc_align(struct page_frag_cache *nc,
>> -                             unsigned int fragsz, gfp_t gfp_mask,
>> -                             unsigned int align_mask)
>> +unsigned int __page_frag_cache_commit_noref(struct page_frag_cache *nc,
>> +                                           struct page_frag *pfrag,
>> +                                           unsigned int used_sz)
>> +{
>> +       unsigned int orig_offset;
>> +
>> +       VM_BUG_ON(used_sz > pfrag->size);
>> +       VM_BUG_ON(pfrag->page != encoded_page_decode_page(nc->encoded_page));
>> +       VM_BUG_ON(pfrag->offset + pfrag->size >
>> +                 (PAGE_SIZE << encoded_page_decode_order(nc->encoded_page)));
>> +
>> +       /* pfrag->offset might be bigger than the nc->offset due to alignment */
>> +       VM_BUG_ON(nc->offset > pfrag->offset);
>> +
>> +       orig_offset = nc->offset;
>> +       nc->offset = pfrag->offset + used_sz;
>> +
>> +       /* Return true size back to caller considering the offset alignment */
>> +       return nc->offset - orig_offset;
>> +}
>> +EXPORT_SYMBOL(__page_frag_cache_commit_noref);
>> +
> 
> I have a question. How often is it that we are committing versus just
> dropping the fragment? It seems like this approach is designed around
> optimizing for not committing the page as we are having to take an
> extra function call to commit the change every time. Would it make
> more sense to have an abort versus a commit?

Before this patch, page_frag_alloc() related API seems to be mostly used
for skb data or frag for rx part, see napi_alloc_skb() or some drivers
like e1000, but with more drivers using the page_pool for skb rx frag,
it seems skb data for tx is the main usecase.

And the prepare and commit API added in the patchset seems to be mainly
used for skb frag for tx part except af_packet.

It seems it is not very clear which is the mostly used one; most likely
the prepare and commit API might be the mostly used one if I have to
guess as there might be more memory needed for skb frag than skb data.

> 
>> +void *__page_frag_cache_prepare(struct page_frag_cache *nc, unsigned int fragsz,
>> +                               struct page_frag *pfrag, gfp_t gfp_mask,
>> +                               unsigned int align_mask)
>>   {
>>          unsigned long encoded_page = nc->encoded_page;
>>          unsigned int size, offset;
>> @@ -114,6 +136,8 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>                  /* reset page count bias and offset to start of new frag */
>>                  nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>                  nc->offset = 0;
>> +       } else {
>> +               page = encoded_page_decode_page(encoded_page);
>>          }
>>
>>          size = PAGE_SIZE << encoded_page_decode_order(encoded_page);
> 
> This makes no sense to me. Seems like there are scenarios where you
> are grabbing the page even if you aren't going to use it? Why?
> 
> I think you would be better off just waiting to the end and then
> fetching it instead of trying to grab it and potentially throw it away
> if there is no space left in the page. Otherwise what you might do is
> something along the lines of:
> pfrag->page = page ? : encoded_page_decode_page(encoded_page);

But doesn't that mean an additional checking is needed to decide if we
need to grab the page?

But the './scripts/bloat-o-meter' does show some binary size shrink
using the above.

> 
> 
>> @@ -132,8 +156,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>                          return NULL;
>>                  }
>>
>> -               page = encoded_page_decode_page(encoded_page);
>> -
>>                  if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>>                          goto refill;
>>
>> @@ -148,15 +170,17 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>
>>                  /* reset page count bias and offset to start of new frag */
>>                  nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>> +               nc->offset = 0;
>>                  offset = 0;
>>          }
>>
>> -       nc->pagecnt_bias--;
>> -       nc->offset = offset + fragsz;
>> +       pfrag->page = page;
>> +       pfrag->offset = offset;
>> +       pfrag->size = size - offset;
> 
> I really think we should still be moving the nc->offset forward at
> least with each allocation. It seems like you end up doing two flavors
> of commit, one with and one without the decrement of the bias. So I
> would be okay with that being pulled out into some separate logic to
> avoid the extra increment in the case of merging the pages. However in
> both cases you need to move the offset, so I would recommend keeping
> that bit there as it would allow us to essentially call this multiple
> times without having to do a commit in between to keep the offset
> correct. With that your commit logic only has to verify nothing
> changes out from underneath us and then update the pagecnt_bias if
> needed.

The problem is that we don't really know how much the nc->offset
need to be moved forward to and the caller needs the original offset
for skb_fill_page_desc() related calling when prepare API is used as
an example in 'Preparation & committing API' section of patch 13:

+Preparation & committing API
+----------------------------
+
+.. code-block:: c
+
+    struct page_frag page_frag, *pfrag;
+    bool merge = true;
+    void *va;
+
+    pfrag = &page_frag;
+    va = page_frag_alloc_refill_prepare(nc, 32U, pfrag, GFP_KERNEL);
+    if (!va)
+        goto wait_for_space;
+
+    copy = min_t(unsigned int, copy, pfrag->size);
+    if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) {
+        if (i >= max_skb_frags)
+            goto new_segment;
+
+        merge = false;
+    }
+
+    copy = mem_schedule(copy);
+    if (!copy)
+        goto wait_for_space;
+
+    err = copy_from_iter_full_nocache(va, copy, iter);
+    if (err)
+        goto do_error;
+
+    if (merge) {
+        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+        page_frag_commit_noref(nc, pfrag, copy);
+    } else {
+        skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, copy);
+        page_frag_commit(nc, pfrag, copy);
+    }
Alexander Duyck Oct. 20, 2024, 3:45 p.m. UTC | #3
On Sat, Oct 19, 2024 at 1:30 AM Yunsheng Lin <yunshenglin0825@gmail.com> wrote:
>
> On 10/19/2024 1:26 AM, Alexander Duyck wrote:
>
> ...
>
> >> +static inline void *__page_frag_alloc_align(struct page_frag_cache *nc,
> >> +                                           unsigned int fragsz, gfp_t gfp_mask,
> >> +                                           unsigned int align_mask)
> >> +{
> >> +       struct page_frag page_frag;
> >> +       void *va;
> >> +
> >> +       va = __page_frag_cache_prepare(nc, fragsz, &page_frag, gfp_mask,
> >> +                                      align_mask);
> >> +       if (unlikely(!va))
> >> +               return NULL;
> >> +
> >> +       __page_frag_cache_commit(nc, &page_frag, fragsz);
> >
> > Minor nit here. Rather than if (!va) return I think it might be better
> > to just go with if (likely(va)) __page_frag_cache_commit.
>
> Ack.
>
> >
> >> +
> >> +       return va;
> >> +}
> >>
> >>   static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
> >>                                            unsigned int fragsz, gfp_t gfp_mask,
> >> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> >> index a36fd09bf275..a852523bc8ca 100644
> >> --- a/mm/page_frag_cache.c
> >> +++ b/mm/page_frag_cache.c
> >> @@ -90,9 +90,31 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
> >>   }
> >>   EXPORT_SYMBOL(__page_frag_cache_drain);
> >>
> >> -void *__page_frag_alloc_align(struct page_frag_cache *nc,
> >> -                             unsigned int fragsz, gfp_t gfp_mask,
> >> -                             unsigned int align_mask)
> >> +unsigned int __page_frag_cache_commit_noref(struct page_frag_cache *nc,
> >> +                                           struct page_frag *pfrag,
> >> +                                           unsigned int used_sz)
> >> +{
> >> +       unsigned int orig_offset;
> >> +
> >> +       VM_BUG_ON(used_sz > pfrag->size);
> >> +       VM_BUG_ON(pfrag->page != encoded_page_decode_page(nc->encoded_page));
> >> +       VM_BUG_ON(pfrag->offset + pfrag->size >
> >> +                 (PAGE_SIZE << encoded_page_decode_order(nc->encoded_page)));
> >> +
> >> +       /* pfrag->offset might be bigger than the nc->offset due to alignment */
> >> +       VM_BUG_ON(nc->offset > pfrag->offset);
> >> +
> >> +       orig_offset = nc->offset;
> >> +       nc->offset = pfrag->offset + used_sz;
> >> +
> >> +       /* Return true size back to caller considering the offset alignment */
> >> +       return nc->offset - orig_offset;
> >> +}
> >> +EXPORT_SYMBOL(__page_frag_cache_commit_noref);
> >> +
> >
> > I have a question. How often is it that we are committing versus just
> > dropping the fragment? It seems like this approach is designed around
> > optimizing for not committing the page as we are having to take an
> > extra function call to commit the change every time. Would it make
> > more sense to have an abort versus a commit?
>
> Before this patch, page_frag_alloc() related API seems to be mostly used
> for skb data or frag for rx part, see napi_alloc_skb() or some drivers
> like e1000, but with more drivers using the page_pool for skb rx frag,
> it seems skb data for tx is the main usecase.
>
> And the prepare and commit API added in the patchset seems to be mainly
> used for skb frag for tx part except af_packet.
>
> It seems it is not very clear which is the mostly used one; most likely
> the prepare and commit API might be the mostly used one if I have to
> guess as there might be more memory needed for skb frag than skb data.

Well one of the things I am noticing is that you have essentially two
API setups in the later patches.

In one you are calling the page_frag_alloc_align and then later
calling an abort function that is added later. In the other you have
the probe/commit approach. In my mind it might make sense to think
about breaking those up to be handled as two separate APIs rather than
trying to replace everything all at once.

> >
> >> +void *__page_frag_cache_prepare(struct page_frag_cache *nc, unsigned int fragsz,
> >> +                               struct page_frag *pfrag, gfp_t gfp_mask,
> >> +                               unsigned int align_mask)
> >>   {
> >>          unsigned long encoded_page = nc->encoded_page;
> >>          unsigned int size, offset;
> >> @@ -114,6 +136,8 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
> >>                  /* reset page count bias and offset to start of new frag */
> >>                  nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> >>                  nc->offset = 0;
> >> +       } else {
> >> +               page = encoded_page_decode_page(encoded_page);
> >>          }
> >>
> >>          size = PAGE_SIZE << encoded_page_decode_order(encoded_page);
> >
> > This makes no sense to me. Seems like there are scenarios where you
> > are grabbing the page even if you aren't going to use it? Why?
> >
> > I think you would be better off just waiting to the end and then
> > fetching it instead of trying to grab it and potentially throw it away
> > if there is no space left in the page. Otherwise what you might do is
> > something along the lines of:
> > pfrag->page = page ? : encoded_page_decode_page(encoded_page);
>
> But doesn't that mean an additional checking is needed to decide if we
> need to grab the page?
>
> But the './scripts/bloat-o-meter' does show some binary size shrink
> using the above.

You are probably correct on this one. I think your approach may be
better. I think the only case my approach would be optimizing for
would probably be the size > 4K which isn't appropriate anyway.

> >
> >
> >> @@ -132,8 +156,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
> >>                          return NULL;
> >>                  }
> >>
> >> -               page = encoded_page_decode_page(encoded_page);
> >> -
> >>                  if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> >>                          goto refill;
> >>
> >> @@ -148,15 +170,17 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
> >>
> >>                  /* reset page count bias and offset to start of new frag */
> >>                  nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> >> +               nc->offset = 0;
> >>                  offset = 0;
> >>          }
> >>
> >> -       nc->pagecnt_bias--;
> >> -       nc->offset = offset + fragsz;
> >> +       pfrag->page = page;
> >> +       pfrag->offset = offset;
> >> +       pfrag->size = size - offset;
> >
> > I really think we should still be moving the nc->offset forward at
> > least with each allocation. It seems like you end up doing two flavors
> > of commit, one with and one without the decrement of the bias. So I
> > would be okay with that being pulled out into some separate logic to
> > avoid the extra increment in the case of merging the pages. However in
> > both cases you need to move the offset, so I would recommend keeping
> > that bit there as it would allow us to essentially call this multiple
> > times without having to do a commit in between to keep the offset
> > correct. With that your commit logic only has to verify nothing
> > changes out from underneath us and then update the pagecnt_bias if
> > needed.
>
> The problem is that we don't really know how much the nc->offset
> need to be moved forward to and the caller needs the original offset
> for skb_fill_page_desc() related calling when prepare API is used as
> an example in 'Preparation & committing API' section of patch 13:

The thing is you really have 2 different APIs. You have one you were
doing which was an alloc/abort approach and another that is a
probe/commit approach. I think for the probe/commit you could probably
get away with using an "alloc" type approach with a size of 0 which
would correctly set the start of your offset and then you would need
to update it later once you know the total size for your commit. For
the probe/commit we could use the nc->offset as a kind of cookie to
verify we are working with the expected page and offset.

For the alloc/abort it would be something similar but more the
reverse. With that one we would need to have the size + offset and
then verify the current offset is equal to that before we allow
reverting the previous nc->offset update. The current patch set is a
bit too permissive on the abort in my opinion and should be verifying
that we are updating the correct offset.
Yunsheng Lin Oct. 21, 2024, 9:34 a.m. UTC | #4
On 2024/10/20 23:45, Alexander Duyck wrote:

...

> 
>>>
>>>
>>>> @@ -132,8 +156,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>>>                          return NULL;
>>>>                  }
>>>>
>>>> -               page = encoded_page_decode_page(encoded_page);
>>>> -
>>>>                  if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>>>>                          goto refill;
>>>>
>>>> @@ -148,15 +170,17 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>>>
>>>>                  /* reset page count bias and offset to start of new frag */
>>>>                  nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>>> +               nc->offset = 0;
>>>>                  offset = 0;
>>>>          }
>>>>
>>>> -       nc->pagecnt_bias--;
>>>> -       nc->offset = offset + fragsz;
>>>> +       pfrag->page = page;
>>>> +       pfrag->offset = offset;
>>>> +       pfrag->size = size - offset;
>>>
>>> I really think we should still be moving the nc->offset forward at
>>> least with each allocation. It seems like you end up doing two flavors
>>> of commit, one with and one without the decrement of the bias. So I
>>> would be okay with that being pulled out into some separate logic to
>>> avoid the extra increment in the case of merging the pages. However in
>>> both cases you need to move the offset, so I would recommend keeping
>>> that bit there as it would allow us to essentially call this multiple
>>> times without having to do a commit in between to keep the offset
>>> correct. With that your commit logic only has to verify nothing
>>> changes out from underneath us and then update the pagecnt_bias if
>>> needed.
>>
>> The problem is that we don't really know how much the nc->offset
>> need to be moved forward to and the caller needs the original offset
>> for skb_fill_page_desc() related calling when prepare API is used as
>> an example in 'Preparation & committing API' section of patch 13:
> 
> The thing is you really have 2 different APIs. You have one you were
> doing which was an alloc/abort approach and another that is a
> probe/commit approach. I think for the probe/commit you could probably
> get away with using an "alloc" type approach with a size of 0 which
> would correctly set the start of your offset and then you would need
> to update it later once you know the total size for your commit. For

It seems there are some issues with the above approach as below as I
can see for now:
1. when nc->encoded_page is 0, calling the "alloc" type API with
fragsz being zero may still allocate a new page from allocator, which
seems to against the purpose of probe API, right?

2. It doesn't allow the caller to specify a fragsz for probing, instead
   it relies on the caller to check if the size of the probed fragment is
   big enough for its use case.

> the probe/commit we could use the nc->offset as a kind of cookie to
> verify we are working with the expected page and offset.

I am not sure if I am following the above, but I should mention that
nc->offset is not updated for prepare/probe API because the original
offset might be used for calculating the truesize of the fragment
when commit API is called, and the offset returned to the caller might
need to be updated according to alignment requirement, so I am not sure
how nc->offset can be used to verify the exact offset here.

If it is really about catching misuse of the page_frag API, it might be
better to add something like nc->last_offset to record the offset of
allocated fragment under some config like PAGE_FRAG_DEBUG, as there
are other ways that the caller might mess up here like messing up the
allocation context assumption.

> 
> For the alloc/abort it would be something similar but more the
> reverse. With that one we would need to have the size + offset and
> then verify the current offset is equal to that before we allow
> reverting the previous nc->offset update. The current patch set is a
> bit too permissive on the abort in my opinion and should be verifying
> that we are updating the correct offset.

I am not sure if I understand what is your idea about how to do an
exact verifying for abort API here.
For abort API, it seems we can do an exact verifying if the 'va' is
also passed to the abort API as the nc->offset is already updated,
something like below:

static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
					 void *va, unsigned int fragsz)
{
        VM_BUG_ON((nc->offset - fragsz) !=
		  (encoded_page_decode_virt(nc->encoded_page) - va));

        nc->pagecnt_bias++;
        nc->offset -= fragsz;
}

But it also might mean we need to put page_frag_alloc_abort() in
page_frag_cache.c instead of a inline helper in page_frag_cache.h, as
the encoded_page_decode_virt() is a static function in c file. Or put
encoded_page_decode_virt() in the h file.
diff mbox series

Patch

diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 41a91df82631..feed99d0cddb 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -5,6 +5,7 @@ 
 
 #include <linux/bits.h>
 #include <linux/log2.h>
+#include <linux/mmdebug.h>
 #include <linux/mm_types_task.h>
 #include <linux/types.h>
 
@@ -39,8 +40,39 @@  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
 
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
-void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
-			      gfp_t gfp_mask, unsigned int align_mask);
+void *__page_frag_cache_prepare(struct page_frag_cache *nc, unsigned int fragsz,
+				struct page_frag *pfrag, gfp_t gfp_mask,
+				unsigned int align_mask);
+unsigned int __page_frag_cache_commit_noref(struct page_frag_cache *nc,
+					    struct page_frag *pfrag,
+					    unsigned int used_sz);
+
+static inline unsigned int __page_frag_cache_commit(struct page_frag_cache *nc,
+						    struct page_frag *pfrag,
+						    unsigned int used_sz)
+{
+	VM_BUG_ON(!nc->pagecnt_bias);
+	nc->pagecnt_bias--;
+
+	return __page_frag_cache_commit_noref(nc, pfrag, used_sz);
+}
+
+static inline void *__page_frag_alloc_align(struct page_frag_cache *nc,
+					    unsigned int fragsz, gfp_t gfp_mask,
+					    unsigned int align_mask)
+{
+	struct page_frag page_frag;
+	void *va;
+
+	va = __page_frag_cache_prepare(nc, fragsz, &page_frag, gfp_mask,
+				       align_mask);
+	if (unlikely(!va))
+		return NULL;
+
+	__page_frag_cache_commit(nc, &page_frag, fragsz);
+
+	return va;
+}
 
 static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
 					  unsigned int fragsz, gfp_t gfp_mask,
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index a36fd09bf275..a852523bc8ca 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -90,9 +90,31 @@  void __page_frag_cache_drain(struct page *page, unsigned int count)
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
-void *__page_frag_alloc_align(struct page_frag_cache *nc,
-			      unsigned int fragsz, gfp_t gfp_mask,
-			      unsigned int align_mask)
+unsigned int __page_frag_cache_commit_noref(struct page_frag_cache *nc,
+					    struct page_frag *pfrag,
+					    unsigned int used_sz)
+{
+	unsigned int orig_offset;
+
+	VM_BUG_ON(used_sz > pfrag->size);
+	VM_BUG_ON(pfrag->page != encoded_page_decode_page(nc->encoded_page));
+	VM_BUG_ON(pfrag->offset + pfrag->size >
+		  (PAGE_SIZE << encoded_page_decode_order(nc->encoded_page)));
+
+	/* pfrag->offset might be bigger than the nc->offset due to alignment */
+	VM_BUG_ON(nc->offset > pfrag->offset);
+
+	orig_offset = nc->offset;
+	nc->offset = pfrag->offset + used_sz;
+
+	/* Return true size back to caller considering the offset alignment */
+	return nc->offset - orig_offset;
+}
+EXPORT_SYMBOL(__page_frag_cache_commit_noref);
+
+void *__page_frag_cache_prepare(struct page_frag_cache *nc, unsigned int fragsz,
+				struct page_frag *pfrag, gfp_t gfp_mask,
+				unsigned int align_mask)
 {
 	unsigned long encoded_page = nc->encoded_page;
 	unsigned int size, offset;
@@ -114,6 +136,8 @@  void *__page_frag_alloc_align(struct page_frag_cache *nc,
 		/* reset page count bias and offset to start of new frag */
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
 		nc->offset = 0;
+	} else {
+		page = encoded_page_decode_page(encoded_page);
 	}
 
 	size = PAGE_SIZE << encoded_page_decode_order(encoded_page);
@@ -132,8 +156,6 @@  void *__page_frag_alloc_align(struct page_frag_cache *nc,
 			return NULL;
 		}
 
-		page = encoded_page_decode_page(encoded_page);
-
 		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
 			goto refill;
 
@@ -148,15 +170,17 @@  void *__page_frag_alloc_align(struct page_frag_cache *nc,
 
 		/* reset page count bias and offset to start of new frag */
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+		nc->offset = 0;
 		offset = 0;
 	}
 
-	nc->pagecnt_bias--;
-	nc->offset = offset + fragsz;
+	pfrag->page = page;
+	pfrag->offset = offset;
+	pfrag->size = size - offset;
 
 	return encoded_page_decode_virt(encoded_page) + offset;
 }
-EXPORT_SYMBOL(__page_frag_alloc_align);
+EXPORT_SYMBOL(__page_frag_cache_prepare);
 
 /*
  * Frees a page fragment allocated out of either a compound or order 0 page.