diff mbox series

igb: avoid premature Rx buffer reuse

Message ID 1609990905-29220-1-git-send-email-lirongqing@baidu.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series igb: avoid premature Rx buffer reuse | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Guessed tree name to be net-next
netdev/subject_prefix warning Target tree name not specified in the subject
netdev/cc_maintainers warning 10 maintainers not CCed: ast@kernel.org jesse.brandeburg@intel.com daniel@iogearbox.net bpf@vger.kernel.org hawk@kernel.org kuba@kernel.org sven.auhagen@voleatech.de john.fastabend@gmail.com davem@davemloft.net anthony.l.nguyen@intel.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 17 this patch: 17
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning CHECK: Alignment should match open parenthesis WARNING: line length of 82 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 17 this patch: 17
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Li RongQing Jan. 7, 2021, 3:41 a.m. UTC
The page recycle code, incorrectly, relied on that a page fragment
could not be freed inside xdp_do_redirect(). This assumption leads to
that page fragments that are used by the stack/XDP redirect can be
reused and overwritten.

To avoid this, store the page count prior invoking xdp_do_redirect().

Fixes: 9cbc948b5a20 ("igb: add XDP support")
Signed-off-by: Li RongQing <lirongqing@baidu.com>
Cc: Björn Töpel <bjorn.topel@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

Comments

Alexander Duyck Jan. 11, 2021, 8:53 p.m. UTC | #1
On Wed, Jan 6, 2021 at 7:53 PM Li RongQing <lirongqing@baidu.com> wrote:
>
> The page recycle code, incorrectly, relied on that a page fragment
> could not be freed inside xdp_do_redirect(). This assumption leads to
> that page fragments that are used by the stack/XDP redirect can be
> reused and overwritten.
>
> To avoid this, store the page count prior invoking xdp_do_redirect().
>
> Fixes: 9cbc948b5a20 ("igb: add XDP support")
> Signed-off-by: Li RongQing <lirongqing@baidu.com>
> Cc: Björn Töpel <bjorn.topel@intel.com>

I'm not sure what you are talking about here. We allow for a 0 to 1
count difference in the pagecount bias. The idea is the driver should
be holding onto at least one reference from the driver at all times.
Are you saying that is not the case?

As far as the code itself we hold onto the page as long as our
difference does not exceed 1. So specifically if the XDP call is
freeing the page the page itself should still be valid as the
reference count shouldn't drop below 1, and in that case the driver
should be holding that one reference to the page.

When we perform our check we are performing it such at output of
either 0 if the page is freed, or 1 if the page is not freed are
acceptable for us to allow reuse. The key bit is in igb_clean_rx_irq
where we will flip the buffer for the IGB_XDP_TX | IGB_XDP_REDIR case
and just increment the pagecnt_bias indicating that the page was
dropped in the non-flipped case.

Are you perhaps seeing a function that is returning an error and still
consuming the page? If so that might explain what you are seeing.
However the bug would be in the other driver not this one. The
xdp_do_redirect function is not supposed to free the page if it
returns an error. It is supposed to leave that up to the function that
called xdp_do_redirect.

> ---
>  drivers/net/ethernet/intel/igb/igb_main.c | 22 +++++++++++++++-------
>  1 file changed, 15 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
> index 03f78fdb0dcd..3e0d903cf919 100644
> --- a/drivers/net/ethernet/intel/igb/igb_main.c
> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> @@ -8232,7 +8232,8 @@ static inline bool igb_page_is_reserved(struct page *page)
>         return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
>  }
>
> -static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
> +static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
> +                                                                int rx_buf_pgcnt)
>  {
>         unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
>         struct page *page = rx_buffer->page;
> @@ -8243,7 +8244,7 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
>
>  #if (PAGE_SIZE < 8192)
>         /* if we are only owner of page we can reuse it */
> -       if (unlikely((page_ref_count(page) - pagecnt_bias) > 1))
> +       if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1))
>                 return false;
>  #else
>  #define IGB_LAST_OFFSET \

So the difference between page_ref_count and pagecnt_bias should be 1
or 0. The 0 would assume the page fragment was freed. What is the
value you are seeing here in the error case? My concern here is that
the pagecnt_bias may be getting incremented because IGB_XDP_CONSUMED
is being returned from igb_run_xdp instead of IGB_XDP_REDIR and so it
thinks the buffer was dropped instead of being transmitted.

> @@ -8632,11 +8633,17 @@ static unsigned int igb_rx_offset(struct igb_ring *rx_ring)
>  }
>
>  static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring,
> -                                              const unsigned int size)
> +                                              const unsigned int size, int *rx_buf_pgcnt)
>  {
>         struct igb_rx_buffer *rx_buffer;
>
>         rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
> +       *rx_buf_pgcnt =
> +#if (PAGE_SIZE < 8192)
> +               page_count(rx_buffer->page);
> +#else
> +               0;
> +#endif
>         prefetchw(rx_buffer->page);
>
>         /* we are reusing so sync this buffer for CPU use */

It should be page_ref_count used here, not page_count. Also caching
this value can be problematic since the value is supposed to be an
atomic count.

> @@ -8652,9 +8659,9 @@ static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring,
>  }
>
>  static void igb_put_rx_buffer(struct igb_ring *rx_ring,
> -                             struct igb_rx_buffer *rx_buffer)
> +                             struct igb_rx_buffer *rx_buffer, int rx_buf_pgcnt)
>  {
> -       if (igb_can_reuse_rx_page(rx_buffer)) {
> +       if (igb_can_reuse_rx_page(rx_buffer, rx_buf_pgcnt)) {
>                 /* hand second half of page back to the ring */
>                 igb_reuse_rx_page(rx_ring, rx_buffer);
>         } else {
> @@ -8681,6 +8688,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
>         u16 cleaned_count = igb_desc_unused(rx_ring);
>         unsigned int xdp_xmit = 0;
>         struct xdp_buff xdp;
> +       int rx_buf_pgcnt;
>
>         xdp.rxq = &rx_ring->xdp_rxq;
>
> @@ -8711,7 +8719,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
>                  */
>                 dma_rmb();
>
> -               rx_buffer = igb_get_rx_buffer(rx_ring, size);
> +               rx_buffer = igb_get_rx_buffer(rx_ring, size, &rx_buf_pgcnt);
>
>                 /* retrieve a buffer from the ring */
>                 if (!skb) {
> @@ -8754,7 +8762,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
>                         break;
>                 }
>
> -               igb_put_rx_buffer(rx_ring, rx_buffer);
> +               igb_put_rx_buffer(rx_ring, rx_buffer, rx_buf_pgcnt);
>                 cleaned_count++;
>
>                 /* fetch next buffer in frame if non-eop */

After reviewing the patch I don't see what this is solving. The
buffers should be reusable as long as the refcount is 1 or 0. We
assume with 1 that the stack/XDP is holding onto the page, and with 0
the page was freed.

I would need more info on the actual issue. If nothing else it might
be useful to have an example where you print out the page_ref_count
versus the pagecnt_bias at a few points to verify exactly what is
going on. As I said before if the issue is the xdp_do_redirect
returning an error and still consuming the page then the bug is
elsewhere and not here.
Li RongQing Jan. 12, 2021, 2:54 a.m. UTC | #2
> -----Original Message-----
> From: Alexander Duyck [mailto:alexander.duyck@gmail.com]
> Sent: Tuesday, January 12, 2021 4:54 AM
> To: Li,Rongqing <lirongqing@baidu.com>
> Cc: Netdev <netdev@vger.kernel.org>; intel-wired-lan
> <intel-wired-lan@lists.osuosl.org>; Björn Töpel <bjorn.topel@intel.com>
> Subject: Re: [PATCH] igb: avoid premature Rx buffer reuse
> 
> On Wed, Jan 6, 2021 at 7:53 PM Li RongQing <lirongqing@baidu.com> wrote:
> >
> > The page recycle code, incorrectly, relied on that a page fragment
> > could not be freed inside xdp_do_redirect(). This assumption leads to
> > that page fragments that are used by the stack/XDP redirect can be
> > reused and overwritten.
> >
> > To avoid this, store the page count prior invoking xdp_do_redirect().
> >
> > Fixes: 9cbc948b5a20 ("igb: add XDP support")
> > Signed-off-by: Li RongQing <lirongqing@baidu.com>
> > Cc: Björn Töpel <bjorn.topel@intel.com>
> 
> I'm not sure what you are talking about here. We allow for a 0 to 1 count
> difference in the pagecount bias. The idea is the driver should be holding onto
> at least one reference from the driver at all times.
> Are you saying that is not the case?
> 
> As far as the code itself we hold onto the page as long as our difference does
> not exceed 1. So specifically if the XDP call is freeing the page the page itself
> should still be valid as the reference count shouldn't drop below 1, and in that
> case the driver should be holding that one reference to the page.
> 
> When we perform our check we are performing it such at output of either 0 if
> the page is freed, or 1 if the page is not freed are acceptable for us to allow
> reuse. The key bit is in igb_clean_rx_irq where we will flip the buffer for the
> IGB_XDP_TX | IGB_XDP_REDIR case and just increment the pagecnt_bias
> indicating that the page was dropped in the non-flipped case.
> 
> Are you perhaps seeing a function that is returning an error and still consuming
> the page? If so that might explain what you are seeing.
> However the bug would be in the other driver not this one. The
> xdp_do_redirect function is not supposed to free the page if it returns an error.
> It is supposed to leave that up to the function that called xdp_do_redirect.
> 
> > ---
> >  drivers/net/ethernet/intel/igb/igb_main.c | 22 +++++++++++++++-------
> >  1 file changed, 15 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/intel/igb/igb_main.c
> > b/drivers/net/ethernet/intel/igb/igb_main.c
> > index 03f78fdb0dcd..3e0d903cf919 100644
> > --- a/drivers/net/ethernet/intel/igb/igb_main.c
> > +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> > @@ -8232,7 +8232,8 @@ static inline bool igb_page_is_reserved(struct
> page *page)
> >         return (page_to_nid(page) != numa_mem_id()) ||
> > page_is_pfmemalloc(page);  }
> >
> > -static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
> > +static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
> > +
> int
> > +rx_buf_pgcnt)
> >  {
> >         unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
> >         struct page *page = rx_buffer->page; @@ -8243,7 +8244,7 @@
> > static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
> >
> >  #if (PAGE_SIZE < 8192)
> >         /* if we are only owner of page we can reuse it */
> > -       if (unlikely((page_ref_count(page) - pagecnt_bias) > 1))
> > +       if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1))
> >                 return false;
> >  #else
> I would need more info on the actual issue. If nothing else it might be useful to
> have an example where you print out the page_ref_count versus the
> pagecnt_bias at a few points to verify exactly what is going on. As I said before
> if the issue is the xdp_do_redirect returning an error and still consuming the
> page then the bug is elsewhere and not here.


This patch is same as 75aab4e10ae6a4593a60f66d13de755d4e91f400


commit 75aab4e10ae6a4593a60f66d13de755d4e91f400
Author: Björn Töpel <bjorn.topel@intel.com>
Date:   Tue Aug 25 19:27:34 2020 +0200

    i40e: avoid premature Rx buffer reuse
    
    The page recycle code, incorrectly, relied on that a page fragment
    could not be freed inside xdp_do_redirect(). This assumption leads to
    that page fragments that are used by the stack/XDP redirect can be
    reused and overwritten.
    
    To avoid this, store the page count prior invoking xdp_do_redirect().
    
    Longer explanation:
    
    Intel NICs have a recycle mechanism. The main idea is that a page is
    split into two parts. One part is owned by the driver, one part might
    be owned by someone else, such as the stack.
    
    t0: Page is allocated, and put on the Rx ring
                  +---------------
    used by NIC ->| upper buffer
    (rx_buffer)   +---------------
                  | lower buffer
                  +---------------
      page count  == USHRT_MAX
      rx_buffer->pagecnt_bias == USHRT_MAX
    
    t1: Buffer is received, and passed to the stack (e.g.)
                  +---------------
                  | upper buff (skb)
                  +---------------
    used by NIC ->| lower buffer
    (rx_buffer)   +---------------
      page count  == USHRT_MAX
      rx_buffer->pagecnt_bias == USHRT_MAX - 1
    t2: Buffer is received, and redirected
                  +---------------
                  | upper buff (skb)
                  +---------------
    used by NIC ->| lower buffer
    (rx_buffer)   +---------------
    
    Now, prior calling xdp_do_redirect():
      page count  == USHRT_MAX
      rx_buffer->pagecnt_bias == USHRT_MAX - 2
    
    This means that buffer *cannot* be flipped/reused, because the skb is
    still using it.
    
    The problem arises when xdp_do_redirect() actually frees the
    segment. Then we get:
      page count  == USHRT_MAX - 1
      rx_buffer->pagecnt_bias == USHRT_MAX - 2
    
    From a recycle perspective, the buffer can be flipped and reused,
    which means that the skb data area is passed to the Rx HW ring!
    
    To work around this, the page count is stored prior calling
    xdp_do_redirect().
    
    Note that this is not optimal, since the NIC could actually reuse the
    "lower buffer" again. However, then we need to track whether
    XDP_REDIRECT consumed the buffer or not.
    
    Fixes: d9314c474d4f ("i40e: add support for XDP_REDIRECT")
    Reported-and-analyzed-by: Li RongQing <lirongqing@baidu.com>
    Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
    Tested-by: George Kuruvinakunnel <george.kuruvinakunnel@intel.com>
    Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>


Thanks

-Li
Alexander Duyck Jan. 12, 2021, 9:22 p.m. UTC | #3
On Mon, Jan 11, 2021 at 6:54 PM Li,Rongqing <lirongqing@baidu.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Alexander Duyck [mailto:alexander.duyck@gmail.com]
> > Sent: Tuesday, January 12, 2021 4:54 AM
> > To: Li,Rongqing <lirongqing@baidu.com>
> > Cc: Netdev <netdev@vger.kernel.org>; intel-wired-lan
> > <intel-wired-lan@lists.osuosl.org>; Björn Töpel <bjorn.topel@intel.com>
> > Subject: Re: [PATCH] igb: avoid premature Rx buffer reuse
> >
> > On Wed, Jan 6, 2021 at 7:53 PM Li RongQing <lirongqing@baidu.com> wrote:
> > >
> > > The page recycle code, incorrectly, relied on that a page fragment
> > > could not be freed inside xdp_do_redirect(). This assumption leads to
> > > that page fragments that are used by the stack/XDP redirect can be
> > > reused and overwritten.
> > >
> > > To avoid this, store the page count prior invoking xdp_do_redirect().
> > >
> > > Fixes: 9cbc948b5a20 ("igb: add XDP support")
> > > Signed-off-by: Li RongQing <lirongqing@baidu.com>
> > > Cc: Björn Töpel <bjorn.topel@intel.com>
> >
> > I'm not sure what you are talking about here. We allow for a 0 to 1 count
> > difference in the pagecount bias. The idea is the driver should be holding onto
> > at least one reference from the driver at all times.
> > Are you saying that is not the case?
> >
> > As far as the code itself we hold onto the page as long as our difference does
> > not exceed 1. So specifically if the XDP call is freeing the page the page itself
> > should still be valid as the reference count shouldn't drop below 1, and in that
> > case the driver should be holding that one reference to the page.
> >
> > When we perform our check we are performing it such at output of either 0 if
> > the page is freed, or 1 if the page is not freed are acceptable for us to allow
> > reuse. The key bit is in igb_clean_rx_irq where we will flip the buffer for the
> > IGB_XDP_TX | IGB_XDP_REDIR case and just increment the pagecnt_bias
> > indicating that the page was dropped in the non-flipped case.
> >
> > Are you perhaps seeing a function that is returning an error and still consuming
> > the page? If so that might explain what you are seeing.
> > However the bug would be in the other driver not this one. The
> > xdp_do_redirect function is not supposed to free the page if it returns an error.
> > It is supposed to leave that up to the function that called xdp_do_redirect.
> >
> > > ---
> > >  drivers/net/ethernet/intel/igb/igb_main.c | 22 +++++++++++++++-------
> > >  1 file changed, 15 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/drivers/net/ethernet/intel/igb/igb_main.c
> > > b/drivers/net/ethernet/intel/igb/igb_main.c
> > > index 03f78fdb0dcd..3e0d903cf919 100644
> > > --- a/drivers/net/ethernet/intel/igb/igb_main.c
> > > +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> > > @@ -8232,7 +8232,8 @@ static inline bool igb_page_is_reserved(struct
> > page *page)
> > >         return (page_to_nid(page) != numa_mem_id()) ||
> > > page_is_pfmemalloc(page);  }
> > >
> > > -static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
> > > +static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
> > > +
> > int
> > > +rx_buf_pgcnt)
> > >  {
> > >         unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
> > >         struct page *page = rx_buffer->page; @@ -8243,7 +8244,7 @@
> > > static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
> > >
> > >  #if (PAGE_SIZE < 8192)
> > >         /* if we are only owner of page we can reuse it */
> > > -       if (unlikely((page_ref_count(page) - pagecnt_bias) > 1))
> > > +       if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1))
> > >                 return false;
> > >  #else
> > I would need more info on the actual issue. If nothing else it might be useful to
> > have an example where you print out the page_ref_count versus the
> > pagecnt_bias at a few points to verify exactly what is going on. As I said before
> > if the issue is the xdp_do_redirect returning an error and still consuming the
> > page then the bug is elsewhere and not here.
>
>
> This patch is same as 75aab4e10ae6a4593a60f66d13de755d4e91f400
>
>
> commit 75aab4e10ae6a4593a60f66d13de755d4e91f400
> Author: Björn Töpel <bjorn.topel@intel.com>
> Date:   Tue Aug 25 19:27:34 2020 +0200
>
>     i40e: avoid premature Rx buffer reuse
>
>     The page recycle code, incorrectly, relied on that a page fragment
>     could not be freed inside xdp_do_redirect(). This assumption leads to
>     that page fragments that are used by the stack/XDP redirect can be
>     reused and overwritten.
>
>     To avoid this, store the page count prior invoking xdp_do_redirect().
>
>     Longer explanation:
>
>     Intel NICs have a recycle mechanism. The main idea is that a page is
>     split into two parts. One part is owned by the driver, one part might
>     be owned by someone else, such as the stack.
>
>     t0: Page is allocated, and put on the Rx ring
>                   +---------------
>     used by NIC ->| upper buffer
>     (rx_buffer)   +---------------
>                   | lower buffer
>                   +---------------
>       page count  == USHRT_MAX
>       rx_buffer->pagecnt_bias == USHRT_MAX
>
>     t1: Buffer is received, and passed to the stack (e.g.)
>                   +---------------
>                   | upper buff (skb)
>                   +---------------
>     used by NIC ->| lower buffer
>     (rx_buffer)   +---------------
>       page count  == USHRT_MAX
>       rx_buffer->pagecnt_bias == USHRT_MAX - 1
>     t2: Buffer is received, and redirected
>                   +---------------
>                   | upper buff (skb)
>                   +---------------
>     used by NIC ->| lower buffer
>     (rx_buffer)   +---------------
>
>     Now, prior calling xdp_do_redirect():
>       page count  == USHRT_MAX
>       rx_buffer->pagecnt_bias == USHRT_MAX - 2
>
>     This means that buffer *cannot* be flipped/reused, because the skb is
>     still using it.
>
>     The problem arises when xdp_do_redirect() actually frees the
>     segment. Then we get:
>       page count  == USHRT_MAX - 1
>       rx_buffer->pagecnt_bias == USHRT_MAX - 2
>
>     From a recycle perspective, the buffer can be flipped and reused,
>     which means that the skb data area is passed to the Rx HW ring!
>
>     To work around this, the page count is stored prior calling
>     xdp_do_redirect().
>
>     Note that this is not optimal, since the NIC could actually reuse the
>     "lower buffer" again. However, then we need to track whether
>     XDP_REDIRECT consumed the buffer or not.
>
>     Fixes: d9314c474d4f ("i40e: add support for XDP_REDIRECT")
>     Reported-and-analyzed-by: Li RongQing <lirongqing@baidu.com>
>     Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
>     Tested-by: George Kuruvinakunnel <george.kuruvinakunnel@intel.com>
>     Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
>
>
> Thanks
>
> -Li

Okay, this explanation makes much more sense. Could you please either
include this explanation in your patch, or include a reference to this
patch as this explains clearly what the issue is while yours didn't
and led to the confusion as I was assuming the freeing was happening
closer to the t0 case, and really the problem is t1.

Thanks.

- Alex
Li RongQing Jan. 13, 2021, 1:36 a.m. UTC | #4
> -----Original Message-----
> From: Alexander Duyck [mailto:alexander.duyck@gmail.com]
> Sent: Wednesday, January 13, 2021 5:23 AM
> To: Li,Rongqing <lirongqing@baidu.com>
> Cc: Netdev <netdev@vger.kernel.org>; intel-wired-lan
> <intel-wired-lan@lists.osuosl.org>; Björn Töpel <bjorn.topel@intel.com>
> Subject: Re: [PATCH] igb: avoid premature Rx buffer r 
> Okay, this explanation makes much more sense. Could you please either include
> this explanation in your patch, or include a reference to this patch as this
> explains clearly what the issue is while yours didn't and led to the confusion as I
> was assuming the freeing was happening closer to the t0 case, and really the
> problem is t1.
> 
> Thanks.
> 
> - Alex


Ok, I will send V2

Thanks

-Li
Jambekar, Vishakha March 12, 2021, 10:24 a.m. UTC | #5
> From: Intel-wired-lan <intel-wired-lan-bounces@osuosl.org> On Behalf Of
> Li,Rongqing
> Sent: Tuesday, January 12, 2021 8:24 AM
> To: Alexander Duyck <alexander.duyck@gmail.com>
> Cc: Netdev <netdev@vger.kernel.org>; Topel, Bjorn <bjorn.topel@intel.com>;
> intel-wired-lan <intel-wired-lan@lists.osuosl.org>
> Subject: Re: [Intel-wired-lan] [PATCH] igb: avoid premature Rx buffer reuse
> 
> 
> 
> > -----Original Message-----
> > From: Alexander Duyck [mailto:alexander.duyck@gmail.com]
> > Sent: Tuesday, January 12, 2021 4:54 AM
> > To: Li,Rongqing <lirongqing@baidu.com>
> > Cc: Netdev <netdev@vger.kernel.org>; intel-wired-lan
> > <intel-wired-lan@lists.osuosl.org>; Björn Töpel
> > <bjorn.topel@intel.com>
> > Subject: Re: [PATCH] igb: avoid premature Rx buffer reuse
> >
> > On Wed, Jan 6, 2021 at 7:53 PM Li RongQing <lirongqing@baidu.com> wrote:
> > >
> > > The page recycle code, incorrectly, relied on that a page fragment
> > > could not be freed inside xdp_do_redirect(). This assumption leads
> > > to that page fragments that are used by the stack/XDP redirect can
> > > be reused and overwritten.
> > >
> > > To avoid this, store the page count prior invoking xdp_do_redirect().
> > >
> > > Fixes: 9cbc948b5a20 ("igb: add XDP support")
> > > Signed-off-by: Li RongQing <lirongqing@baidu.com>
> > > Cc: Björn Töpel <bjorn.topel@intel.com>
> >
> > I'm not sure what you are talking about here. We allow for a 0 to 1
> > count difference in the pagecount bias. The idea is the driver should
> > be holding onto at least one reference from the driver at all times.
> > Are you saying that is not the case?
> >
> > As far as the code itself we hold onto the page as long as our
> > difference does not exceed 1. So specifically if the XDP call is
> > freeing the page the page itself should still be valid as the
> > reference count shouldn't drop below 1, and in that case the driver should be
> holding that one reference to the page.
> >
> > When we perform our check we are performing it such at output of
> > either 0 if the page is freed, or 1 if the page is not freed are
> > acceptable for us to allow reuse. The key bit is in igb_clean_rx_irq
> > where we will flip the buffer for the IGB_XDP_TX | IGB_XDP_REDIR case
> > and just increment the pagecnt_bias indicating that the page was dropped in
> the non-flipped case.
> >
> > Are you perhaps seeing a function that is returning an error and still
> > consuming the page? If so that might explain what you are seeing.
> > However the bug would be in the other driver not this one. The
> > xdp_do_redirect function is not supposed to free the page if it returns an
> error.
> > It is supposed to leave that up to the function that called xdp_do_redirect.
> >
> > > ---
> > >  drivers/net/ethernet/intel/igb/igb_main.c | 22
> > > +++++++++++++++-------
> > >  1 file changed, 15 insertions(+), 7 deletions(-)
> > >
> Tested-by: Vishakha Jambekar <vishakha.jambekar@intel.com>
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 03f78fdb0dcd..3e0d903cf919 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -8232,7 +8232,8 @@  static inline bool igb_page_is_reserved(struct page *page)
 	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
-static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
+static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
+								 int rx_buf_pgcnt)
 {
 	unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
 	struct page *page = rx_buffer->page;
@@ -8243,7 +8244,7 @@  static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
 
 #if (PAGE_SIZE < 8192)
 	/* if we are only owner of page we can reuse it */
-	if (unlikely((page_ref_count(page) - pagecnt_bias) > 1))
+	if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1))
 		return false;
 #else
 #define IGB_LAST_OFFSET \
@@ -8632,11 +8633,17 @@  static unsigned int igb_rx_offset(struct igb_ring *rx_ring)
 }
 
 static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring,
-					       const unsigned int size)
+					       const unsigned int size, int *rx_buf_pgcnt)
 {
 	struct igb_rx_buffer *rx_buffer;
 
 	rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
+	*rx_buf_pgcnt =
+#if (PAGE_SIZE < 8192)
+		page_count(rx_buffer->page);
+#else
+		0;
+#endif
 	prefetchw(rx_buffer->page);
 
 	/* we are reusing so sync this buffer for CPU use */
@@ -8652,9 +8659,9 @@  static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring,
 }
 
 static void igb_put_rx_buffer(struct igb_ring *rx_ring,
-			      struct igb_rx_buffer *rx_buffer)
+			      struct igb_rx_buffer *rx_buffer, int rx_buf_pgcnt)
 {
-	if (igb_can_reuse_rx_page(rx_buffer)) {
+	if (igb_can_reuse_rx_page(rx_buffer, rx_buf_pgcnt)) {
 		/* hand second half of page back to the ring */
 		igb_reuse_rx_page(rx_ring, rx_buffer);
 	} else {
@@ -8681,6 +8688,7 @@  static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 	u16 cleaned_count = igb_desc_unused(rx_ring);
 	unsigned int xdp_xmit = 0;
 	struct xdp_buff xdp;
+	int rx_buf_pgcnt;
 
 	xdp.rxq = &rx_ring->xdp_rxq;
 
@@ -8711,7 +8719,7 @@  static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 		 */
 		dma_rmb();
 
-		rx_buffer = igb_get_rx_buffer(rx_ring, size);
+		rx_buffer = igb_get_rx_buffer(rx_ring, size, &rx_buf_pgcnt);
 
 		/* retrieve a buffer from the ring */
 		if (!skb) {
@@ -8754,7 +8762,7 @@  static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 			break;
 		}
 
-		igb_put_rx_buffer(rx_ring, rx_buffer);
+		igb_put_rx_buffer(rx_ring, rx_buffer, rx_buf_pgcnt);
 		cleaned_count++;
 
 		/* fetch next buffer in frame if non-eop */