Message ID | 20230124124300.94886-1-nbd@nbd.name (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | net: page_pool: fix refcounting issues with fragmented allocation | expand |
Hi Felix, ++cc Alexander and Yunsheng. Thanks for the report On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: > > While testing fragmented page_pool allocation in the mt76 driver, I was able > to reliably trigger page refcount underflow issues, which did not occur with > full-page page_pool allocation. > It appears to me, that handling refcounting in two separate counters > (page->pp_frag_count and page refcount) is racy when page refcount gets > incremented by code dealing with skb fragments directly, and > page_pool_return_skb_page is called multiple times for the same fragment. > > Dropping page->pp_frag_count and relying entirely on the page refcount makes > these underflow issues and crashes go away. > This has been discussed here [1]. TL;DR changing this to page refcount might blow up in other colorful ways. Can we look closer and figure out why the underflow happens? [1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/ Thanks /Ilias > Cc: Lorenzo Bianconi <lorenzo@kernel.org> > Signed-off-by: Felix Fietkau <nbd@nbd.name> > --- > include/linux/mm_types.h | 17 +++++------------ > include/net/page_pool.h | 19 ++++--------------- > net/core/page_pool.c | 12 ++++-------- > 3 files changed, 13 insertions(+), 35 deletions(-) > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index 9757067c3053..96ec3b19a86d 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -125,18 +125,11 @@ struct page { > struct page_pool *pp; > unsigned long _pp_mapping_pad; > unsigned long dma_addr; > - union { > - /** > - * dma_addr_upper: might require a 64-bit > - * value on 32-bit architectures. > - */ > - unsigned long dma_addr_upper; > - /** > - * For frag page support, not supported in > - * 32-bit architectures with 64-bit DMA. > - */ > - atomic_long_t pp_frag_count; > - }; > + /** > + * dma_addr_upper: might require a 64-bit > + * value on 32-bit architectures. > + */ > + unsigned long dma_addr_upper; > }; > struct { /* Tail pages of compound page */ > unsigned long compound_head; /* Bit zero is set */ > diff --git a/include/net/page_pool.h b/include/net/page_pool.h > index 813c93499f20..28e1fdbdcd53 100644 > --- a/include/net/page_pool.h > +++ b/include/net/page_pool.h > @@ -279,14 +279,14 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, > > static inline void page_pool_fragment_page(struct page *page, long nr) > { > - atomic_long_set(&page->pp_frag_count, nr); > + page_ref_add(page, nr); > } > > static inline long page_pool_defrag_page(struct page *page, long nr) > { > long ret; > > - /* If nr == pp_frag_count then we have cleared all remaining > + /* If nr == page_ref_count then we have cleared all remaining > * references to the page. No need to actually overwrite it, instead > * we can leave this to be overwritten by the calling function. > * > @@ -295,22 +295,14 @@ static inline long page_pool_defrag_page(struct page *page, long nr) > * especially when dealing with a page that may be partitioned > * into only 2 or 3 pieces. > */ > - if (atomic_long_read(&page->pp_frag_count) == nr) > + if (page_ref_count(page) == nr) > return 0; > > - ret = atomic_long_sub_return(nr, &page->pp_frag_count); > + ret = page_ref_sub_return(page, nr); > WARN_ON(ret < 0); > return ret; > } > > -static inline bool page_pool_is_last_frag(struct page_pool *pool, > - struct page *page) > -{ > - /* If fragments aren't enabled or count is 0 we were the last user */ > - return !(pool->p.flags & PP_FLAG_PAGE_FRAG) || > - (page_pool_defrag_page(page, 1) == 0); > -} > - > static inline void page_pool_put_page(struct page_pool *pool, > struct page *page, > unsigned int dma_sync_size, > @@ -320,9 +312,6 @@ static inline void page_pool_put_page(struct page_pool *pool, > * allow registering MEM_TYPE_PAGE_POOL, but shield linker. > */ > #ifdef CONFIG_PAGE_POOL > - if (!page_pool_is_last_frag(pool, page)) > - return; > - > page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct); > #endif > } > diff --git a/net/core/page_pool.c b/net/core/page_pool.c > index 9b203d8660e4..0defcadae225 100644 > --- a/net/core/page_pool.c > +++ b/net/core/page_pool.c > @@ -25,7 +25,7 @@ > #define DEFER_TIME (msecs_to_jiffies(1000)) > #define DEFER_WARN_INTERVAL (60 * HZ) > > -#define BIAS_MAX LONG_MAX > +#define BIAS_MAX(pool) (PAGE_SIZE << ((pool)->p.order)) > > #ifdef CONFIG_PAGE_POOL_STATS > /* alloc_stat_inc is intended to be used in softirq context */ > @@ -619,10 +619,6 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, > for (i = 0; i < count; i++) { > struct page *page = virt_to_head_page(data[i]); > > - /* It is not the last user for the page frag case */ > - if (!page_pool_is_last_frag(pool, page)) > - continue; > - > page = __page_pool_put_page(pool, page, -1, false); > /* Approved for bulk recycling in ptr_ring cache */ > if (page) > @@ -659,7 +655,7 @@ EXPORT_SYMBOL(page_pool_put_page_bulk); > static struct page *page_pool_drain_frag(struct page_pool *pool, > struct page *page) > { > - long drain_count = BIAS_MAX - pool->frag_users; > + long drain_count = BIAS_MAX(pool) - pool->frag_users; > > /* Some user is still using the page frag */ > if (likely(page_pool_defrag_page(page, drain_count))) > @@ -678,7 +674,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool, > > static void page_pool_free_frag(struct page_pool *pool) > { > - long drain_count = BIAS_MAX - pool->frag_users; > + long drain_count = BIAS_MAX(pool) - pool->frag_users; > struct page *page = pool->frag_page; > > pool->frag_page = NULL; > @@ -724,7 +720,7 @@ struct page *page_pool_alloc_frag(struct page_pool *pool, > pool->frag_users = 1; > *offset = 0; > pool->frag_offset = size; > - page_pool_fragment_page(page, BIAS_MAX); > + page_pool_fragment_page(page, BIAS_MAX(pool)); > return page; > } > > -- > 2.39.0 >
On Tue, 2023-01-24 at 16:11 +0200, Ilias Apalodimas wrote: > Hi Felix, > > ++cc Alexander and Yunsheng. > > Thanks for the report > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able > > to reliably trigger page refcount underflow issues, which did not occur with > > full-page page_pool allocation. > > It appears to me, that handling refcounting in two separate counters > > (page->pp_frag_count and page refcount) is racy when page refcount gets > > incremented by code dealing with skb fragments directly, and > > page_pool_return_skb_page is called multiple times for the same fragment. > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes > > these underflow issues and crashes go away. > > > > This has been discussed here [1]. TL;DR changing this to page > refcount might blow up in other colorful ways. Can we look closer and > figure out why the underflow happens? > > [1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/ > > Thanks > /Ilias > > The logic should be safe in terms of the page pool itself as it should be holding one reference to the page while the pp_frag_count is non- zero. That one reference is what keeps the two halfs in sync as the page shouldn't be able to be freed until we exhaust the pp_frag_count. To have an underflow there are two possible scenarios. One is that either put_page or free_page is being called somewhere that the page_pool freeing functions should be used. The other possibility is that a pp_frag_count reference was taken somewhere a page reference should have. Do we have a backtrace for the spots that are showing this underrun? If nothing else we may want to look at tracking down the spots that are freeing the page pool pages via put_page or free_page to determine what paths these pages are taking.
On 24.01.23 16:57, Alexander H Duyck wrote: > On Tue, 2023-01-24 at 16:11 +0200, Ilias Apalodimas wrote: >> Hi Felix, >> >> ++cc Alexander and Yunsheng. >> >> Thanks for the report >> >> On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: >> > >> > While testing fragmented page_pool allocation in the mt76 driver, I was able >> > to reliably trigger page refcount underflow issues, which did not occur with >> > full-page page_pool allocation. >> > It appears to me, that handling refcounting in two separate counters >> > (page->pp_frag_count and page refcount) is racy when page refcount gets >> > incremented by code dealing with skb fragments directly, and >> > page_pool_return_skb_page is called multiple times for the same fragment. >> > >> > Dropping page->pp_frag_count and relying entirely on the page refcount makes >> > these underflow issues and crashes go away. >> > >> >> This has been discussed here [1]. TL;DR changing this to page >> refcount might blow up in other colorful ways. Can we look closer and >> figure out why the underflow happens? >> >> [1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/ >> >> Thanks >> /Ilias >> >> > > The logic should be safe in terms of the page pool itself as it should > be holding one reference to the page while the pp_frag_count is non- > zero. That one reference is what keeps the two halfs in sync as the > page shouldn't be able to be freed until we exhaust the pp_frag_count. > > To have an underflow there are two possible scenarios. One is that > either put_page or free_page is being called somewhere that the > page_pool freeing functions should be used. The other possibility is > that a pp_frag_count reference was taken somewhere a page reference > should have. > > Do we have a backtrace for the spots that are showing this underrun? If > nothing else we may want to look at tracking down the spots that are > freeing the page pool pages via put_page or free_page to determine what > paths these pages are taking. Here's an example of the kind of traces that I was seeing with v6.1: https://nbd.name/p/61a6617e On v5.15 I also occasionally got traces like this: https://nbd.name/p/0b9e4f0d From what I can tell, it also triggered the warning that shows up when page->pp_frag_count underflows. Unfortunately these traces don't directly point to the place where things go wrong. I do wonder if the pp_frag_count is maybe racy when we have a mix of get_page + page_pool_put_page calls. In case you're wondering what I was doing to trigger the crash: I simply create 4 wireless client mode interfaces on the same card and pushed TCP traffic from an AP to all 4 simultaneously. I can trigger it pretty much immediately after TCP traffic ramps up. - Felix
On 24.01.23 15:11, Ilias Apalodimas wrote: > Hi Felix, > > ++cc Alexander and Yunsheng. > > Thanks for the report > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: >> >> While testing fragmented page_pool allocation in the mt76 driver, I was able >> to reliably trigger page refcount underflow issues, which did not occur with >> full-page page_pool allocation. >> It appears to me, that handling refcounting in two separate counters >> (page->pp_frag_count and page refcount) is racy when page refcount gets >> incremented by code dealing with skb fragments directly, and >> page_pool_return_skb_page is called multiple times for the same fragment. >> >> Dropping page->pp_frag_count and relying entirely on the page refcount makes >> these underflow issues and crashes go away. >> > > This has been discussed here [1]. TL;DR changing this to page > refcount might blow up in other colorful ways. Can we look closer and > figure out why the underflow happens? I don't see how the approch taken in my patch would blow up. From what I can tell, it should be fairly close to how refcount is handled in page_frag_alloc. The main improvement it adds is to prevent it from blowing up if pool-allocated fragments get shared across multiple skbs with corresponding get_page and page_pool_return_skb_page calls. - Felix
On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote: > On 24.01.23 15:11, Ilias Apalodimas wrote: > > Hi Felix, > > > > ++cc Alexander and Yunsheng. > > > > Thanks for the report > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able > > > to reliably trigger page refcount underflow issues, which did not occur with > > > full-page page_pool allocation. > > > It appears to me, that handling refcounting in two separate counters > > > (page->pp_frag_count and page refcount) is racy when page refcount gets > > > incremented by code dealing with skb fragments directly, and > > > page_pool_return_skb_page is called multiple times for the same fragment. > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes > > > these underflow issues and crashes go away. > > > > > > > This has been discussed here [1]. TL;DR changing this to page > > refcount might blow up in other colorful ways. Can we look closer and > > figure out why the underflow happens? > I don't see how the approch taken in my patch would blow up. From what I > can tell, it should be fairly close to how refcount is handled in > page_frag_alloc. The main improvement it adds is to prevent it from > blowing up if pool-allocated fragments get shared across multiple skbs > with corresponding get_page and page_pool_return_skb_page calls. > > - Felix > Do you have the patch available to review as an RFC? From what I am seeing it looks like you are underrunning on the pp_frag_count itself. I would suspect the issue to be something like starting with a bad count in terms of the total number of references, or deducing the wrong amount when you finally free the page assuming you are tracking your frag count using a non-atomic value in the driver. Thanks, - Alex
On 24.01.23 22:10, Alexander H Duyck wrote: > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote: >> On 24.01.23 15:11, Ilias Apalodimas wrote: >> > Hi Felix, >> > >> > ++cc Alexander and Yunsheng. >> > >> > Thanks for the report >> > >> > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: >> > > >> > > While testing fragmented page_pool allocation in the mt76 driver, I was able >> > > to reliably trigger page refcount underflow issues, which did not occur with >> > > full-page page_pool allocation. >> > > It appears to me, that handling refcounting in two separate counters >> > > (page->pp_frag_count and page refcount) is racy when page refcount gets >> > > incremented by code dealing with skb fragments directly, and >> > > page_pool_return_skb_page is called multiple times for the same fragment. >> > > >> > > Dropping page->pp_frag_count and relying entirely on the page refcount makes >> > > these underflow issues and crashes go away. >> > > >> > >> > This has been discussed here [1]. TL;DR changing this to page >> > refcount might blow up in other colorful ways. Can we look closer and >> > figure out why the underflow happens? >> I don't see how the approch taken in my patch would blow up. From what I >> can tell, it should be fairly close to how refcount is handled in >> page_frag_alloc. The main improvement it adds is to prevent it from >> blowing up if pool-allocated fragments get shared across multiple skbs >> with corresponding get_page and page_pool_return_skb_page calls. >> >> - Felix >> > > Do you have the patch available to review as an RFC? From what I am > seeing it looks like you are underrunning on the pp_frag_count itself. > I would suspect the issue to be something like starting with a bad > count in terms of the total number of references, or deducing the wrong > amount when you finally free the page assuming you are tracking your > frag count using a non-atomic value in the driver. The driver patches for page pool are here: https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/ https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/ They are also applied in my mt76 tree at: https://github.com/nbd168/wireless - Felix
On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote: > On 24.01.23 22:10, Alexander H Duyck wrote: > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote: > > > On 24.01.23 15:11, Ilias Apalodimas wrote: > > > > Hi Felix, > > > > > > > > ++cc Alexander and Yunsheng. > > > > > > > > Thanks for the report > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able > > > > > to reliably trigger page refcount underflow issues, which did not occur with > > > > > full-page page_pool allocation. > > > > > It appears to me, that handling refcounting in two separate counters > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets > > > > > incremented by code dealing with skb fragments directly, and > > > > > page_pool_return_skb_page is called multiple times for the same fragment. > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes > > > > > these underflow issues and crashes go away. > > > > > > > > > > > > > This has been discussed here [1]. TL;DR changing this to page > > > > refcount might blow up in other colorful ways. Can we look closer and > > > > figure out why the underflow happens? > > > I don't see how the approch taken in my patch would blow up. From what I > > > can tell, it should be fairly close to how refcount is handled in > > > page_frag_alloc. The main improvement it adds is to prevent it from > > > blowing up if pool-allocated fragments get shared across multiple skbs > > > with corresponding get_page and page_pool_return_skb_page calls. > > > > > > - Felix > > > > > > > Do you have the patch available to review as an RFC? From what I am > > seeing it looks like you are underrunning on the pp_frag_count itself. > > I would suspect the issue to be something like starting with a bad > > count in terms of the total number of references, or deducing the wrong > > amount when you finally free the page assuming you are tracking your > > frag count using a non-atomic value in the driver. > The driver patches for page pool are here: > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/ > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/ > > They are also applied in my mt76 tree at: > https://github.com/nbd168/wireless > > - Felix So one thing I am thinking is that we may be seeing an issue where we are somehow getting a mix of frag and non-frag based page pool pages. That is the only case I can think of where we might be underflowing negative. If you could add some additional debug info on the underflow WARN_ON case in page_pool_defrag_page that might be useful. Specifically I would be curious what the actual return value is. I'm assuming we are only hitting negative 1, but I would want to verify we aren't seeing something else. Also just to confirm this is building on 64b kernel correct? Just want to make sure we don't have this running on a 32b setup where the frag count and the upper 32b of the DMA address are overlapped. As far as the patch set I only really see a few minor issues which I am going to post a few snippets below. > diff --git a/drivers/net/wireless/mediatek/mt76/dma.c > b/drivers/net/wireless/mediatek/mt76/dma.c > index 611769e445fa..7fd9aa9c3d9e 100644 ... > @@ -593,25 +593,28 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct > mt76_queue *q) > > while (q->queued < q->ndesc - 1) { > struct mt76_queue_buf qbuf; > - void *buf = NULL; > + dma_addr_t addr; > + int offset; > + void *buf; > > - buf = page_frag_alloc(&q->rx_page, q->buf_size, > GFP_ATOMIC); > + buf = mt76_get_page_pool_buf(q, &offset, q- > >buf_size); > if (!buf) > break; > > - addr = dma_map_single(dev->dma_dev, buf, len, > DMA_FROM_DEVICE); > + addr = dma_map_single(dev->dma_dev, buf + offset, > len, > + DMA_FROM_DEVICE); Offset was already added to buf in mt76_get_page_pool_buf so the DMA mapping offset doesn't look right to me. > if (unlikely(dma_mapping_error(dev->dma_dev, addr))) > { > - skb_free_frag(buf); > + mt76_put_page_pool_buf(buf, allow_direct); > break; > } > I'm not a fan of the defensive programming in mt76_put_page_pool_buf. If you are in an area that is using page pool you should be using the page pool version of the freeing operations instead of adding additional overhead that can mess things up by having it have to also check for if the page is a page pool page or not. > - qbuf.addr = addr + offset; > - qbuf.len = len - offset; > + qbuf.addr = addr + q->buf_offset; > + qbuf.len = len - q->buf_offset; > qbuf.skip_unmap = false; > if (mt76_dma_add_rx_buf(dev, q, &qbuf, buf) < 0) { > dma_unmap_single(dev->dma_dev, addr, len, > DMA_FROM_DEVICE); > - skb_free_frag(buf); > + mt76_put_page_pool_buf(buf, allow_direct); > break; > } > frames++; ... > @@ -848,6 +847,8 @@ mt76_dma_rx_process(struct mt76_dev *dev, struct > mt76_queue *q, int budget) > goto free_frag; > > skb_reserve(skb, q->buf_offset); > + if (mt76_is_page_from_pp(data)) > + skb_mark_for_recycle(skb); > > *(u32 *)skb->cb = info; > More defensive programming here. Is there a path that allows for a mixed setup? The only spot where I can see there being anything like that is in /drivers/net/wireless/mediatek/mt76/mt7915/mmio.c. But it doesn't make any sense to me as to why it was included in the patch. It might be easier to sort out the issue if we were to get rid of some of the defensive programming.
On 25.01.23 18:11, Alexander H Duyck wrote: > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote: >> On 24.01.23 22:10, Alexander H Duyck wrote: >> > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote: >> > > On 24.01.23 15:11, Ilias Apalodimas wrote: >> > > > Hi Felix, >> > > > >> > > > ++cc Alexander and Yunsheng. >> > > > >> > > > Thanks for the report >> > > > >> > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: >> > > > > >> > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able >> > > > > to reliably trigger page refcount underflow issues, which did not occur with >> > > > > full-page page_pool allocation. >> > > > > It appears to me, that handling refcounting in two separate counters >> > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets >> > > > > incremented by code dealing with skb fragments directly, and >> > > > > page_pool_return_skb_page is called multiple times for the same fragment. >> > > > > >> > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes >> > > > > these underflow issues and crashes go away. >> > > > > >> > > > >> > > > This has been discussed here [1]. TL;DR changing this to page >> > > > refcount might blow up in other colorful ways. Can we look closer and >> > > > figure out why the underflow happens? >> > > I don't see how the approch taken in my patch would blow up. From what I >> > > can tell, it should be fairly close to how refcount is handled in >> > > page_frag_alloc. The main improvement it adds is to prevent it from >> > > blowing up if pool-allocated fragments get shared across multiple skbs >> > > with corresponding get_page and page_pool_return_skb_page calls. >> > > >> > > - Felix >> > > >> > >> > Do you have the patch available to review as an RFC? From what I am >> > seeing it looks like you are underrunning on the pp_frag_count itself. >> > I would suspect the issue to be something like starting with a bad >> > count in terms of the total number of references, or deducing the wrong >> > amount when you finally free the page assuming you are tracking your >> > frag count using a non-atomic value in the driver. >> The driver patches for page pool are here: >> https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/ >> https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/ >> >> They are also applied in my mt76 tree at: >> https://github.com/nbd168/wireless >> >> - Felix > > So one thing I am thinking is that we may be seeing an issue where we > are somehow getting a mix of frag and non-frag based page pool pages. > That is the only case I can think of where we might be underflowing > negative. If you could add some additional debug info on the underflow > WARN_ON case in page_pool_defrag_page that might be useful. > Specifically I would be curious what the actual return value is. I'm > assuming we are only hitting negative 1, but I would want to verify we > aren't seeing something else. I'll try to run some more tests soon. However, I think I found the piece of code that is incompatible with using pp_frag_count. When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 packet), and it is not split by the hardware, a cfg80211 function extracts the individual MSDUs into separate skbs. In that case, a fragment can be shared across multiple skbs, and get_page is used to increase the refcount. You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and its helper functions). This code also has a bug where it doesn't set pp_recycle on the newly allocated skb if the previous one has it, but that's a separate matter and fixing it doesn't make the crash go away. Is there any way I can make that part of the code work with the current page pool frag implementation? > Also just to confirm this is building on 64b kernel correct? Just want > to make sure we don't have this running on a 32b setup where the frag > count and the upper 32b of the DMA address are overlapped. Yes, I'm using a 64b kernel. > As far as the patch set I only really see a few minor issues which I am > going to post a few snippets below. > > >> diff --git a/drivers/net/wireless/mediatek/mt76/dma.c >> b/drivers/net/wireless/mediatek/mt76/dma.c >> index 611769e445fa..7fd9aa9c3d9e 100644 > > ... > >> @@ -593,25 +593,28 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct >> mt76_queue *q) >> >> while (q->queued < q->ndesc - 1) { >> struct mt76_queue_buf qbuf; >> - void *buf = NULL; >> + dma_addr_t addr; >> + int offset; >> + void *buf; >> >> - buf = page_frag_alloc(&q->rx_page, q->buf_size, >> GFP_ATOMIC); >> + buf = mt76_get_page_pool_buf(q, &offset, q- >> >buf_size); >> if (!buf) >> break; >> >> - addr = dma_map_single(dev->dma_dev, buf, len, >> DMA_FROM_DEVICE); >> + addr = dma_map_single(dev->dma_dev, buf + offset, >> len, >> + DMA_FROM_DEVICE); > > Offset was already added to buf in mt76_get_page_pool_buf so the DMA > mapping offset doesn't look right to me. Right. This is resolved by the follow-up patch which keeps pages DMA mapped. I plan on squashing both patches into one and adding some fixes on top when the underlying page pool issue is resolved. >> if (unlikely(dma_mapping_error(dev->dma_dev, addr))) >> { >> - skb_free_frag(buf); >> + mt76_put_page_pool_buf(buf, allow_direct); >> break; >> } >> > > I'm not a fan of the defensive programming in mt76_put_page_pool_buf. > If you are in an area that is using page pool you should be using the > page pool version of the freeing operations instead of adding > additional overhead that can mess things up by having it have to also > check for if the page is a page pool page or not. See below. >> - qbuf.addr = addr + offset; >> - qbuf.len = len - offset; >> + qbuf.addr = addr + q->buf_offset; >> + qbuf.len = len - q->buf_offset; >> qbuf.skip_unmap = false; >> if (mt76_dma_add_rx_buf(dev, q, &qbuf, buf) < 0) { >> dma_unmap_single(dev->dma_dev, addr, len, >> DMA_FROM_DEVICE); >> - skb_free_frag(buf); >> + mt76_put_page_pool_buf(buf, allow_direct); >> break; >> } >> frames++; > > ... > >> @@ -848,6 +847,8 @@ mt76_dma_rx_process(struct mt76_dev *dev, struct >> mt76_queue *q, int budget) >> goto free_frag; >> >> skb_reserve(skb, q->buf_offset); >> + if (mt76_is_page_from_pp(data)) >> + skb_mark_for_recycle(skb); >> >> *(u32 *)skb->cb = info; >> > > More defensive programming here. Is there a path that allows for a > mixed setup? > > The only spot where I can see there being anything like that is in > /drivers/net/wireless/mediatek/mt76/mt7915/mmio.c. But it doesn't make > any sense to me as to why it was included in the patch. It might be > easier to sort out the issue if we were to get rid of some of the > defensive programming. This is not defensive programming. In its current state, there is a scenario where we can have a mix of pp and non-pp pages (if hardware offload support is enabled). However in my tests, offload support was disabled and all pages are PP ones. I also have some unpublished pending changes to always allocate from the pool (even for the initial buffers allocated for offloading). This did not make a difference in my tests though. - Felix
On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote: > On 25.01.23 18:11, Alexander H Duyck wrote: > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote: > > > On 24.01.23 22:10, Alexander H Duyck wrote: > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote: > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote: > > > > > > Hi Felix, > > > > > > > > > > > > ++cc Alexander and Yunsheng. > > > > > > > > > > > > Thanks for the report > > > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: > > > > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with > > > > > > > full-page page_pool allocation. > > > > > > > It appears to me, that handling refcounting in two separate counters > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets > > > > > > > incremented by code dealing with skb fragments directly, and > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment. > > > > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes > > > > > > > these underflow issues and crashes go away. > > > > > > > > > > > > > > > > > > > This has been discussed here [1]. TL;DR changing this to page > > > > > > refcount might blow up in other colorful ways. Can we look closer and > > > > > > figure out why the underflow happens? > > > > > I don't see how the approch taken in my patch would blow up. From what I > > > > > can tell, it should be fairly close to how refcount is handled in > > > > > page_frag_alloc. The main improvement it adds is to prevent it from > > > > > blowing up if pool-allocated fragments get shared across multiple skbs > > > > > with corresponding get_page and page_pool_return_skb_page calls. > > > > > > > > > > - Felix > > > > > > > > > > > > > Do you have the patch available to review as an RFC? From what I am > > > > seeing it looks like you are underrunning on the pp_frag_count itself. > > > > I would suspect the issue to be something like starting with a bad > > > > count in terms of the total number of references, or deducing the wrong > > > > amount when you finally free the page assuming you are tracking your > > > > frag count using a non-atomic value in the driver. > > > The driver patches for page pool are here: > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/ > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/ > > > > > > They are also applied in my mt76 tree at: > > > https://github.com/nbd168/wireless > > > > > > - Felix > > > > So one thing I am thinking is that we may be seeing an issue where we > > are somehow getting a mix of frag and non-frag based page pool pages. > > That is the only case I can think of where we might be underflowing > > negative. If you could add some additional debug info on the underflow > > WARN_ON case in page_pool_defrag_page that might be useful. > > Specifically I would be curious what the actual return value is. I'm > > assuming we are only hitting negative 1, but I would want to verify we > > aren't seeing something else. > I'll try to run some more tests soon. However, I think I found the piece > of code that is incompatible with using pp_frag_count. > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 > packet), and it is not split by the hardware, a cfg80211 function > extracts the individual MSDUs into separate skbs. In that case, a > fragment can be shared across multiple skbs, and get_page is used to > increase the refcount. > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and > its helper functions). I'm not sure if it is problematic or not. Basically it is trading off by copying over the frags, calling get_page on each frag, and then using dev_kfree_skb to disassemble and release the pp_frag references. There should be other paths in the kernel that are doing something similar. > This code also has a bug where it doesn't set pp_recycle on the newly > allocated skb if the previous one has it, but that's a separate matter > and fixing it doesn't make the crash go away. Adding the recycle would cause this bug. So one thing we might be seeing is something like that triggering this error. Specifically if the page is taken via get_page when assembling the new skb then we cannot set the recycle flag in the new skb otherwise it will result in the reference undercount we are seeing. What we are doing is shifting the references away from the pp_frag_count to the page reference count in this case. If we set the pp_recycle flag then it would cause us to decrement pp_frag_count instead of the page reference count resulting in the underrun. > Is there any way I can make that part of the code work with the current > page pool frag implementation? The current code should work. Basically as long as the references are taken w/ get_page and skb->pp_recycle is not set then we shouldn't run into this issue because the pp_frag_count will be dropped when the original skb is freed and the page reference count will be decremented when the new one is freed. For page pool page fragments the main thing to keep in mind is that if pp_recycle is set it will update the pp_frag_count and if it is not then it will just decrement the page reference count.
On 25.01.23 19:26, Alexander H Duyck wrote: > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote: >> On 25.01.23 18:11, Alexander H Duyck wrote: >> > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote: >> > > On 24.01.23 22:10, Alexander H Duyck wrote: >> > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote: >> > > > > On 24.01.23 15:11, Ilias Apalodimas wrote: >> > > > > > Hi Felix, >> > > > > > >> > > > > > ++cc Alexander and Yunsheng. >> > > > > > >> > > > > > Thanks for the report >> > > > > > >> > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: >> > > > > > > >> > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able >> > > > > > > to reliably trigger page refcount underflow issues, which did not occur with >> > > > > > > full-page page_pool allocation. >> > > > > > > It appears to me, that handling refcounting in two separate counters >> > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets >> > > > > > > incremented by code dealing with skb fragments directly, and >> > > > > > > page_pool_return_skb_page is called multiple times for the same fragment. >> > > > > > > >> > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes >> > > > > > > these underflow issues and crashes go away. >> > > > > > > >> > > > > > >> > > > > > This has been discussed here [1]. TL;DR changing this to page >> > > > > > refcount might blow up in other colorful ways. Can we look closer and >> > > > > > figure out why the underflow happens? >> > > > > I don't see how the approch taken in my patch would blow up. From what I >> > > > > can tell, it should be fairly close to how refcount is handled in >> > > > > page_frag_alloc. The main improvement it adds is to prevent it from >> > > > > blowing up if pool-allocated fragments get shared across multiple skbs >> > > > > with corresponding get_page and page_pool_return_skb_page calls. >> > > > > >> > > > > - Felix >> > > > > >> > > > >> > > > Do you have the patch available to review as an RFC? From what I am >> > > > seeing it looks like you are underrunning on the pp_frag_count itself. >> > > > I would suspect the issue to be something like starting with a bad >> > > > count in terms of the total number of references, or deducing the wrong >> > > > amount when you finally free the page assuming you are tracking your >> > > > frag count using a non-atomic value in the driver. >> > > The driver patches for page pool are here: >> > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/ >> > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/ >> > > >> > > They are also applied in my mt76 tree at: >> > > https://github.com/nbd168/wireless >> > > >> > > - Felix >> > >> > So one thing I am thinking is that we may be seeing an issue where we >> > are somehow getting a mix of frag and non-frag based page pool pages. >> > That is the only case I can think of where we might be underflowing >> > negative. If you could add some additional debug info on the underflow >> > WARN_ON case in page_pool_defrag_page that might be useful. >> > Specifically I would be curious what the actual return value is. I'm >> > assuming we are only hitting negative 1, but I would want to verify we >> > aren't seeing something else. >> I'll try to run some more tests soon. However, I think I found the piece >> of code that is incompatible with using pp_frag_count. >> When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 >> packet), and it is not split by the hardware, a cfg80211 function >> extracts the individual MSDUs into separate skbs. In that case, a >> fragment can be shared across multiple skbs, and get_page is used to >> increase the refcount. >> You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and >> its helper functions). > > I'm not sure if it is problematic or not. Basically it is trading off > by copying over the frags, calling get_page on each frag, and then > using dev_kfree_skb to disassemble and release the pp_frag references. > There should be other paths in the kernel that are doing something > similar. > >> This code also has a bug where it doesn't set pp_recycle on the newly >> allocated skb if the previous one has it, but that's a separate matter >> and fixing it doesn't make the crash go away. > > Adding the recycle would cause this bug. So one thing we might be > seeing is something like that triggering this error. Specifically if > the page is taken via get_page when assembling the new skb then we > cannot set the recycle flag in the new skb otherwise it will result in > the reference undercount we are seeing. What we are doing is shifting > the references away from the pp_frag_count to the page reference count > in this case. If we set the pp_recycle flag then it would cause us to > decrement pp_frag_count instead of the page reference count resulting > in the underrun. Couldn't leaving out the pp_recycle flag potentially lead to a case where the last user of the page drops it via page_frag_free instead of page_pool_return_skb_page? Is that valid? >> Is there any way I can make that part of the code work with the current >> page pool frag implementation? > > The current code should work. Basically as long as the references are > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run > into this issue because the pp_frag_count will be dropped when the > original skb is freed and the page reference count will be decremented > when the new one is freed. > > For page pool page fragments the main thing to keep in mind is that if > pp_recycle is set it will update the pp_frag_count and if it is not > then it will just decrement the page reference count. What takes care of DMA unmap and other cleanup if the last reference to the page is dropped via page_frag_free? - Felix
On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote: > On 25.01.23 19:26, Alexander H Duyck wrote: > > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote: > > > On 25.01.23 18:11, Alexander H Duyck wrote: > > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote: > > > > > On 24.01.23 22:10, Alexander H Duyck wrote: > > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote: > > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote: > > > > > > > > Hi Felix, > > > > > > > > > > > > > > > > ++cc Alexander and Yunsheng. > > > > > > > > > > > > > > > > Thanks for the report > > > > > > > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: > > > > > > > > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able > > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with > > > > > > > > > full-page page_pool allocation. > > > > > > > > > It appears to me, that handling refcounting in two separate counters > > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets > > > > > > > > > incremented by code dealing with skb fragments directly, and > > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment. > > > > > > > > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes > > > > > > > > > these underflow issues and crashes go away. > > > > > > > > > > > > > > > > > > > > > > > > > This has been discussed here [1]. TL;DR changing this to page > > > > > > > > refcount might blow up in other colorful ways. Can we look closer and > > > > > > > > figure out why the underflow happens? > > > > > > > I don't see how the approch taken in my patch would blow up. From what I > > > > > > > can tell, it should be fairly close to how refcount is handled in > > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from > > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs > > > > > > > with corresponding get_page and page_pool_return_skb_page calls. > > > > > > > > > > > > > > - Felix > > > > > > > > > > > > > > > > > > > Do you have the patch available to review as an RFC? From what I am > > > > > > seeing it looks like you are underrunning on the pp_frag_count itself. > > > > > > I would suspect the issue to be something like starting with a bad > > > > > > count in terms of the total number of references, or deducing the wrong > > > > > > amount when you finally free the page assuming you are tracking your > > > > > > frag count using a non-atomic value in the driver. > > > > > The driver patches for page pool are here: > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/ > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/ > > > > > > > > > > They are also applied in my mt76 tree at: > > > > > https://github.com/nbd168/wireless > > > > > > > > > > - Felix > > > > > > > > So one thing I am thinking is that we may be seeing an issue where we > > > > are somehow getting a mix of frag and non-frag based page pool pages. > > > > That is the only case I can think of where we might be underflowing > > > > negative. If you could add some additional debug info on the underflow > > > > WARN_ON case in page_pool_defrag_page that might be useful. > > > > Specifically I would be curious what the actual return value is. I'm > > > > assuming we are only hitting negative 1, but I would want to verify we > > > > aren't seeing something else. > > > I'll try to run some more tests soon. However, I think I found the piece > > > of code that is incompatible with using pp_frag_count. > > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 > > > packet), and it is not split by the hardware, a cfg80211 function > > > extracts the individual MSDUs into separate skbs. In that case, a > > > fragment can be shared across multiple skbs, and get_page is used to > > > increase the refcount. > > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and > > > its helper functions). > > > > I'm not sure if it is problematic or not. Basically it is trading off > > by copying over the frags, calling get_page on each frag, and then > > using dev_kfree_skb to disassemble and release the pp_frag references. > > There should be other paths in the kernel that are doing something > > similar. > > > > > This code also has a bug where it doesn't set pp_recycle on the newly > > > allocated skb if the previous one has it, but that's a separate matter > > > and fixing it doesn't make the crash go away. > > > > Adding the recycle would cause this bug. So one thing we might be > > seeing is something like that triggering this error. Specifically if > > the page is taken via get_page when assembling the new skb then we > > cannot set the recycle flag in the new skb otherwise it will result in > > the reference undercount we are seeing. What we are doing is shifting > > the references away from the pp_frag_count to the page reference count > > in this case. If we set the pp_recycle flag then it would cause us to > > decrement pp_frag_count instead of the page reference count resulting > > in the underrun. > Couldn't leaving out the pp_recycle flag potentially lead to a case > where the last user of the page drops it via page_frag_free instead of > page_pool_return_skb_page? Is that valid? No. What will happen is that when the pp_frag_count is exhausted the page will be unmapped and evicted from the page pool. When the page is then finally freed it will end up going back to the page allocator instead of page pool. Basically the idea is that until pp_frag_count reaches 0 there will be at least 1 page reference held. > > > Is there any way I can make that part of the code work with the current > > > page pool frag implementation? > > > > The current code should work. Basically as long as the references are > > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run > > into this issue because the pp_frag_count will be dropped when the > > original skb is freed and the page reference count will be decremented > > when the new one is freed. > > > > For page pool page fragments the main thing to keep in mind is that if > > pp_recycle is set it will update the pp_frag_count and if it is not > > then it will just decrement the page reference count. > What takes care of DMA unmap and other cleanup if the last reference to > the page is dropped via page_frag_free? > > - Felix When the page is freed on the skb w/ pp_recycle set it will unmap the page and evict it from the page pool. Basically in these cases the page goes from the page pool back to the page allocator. The general idea with this is that if we are using fragments that there will be enough of them floating around that if one or two frags have a temporeary detour through a non-recycling path that hopefully by the time the last fragment is freed the other instances holding the additional page reference will have let them go. If not then the page will go back to the page allocator and it will have to be replaced in the page pool.
On 25.01.23 20:02, Alexander H Duyck wrote: > On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote: >> On 25.01.23 19:26, Alexander H Duyck wrote: >> > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote: >> > > On 25.01.23 18:11, Alexander H Duyck wrote: >> > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote: >> > > > > On 24.01.23 22:10, Alexander H Duyck wrote: >> > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote: >> > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote: >> > > > > > > > Hi Felix, >> > > > > > > > >> > > > > > > > ++cc Alexander and Yunsheng. >> > > > > > > > >> > > > > > > > Thanks for the report >> > > > > > > > >> > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: >> > > > > > > > > >> > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able >> > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with >> > > > > > > > > full-page page_pool allocation. >> > > > > > > > > It appears to me, that handling refcounting in two separate counters >> > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets >> > > > > > > > > incremented by code dealing with skb fragments directly, and >> > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment. >> > > > > > > > > >> > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes >> > > > > > > > > these underflow issues and crashes go away. >> > > > > > > > > >> > > > > > > > >> > > > > > > > This has been discussed here [1]. TL;DR changing this to page >> > > > > > > > refcount might blow up in other colorful ways. Can we look closer and >> > > > > > > > figure out why the underflow happens? >> > > > > > > I don't see how the approch taken in my patch would blow up. From what I >> > > > > > > can tell, it should be fairly close to how refcount is handled in >> > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from >> > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs >> > > > > > > with corresponding get_page and page_pool_return_skb_page calls. >> > > > > > > >> > > > > > > - Felix >> > > > > > > >> > > > > > >> > > > > > Do you have the patch available to review as an RFC? From what I am >> > > > > > seeing it looks like you are underrunning on the pp_frag_count itself. >> > > > > > I would suspect the issue to be something like starting with a bad >> > > > > > count in terms of the total number of references, or deducing the wrong >> > > > > > amount when you finally free the page assuming you are tracking your >> > > > > > frag count using a non-atomic value in the driver. >> > > > > The driver patches for page pool are here: >> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/ >> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/ >> > > > > >> > > > > They are also applied in my mt76 tree at: >> > > > > https://github.com/nbd168/wireless >> > > > > >> > > > > - Felix >> > > > >> > > > So one thing I am thinking is that we may be seeing an issue where we >> > > > are somehow getting a mix of frag and non-frag based page pool pages. >> > > > That is the only case I can think of where we might be underflowing >> > > > negative. If you could add some additional debug info on the underflow >> > > > WARN_ON case in page_pool_defrag_page that might be useful. >> > > > Specifically I would be curious what the actual return value is. I'm >> > > > assuming we are only hitting negative 1, but I would want to verify we >> > > > aren't seeing something else. >> > > I'll try to run some more tests soon. However, I think I found the piece >> > > of code that is incompatible with using pp_frag_count. >> > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 >> > > packet), and it is not split by the hardware, a cfg80211 function >> > > extracts the individual MSDUs into separate skbs. In that case, a >> > > fragment can be shared across multiple skbs, and get_page is used to >> > > increase the refcount. >> > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and >> > > its helper functions). >> > >> > I'm not sure if it is problematic or not. Basically it is trading off >> > by copying over the frags, calling get_page on each frag, and then >> > using dev_kfree_skb to disassemble and release the pp_frag references. >> > There should be other paths in the kernel that are doing something >> > similar. >> > >> > > This code also has a bug where it doesn't set pp_recycle on the newly >> > > allocated skb if the previous one has it, but that's a separate matter >> > > and fixing it doesn't make the crash go away. >> > >> > Adding the recycle would cause this bug. So one thing we might be >> > seeing is something like that triggering this error. Specifically if >> > the page is taken via get_page when assembling the new skb then we >> > cannot set the recycle flag in the new skb otherwise it will result in >> > the reference undercount we are seeing. What we are doing is shifting >> > the references away from the pp_frag_count to the page reference count >> > in this case. If we set the pp_recycle flag then it would cause us to >> > decrement pp_frag_count instead of the page reference count resulting >> > in the underrun. >> Couldn't leaving out the pp_recycle flag potentially lead to a case >> where the last user of the page drops it via page_frag_free instead of >> page_pool_return_skb_page? Is that valid? > > No. What will happen is that when the pp_frag_count is exhausted the > page will be unmapped and evicted from the page pool. When the page is > then finally freed it will end up going back to the page allocator > instead of page pool. > > Basically the idea is that until pp_frag_count reaches 0 there will be > at least 1 page reference held. > >> > > Is there any way I can make that part of the code work with the current >> > > page pool frag implementation? >> > >> > The current code should work. Basically as long as the references are >> > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run >> > into this issue because the pp_frag_count will be dropped when the >> > original skb is freed and the page reference count will be decremented >> > when the new one is freed. >> > >> > For page pool page fragments the main thing to keep in mind is that if >> > pp_recycle is set it will update the pp_frag_count and if it is not >> > then it will just decrement the page reference count. >> What takes care of DMA unmap and other cleanup if the last reference to >> the page is dropped via page_frag_free? >> >> - Felix > > When the page is freed on the skb w/ pp_recycle set it will unmap the > page and evict it from the page pool. Basically in these cases the page > goes from the page pool back to the page allocator. > > The general idea with this is that if we are using fragments that there > will be enough of them floating around that if one or two frags have a > temporeary detour through a non-recycling path that hopefully by the > time the last fragment is freed the other instances holding the > additional page reference will have let them go. If not then the page > will go back to the page allocator and it will have to be replaced in > the page pool. Thanks for the explanation, it makes sense to me now. Unfortunately it also means that I have no idea what could cause this issue. I will finish my mt76 patch rework which gets rid of the pp vs non-pp allocation mix and re-run my tests to provide updated traces. - Felix
On 25.01.23 20:10, Felix Fietkau wrote: > On 25.01.23 20:02, Alexander H Duyck wrote: >> On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote: >>> On 25.01.23 19:26, Alexander H Duyck wrote: >>> > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote: >>> > > On 25.01.23 18:11, Alexander H Duyck wrote: >>> > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote: >>> > > > > On 24.01.23 22:10, Alexander H Duyck wrote: >>> > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote: >>> > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote: >>> > > > > > > > Hi Felix, >>> > > > > > > > >>> > > > > > > > ++cc Alexander and Yunsheng. >>> > > > > > > > >>> > > > > > > > Thanks for the report >>> > > > > > > > >>> > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: >>> > > > > > > > > >>> > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able >>> > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with >>> > > > > > > > > full-page page_pool allocation. >>> > > > > > > > > It appears to me, that handling refcounting in two separate counters >>> > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets >>> > > > > > > > > incremented by code dealing with skb fragments directly, and >>> > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment. >>> > > > > > > > > >>> > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes >>> > > > > > > > > these underflow issues and crashes go away. >>> > > > > > > > > >>> > > > > > > > >>> > > > > > > > This has been discussed here [1]. TL;DR changing this to page >>> > > > > > > > refcount might blow up in other colorful ways. Can we look closer and >>> > > > > > > > figure out why the underflow happens? >>> > > > > > > I don't see how the approch taken in my patch would blow up. From what I >>> > > > > > > can tell, it should be fairly close to how refcount is handled in >>> > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from >>> > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs >>> > > > > > > with corresponding get_page and page_pool_return_skb_page calls. >>> > > > > > > >>> > > > > > > - Felix >>> > > > > > > >>> > > > > > >>> > > > > > Do you have the patch available to review as an RFC? From what I am >>> > > > > > seeing it looks like you are underrunning on the pp_frag_count itself. >>> > > > > > I would suspect the issue to be something like starting with a bad >>> > > > > > count in terms of the total number of references, or deducing the wrong >>> > > > > > amount when you finally free the page assuming you are tracking your >>> > > > > > frag count using a non-atomic value in the driver. >>> > > > > The driver patches for page pool are here: >>> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/ >>> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/ >>> > > > > >>> > > > > They are also applied in my mt76 tree at: >>> > > > > https://github.com/nbd168/wireless >>> > > > > >>> > > > > - Felix >>> > > > >>> > > > So one thing I am thinking is that we may be seeing an issue where we >>> > > > are somehow getting a mix of frag and non-frag based page pool pages. >>> > > > That is the only case I can think of where we might be underflowing >>> > > > negative. If you could add some additional debug info on the underflow >>> > > > WARN_ON case in page_pool_defrag_page that might be useful. >>> > > > Specifically I would be curious what the actual return value is. I'm >>> > > > assuming we are only hitting negative 1, but I would want to verify we >>> > > > aren't seeing something else. >>> > > I'll try to run some more tests soon. However, I think I found the piece >>> > > of code that is incompatible with using pp_frag_count. >>> > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 >>> > > packet), and it is not split by the hardware, a cfg80211 function >>> > > extracts the individual MSDUs into separate skbs. In that case, a >>> > > fragment can be shared across multiple skbs, and get_page is used to >>> > > increase the refcount. >>> > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and >>> > > its helper functions). >>> > >>> > I'm not sure if it is problematic or not. Basically it is trading off >>> > by copying over the frags, calling get_page on each frag, and then >>> > using dev_kfree_skb to disassemble and release the pp_frag references. >>> > There should be other paths in the kernel that are doing something >>> > similar. >>> > >>> > > This code also has a bug where it doesn't set pp_recycle on the newly >>> > > allocated skb if the previous one has it, but that's a separate matter >>> > > and fixing it doesn't make the crash go away. >>> > >>> > Adding the recycle would cause this bug. So one thing we might be >>> > seeing is something like that triggering this error. Specifically if >>> > the page is taken via get_page when assembling the new skb then we >>> > cannot set the recycle flag in the new skb otherwise it will result in >>> > the reference undercount we are seeing. What we are doing is shifting >>> > the references away from the pp_frag_count to the page reference count >>> > in this case. If we set the pp_recycle flag then it would cause us to >>> > decrement pp_frag_count instead of the page reference count resulting >>> > in the underrun. >>> Couldn't leaving out the pp_recycle flag potentially lead to a case >>> where the last user of the page drops it via page_frag_free instead of >>> page_pool_return_skb_page? Is that valid? >> >> No. What will happen is that when the pp_frag_count is exhausted the >> page will be unmapped and evicted from the page pool. When the page is >> then finally freed it will end up going back to the page allocator >> instead of page pool. >> >> Basically the idea is that until pp_frag_count reaches 0 there will be >> at least 1 page reference held. >> >>> > > Is there any way I can make that part of the code work with the current >>> > > page pool frag implementation? >>> > >>> > The current code should work. Basically as long as the references are >>> > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run >>> > into this issue because the pp_frag_count will be dropped when the >>> > original skb is freed and the page reference count will be decremented >>> > when the new one is freed. >>> > >>> > For page pool page fragments the main thing to keep in mind is that if >>> > pp_recycle is set it will update the pp_frag_count and if it is not >>> > then it will just decrement the page reference count. >>> What takes care of DMA unmap and other cleanup if the last reference to >>> the page is dropped via page_frag_free? >>> >>> - Felix >> >> When the page is freed on the skb w/ pp_recycle set it will unmap the >> page and evict it from the page pool. Basically in these cases the page >> goes from the page pool back to the page allocator. >> >> The general idea with this is that if we are using fragments that there >> will be enough of them floating around that if one or two frags have a >> temporeary detour through a non-recycling path that hopefully by the >> time the last fragment is freed the other instances holding the >> additional page reference will have let them go. If not then the page >> will go back to the page allocator and it will have to be replaced in >> the page pool. > Thanks for the explanation, it makes sense to me now. Unfortunately it > also means that I have no idea what could cause this issue. I will > finish my mt76 patch rework which gets rid of the pp vs non-pp > allocation mix and re-run my tests to provide updated traces. Here's the updated mt76 page pool support commit: https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808 And here is the trace that I'm getting with 6.1: https://nbd.name/p/a16957f2 If you have any debug patch you'd like me to test, please let me know. - Felix
On 25.01.23 20:40, Felix Fietkau wrote: > On 25.01.23 20:10, Felix Fietkau wrote: >> On 25.01.23 20:02, Alexander H Duyck wrote: >>> On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote: >>>> On 25.01.23 19:26, Alexander H Duyck wrote: >>>> > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote: >>>> > > On 25.01.23 18:11, Alexander H Duyck wrote: >>>> > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote: >>>> > > > > On 24.01.23 22:10, Alexander H Duyck wrote: >>>> > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote: >>>> > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote: >>>> > > > > > > > Hi Felix, >>>> > > > > > > > >>>> > > > > > > > ++cc Alexander and Yunsheng. >>>> > > > > > > > >>>> > > > > > > > Thanks for the report >>>> > > > > > > > >>>> > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: >>>> > > > > > > > > >>>> > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able >>>> > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with >>>> > > > > > > > > full-page page_pool allocation. >>>> > > > > > > > > It appears to me, that handling refcounting in two separate counters >>>> > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets >>>> > > > > > > > > incremented by code dealing with skb fragments directly, and >>>> > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment. >>>> > > > > > > > > >>>> > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes >>>> > > > > > > > > these underflow issues and crashes go away. >>>> > > > > > > > > >>>> > > > > > > > >>>> > > > > > > > This has been discussed here [1]. TL;DR changing this to page >>>> > > > > > > > refcount might blow up in other colorful ways. Can we look closer and >>>> > > > > > > > figure out why the underflow happens? >>>> > > > > > > I don't see how the approch taken in my patch would blow up. From what I >>>> > > > > > > can tell, it should be fairly close to how refcount is handled in >>>> > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from >>>> > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs >>>> > > > > > > with corresponding get_page and page_pool_return_skb_page calls. >>>> > > > > > > >>>> > > > > > > - Felix >>>> > > > > > > >>>> > > > > > >>>> > > > > > Do you have the patch available to review as an RFC? From what I am >>>> > > > > > seeing it looks like you are underrunning on the pp_frag_count itself. >>>> > > > > > I would suspect the issue to be something like starting with a bad >>>> > > > > > count in terms of the total number of references, or deducing the wrong >>>> > > > > > amount when you finally free the page assuming you are tracking your >>>> > > > > > frag count using a non-atomic value in the driver. >>>> > > > > The driver patches for page pool are here: >>>> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/ >>>> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/ >>>> > > > > >>>> > > > > They are also applied in my mt76 tree at: >>>> > > > > https://github.com/nbd168/wireless >>>> > > > > >>>> > > > > - Felix >>>> > > > >>>> > > > So one thing I am thinking is that we may be seeing an issue where we >>>> > > > are somehow getting a mix of frag and non-frag based page pool pages. >>>> > > > That is the only case I can think of where we might be underflowing >>>> > > > negative. If you could add some additional debug info on the underflow >>>> > > > WARN_ON case in page_pool_defrag_page that might be useful. >>>> > > > Specifically I would be curious what the actual return value is. I'm >>>> > > > assuming we are only hitting negative 1, but I would want to verify we >>>> > > > aren't seeing something else. >>>> > > I'll try to run some more tests soon. However, I think I found the piece >>>> > > of code that is incompatible with using pp_frag_count. >>>> > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 >>>> > > packet), and it is not split by the hardware, a cfg80211 function >>>> > > extracts the individual MSDUs into separate skbs. In that case, a >>>> > > fragment can be shared across multiple skbs, and get_page is used to >>>> > > increase the refcount. >>>> > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and >>>> > > its helper functions). >>>> > >>>> > I'm not sure if it is problematic or not. Basically it is trading off >>>> > by copying over the frags, calling get_page on each frag, and then >>>> > using dev_kfree_skb to disassemble and release the pp_frag references. >>>> > There should be other paths in the kernel that are doing something >>>> > similar. >>>> > >>>> > > This code also has a bug where it doesn't set pp_recycle on the newly >>>> > > allocated skb if the previous one has it, but that's a separate matter >>>> > > and fixing it doesn't make the crash go away. >>>> > >>>> > Adding the recycle would cause this bug. So one thing we might be >>>> > seeing is something like that triggering this error. Specifically if >>>> > the page is taken via get_page when assembling the new skb then we >>>> > cannot set the recycle flag in the new skb otherwise it will result in >>>> > the reference undercount we are seeing. What we are doing is shifting >>>> > the references away from the pp_frag_count to the page reference count >>>> > in this case. If we set the pp_recycle flag then it would cause us to >>>> > decrement pp_frag_count instead of the page reference count resulting >>>> > in the underrun. >>>> Couldn't leaving out the pp_recycle flag potentially lead to a case >>>> where the last user of the page drops it via page_frag_free instead of >>>> page_pool_return_skb_page? Is that valid? >>> >>> No. What will happen is that when the pp_frag_count is exhausted the >>> page will be unmapped and evicted from the page pool. When the page is >>> then finally freed it will end up going back to the page allocator >>> instead of page pool. >>> >>> Basically the idea is that until pp_frag_count reaches 0 there will be >>> at least 1 page reference held. >>> >>>> > > Is there any way I can make that part of the code work with the current >>>> > > page pool frag implementation? >>>> > >>>> > The current code should work. Basically as long as the references are >>>> > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run >>>> > into this issue because the pp_frag_count will be dropped when the >>>> > original skb is freed and the page reference count will be decremented >>>> > when the new one is freed. >>>> > >>>> > For page pool page fragments the main thing to keep in mind is that if >>>> > pp_recycle is set it will update the pp_frag_count and if it is not >>>> > then it will just decrement the page reference count. >>>> What takes care of DMA unmap and other cleanup if the last reference to >>>> the page is dropped via page_frag_free? >>>> >>>> - Felix >>> >>> When the page is freed on the skb w/ pp_recycle set it will unmap the >>> page and evict it from the page pool. Basically in these cases the page >>> goes from the page pool back to the page allocator. >>> >>> The general idea with this is that if we are using fragments that there >>> will be enough of them floating around that if one or two frags have a >>> temporeary detour through a non-recycling path that hopefully by the >>> time the last fragment is freed the other instances holding the >>> additional page reference will have let them go. If not then the page >>> will go back to the page allocator and it will have to be replaced in >>> the page pool. >> Thanks for the explanation, it makes sense to me now. Unfortunately it >> also means that I have no idea what could cause this issue. I will >> finish my mt76 patch rework which gets rid of the pp vs non-pp >> allocation mix and re-run my tests to provide updated traces. > Here's the updated mt76 page pool support commit: > https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808 > > And here is the trace that I'm getting with 6.1: > https://nbd.name/p/a16957f2 > > If you have any debug patch you'd like me to test, please let me know. To answer your earlier question: When pp_frag_count goes below zero, it's at -1 as suspected. Here are some more completely different traces that I got during other test runs. I hope they provide some kind of clue: https://nbd.name/p/8e46b6eb - Felix
On Wed, 2023-01-25 at 20:40 +0100, Felix Fietkau wrote: > On 25.01.23 20:10, Felix Fietkau wrote: > > On 25.01.23 20:02, Alexander H Duyck wrote: > > > On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote: > > > > On 25.01.23 19:26, Alexander H Duyck wrote: > > > > > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote: > > > > > > On 25.01.23 18:11, Alexander H Duyck wrote: > > > > > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote: > > > > > > > > On 24.01.23 22:10, Alexander H Duyck wrote: > > > > > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote: > > > > > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote: > > > > > > > > > > > Hi Felix, > > > > > > > > > > > > > > > > > > > > > > ++cc Alexander and Yunsheng. > > > > > > > > > > > > > > > > > > > > > > Thanks for the report > > > > > > > > > > > > > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: > > > > > > > > > > > > > > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able > > > > > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with > > > > > > > > > > > > full-page page_pool allocation. > > > > > > > > > > > > It appears to me, that handling refcounting in two separate counters > > > > > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets > > > > > > > > > > > > incremented by code dealing with skb fragments directly, and > > > > > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment. > > > > > > > > > > > > > > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes > > > > > > > > > > > > these underflow issues and crashes go away. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > This has been discussed here [1]. TL;DR changing this to page > > > > > > > > > > > refcount might blow up in other colorful ways. Can we look closer and > > > > > > > > > > > figure out why the underflow happens? > > > > > > > > > > I don't see how the approch taken in my patch would blow up. From what I > > > > > > > > > > can tell, it should be fairly close to how refcount is handled in > > > > > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from > > > > > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs > > > > > > > > > > with corresponding get_page and page_pool_return_skb_page calls. > > > > > > > > > > > > > > > > > > > > - Felix > > > > > > > > > > > > > > > > > > > > > > > > > > > > Do you have the patch available to review as an RFC? From what I am > > > > > > > > > seeing it looks like you are underrunning on the pp_frag_count itself. > > > > > > > > > I would suspect the issue to be something like starting with a bad > > > > > > > > > count in terms of the total number of references, or deducing the wrong > > > > > > > > > amount when you finally free the page assuming you are tracking your > > > > > > > > > frag count using a non-atomic value in the driver. > > > > > > > > The driver patches for page pool are here: > > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/ > > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/ > > > > > > > > > > > > > > > > They are also applied in my mt76 tree at: > > > > > > > > https://github.com/nbd168/wireless > > > > > > > > > > > > > > > > - Felix > > > > > > > > > > > > > > So one thing I am thinking is that we may be seeing an issue where we > > > > > > > are somehow getting a mix of frag and non-frag based page pool pages. > > > > > > > That is the only case I can think of where we might be underflowing > > > > > > > negative. If you could add some additional debug info on the underflow > > > > > > > WARN_ON case in page_pool_defrag_page that might be useful. > > > > > > > Specifically I would be curious what the actual return value is. I'm > > > > > > > assuming we are only hitting negative 1, but I would want to verify we > > > > > > > aren't seeing something else. > > > > > > I'll try to run some more tests soon. However, I think I found the piece > > > > > > of code that is incompatible with using pp_frag_count. > > > > > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 > > > > > > packet), and it is not split by the hardware, a cfg80211 function > > > > > > extracts the individual MSDUs into separate skbs. In that case, a > > > > > > fragment can be shared across multiple skbs, and get_page is used to > > > > > > increase the refcount. > > > > > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and > > > > > > its helper functions). > > > > > > > > > > I'm not sure if it is problematic or not. Basically it is trading off > > > > > by copying over the frags, calling get_page on each frag, and then > > > > > using dev_kfree_skb to disassemble and release the pp_frag references. > > > > > There should be other paths in the kernel that are doing something > > > > > similar. > > > > > > > > > > > This code also has a bug where it doesn't set pp_recycle on the newly > > > > > > allocated skb if the previous one has it, but that's a separate matter > > > > > > and fixing it doesn't make the crash go away. > > > > > > > > > > Adding the recycle would cause this bug. So one thing we might be > > > > > seeing is something like that triggering this error. Specifically if > > > > > the page is taken via get_page when assembling the new skb then we > > > > > cannot set the recycle flag in the new skb otherwise it will result in > > > > > the reference undercount we are seeing. What we are doing is shifting > > > > > the references away from the pp_frag_count to the page reference count > > > > > in this case. If we set the pp_recycle flag then it would cause us to > > > > > decrement pp_frag_count instead of the page reference count resulting > > > > > in the underrun. > > > > Couldn't leaving out the pp_recycle flag potentially lead to a case > > > > where the last user of the page drops it via page_frag_free instead of > > > > page_pool_return_skb_page? Is that valid? > > > > > > No. What will happen is that when the pp_frag_count is exhausted the > > > page will be unmapped and evicted from the page pool. When the page is > > > then finally freed it will end up going back to the page allocator > > > instead of page pool. > > > > > > Basically the idea is that until pp_frag_count reaches 0 there will be > > > at least 1 page reference held. > > > > > > > > > Is there any way I can make that part of the code work with the current > > > > > > page pool frag implementation? > > > > > > > > > > The current code should work. Basically as long as the references are > > > > > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run > > > > > into this issue because the pp_frag_count will be dropped when the > > > > > original skb is freed and the page reference count will be decremented > > > > > when the new one is freed. > > > > > > > > > > For page pool page fragments the main thing to keep in mind is that if > > > > > pp_recycle is set it will update the pp_frag_count and if it is not > > > > > then it will just decrement the page reference count. > > > > What takes care of DMA unmap and other cleanup if the last reference to > > > > the page is dropped via page_frag_free? > > > > > > > > - Felix > > > > > > When the page is freed on the skb w/ pp_recycle set it will unmap the > > > page and evict it from the page pool. Basically in these cases the page > > > goes from the page pool back to the page allocator. > > > > > > The general idea with this is that if we are using fragments that there > > > will be enough of them floating around that if one or two frags have a > > > temporeary detour through a non-recycling path that hopefully by the > > > time the last fragment is freed the other instances holding the > > > additional page reference will have let them go. If not then the page > > > will go back to the page allocator and it will have to be replaced in > > > the page pool. > > Thanks for the explanation, it makes sense to me now. Unfortunately it > > also means that I have no idea what could cause this issue. I will > > finish my mt76 patch rework which gets rid of the pp vs non-pp > > allocation mix and re-run my tests to provide updated traces. > Here's the updated mt76 page pool support commit: > https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808 Yeah, so I don't see anything wrong with the patch in terms of page pool. > And here is the trace that I'm getting with 6.1: > https://nbd.name/p/a16957f2 > > If you have any debug patch you'd like me to test, please let me know. > > - Felix So looking at the traces I am assuming what we are seeing is the deferred freeing from the TCP Rx path since I don't see a driver anywhere between net_rx_action and napi_consume skb. So it seems like the packets are likely making it all the way up the network stack. Is this the first wireless driver to add support for page pool? I'm thinking we must be seeing something in the wireless path that is causing an issue such as the function you called out earlier but I can't see anything obvious. One thing we need to be on the lookout for is cloned skbs. When an skb is cloned the pp_recycle gets copied over. In that case the reference is moved over to the skb dataref count. What comes to mind is something like commit 1effe8ca4e34c ("skbuff: fix coalescing for page_pool fragment recycling").
On 25.01.23 23:14, Alexander H Duyck wrote: > On Wed, 2023-01-25 at 20:40 +0100, Felix Fietkau wrote: >> On 25.01.23 20:10, Felix Fietkau wrote: >> > On 25.01.23 20:02, Alexander H Duyck wrote: >> > > On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote: >> > > > On 25.01.23 19:26, Alexander H Duyck wrote: >> > > > > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote: >> > > > > > On 25.01.23 18:11, Alexander H Duyck wrote: >> > > > > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote: >> > > > > > > > On 24.01.23 22:10, Alexander H Duyck wrote: >> > > > > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote: >> > > > > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote: >> > > > > > > > > > > Hi Felix, >> > > > > > > > > > > >> > > > > > > > > > > ++cc Alexander and Yunsheng. >> > > > > > > > > > > >> > > > > > > > > > > Thanks for the report >> > > > > > > > > > > >> > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: >> > > > > > > > > > > > >> > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able >> > > > > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with >> > > > > > > > > > > > full-page page_pool allocation. >> > > > > > > > > > > > It appears to me, that handling refcounting in two separate counters >> > > > > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets >> > > > > > > > > > > > incremented by code dealing with skb fragments directly, and >> > > > > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment. >> > > > > > > > > > > > >> > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes >> > > > > > > > > > > > these underflow issues and crashes go away. >> > > > > > > > > > > > >> > > > > > > > > > > >> > > > > > > > > > > This has been discussed here [1]. TL;DR changing this to page >> > > > > > > > > > > refcount might blow up in other colorful ways. Can we look closer and >> > > > > > > > > > > figure out why the underflow happens? >> > > > > > > > > > I don't see how the approch taken in my patch would blow up. From what I >> > > > > > > > > > can tell, it should be fairly close to how refcount is handled in >> > > > > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from >> > > > > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs >> > > > > > > > > > with corresponding get_page and page_pool_return_skb_page calls. >> > > > > > > > > > >> > > > > > > > > > - Felix >> > > > > > > > > > >> > > > > > > > > >> > > > > > > > > Do you have the patch available to review as an RFC? From what I am >> > > > > > > > > seeing it looks like you are underrunning on the pp_frag_count itself. >> > > > > > > > > I would suspect the issue to be something like starting with a bad >> > > > > > > > > count in terms of the total number of references, or deducing the wrong >> > > > > > > > > amount when you finally free the page assuming you are tracking your >> > > > > > > > > frag count using a non-atomic value in the driver. >> > > > > > > > The driver patches for page pool are here: >> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/ >> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/ >> > > > > > > > >> > > > > > > > They are also applied in my mt76 tree at: >> > > > > > > > https://github.com/nbd168/wireless >> > > > > > > > >> > > > > > > > - Felix >> > > > > > > >> > > > > > > So one thing I am thinking is that we may be seeing an issue where we >> > > > > > > are somehow getting a mix of frag and non-frag based page pool pages. >> > > > > > > That is the only case I can think of where we might be underflowing >> > > > > > > negative. If you could add some additional debug info on the underflow >> > > > > > > WARN_ON case in page_pool_defrag_page that might be useful. >> > > > > > > Specifically I would be curious what the actual return value is. I'm >> > > > > > > assuming we are only hitting negative 1, but I would want to verify we >> > > > > > > aren't seeing something else. >> > > > > > I'll try to run some more tests soon. However, I think I found the piece >> > > > > > of code that is incompatible with using pp_frag_count. >> > > > > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 >> > > > > > packet), and it is not split by the hardware, a cfg80211 function >> > > > > > extracts the individual MSDUs into separate skbs. In that case, a >> > > > > > fragment can be shared across multiple skbs, and get_page is used to >> > > > > > increase the refcount. >> > > > > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and >> > > > > > its helper functions). >> > > > > >> > > > > I'm not sure if it is problematic or not. Basically it is trading off >> > > > > by copying over the frags, calling get_page on each frag, and then >> > > > > using dev_kfree_skb to disassemble and release the pp_frag references. >> > > > > There should be other paths in the kernel that are doing something >> > > > > similar. >> > > > > >> > > > > > This code also has a bug where it doesn't set pp_recycle on the newly >> > > > > > allocated skb if the previous one has it, but that's a separate matter >> > > > > > and fixing it doesn't make the crash go away. >> > > > > >> > > > > Adding the recycle would cause this bug. So one thing we might be >> > > > > seeing is something like that triggering this error. Specifically if >> > > > > the page is taken via get_page when assembling the new skb then we >> > > > > cannot set the recycle flag in the new skb otherwise it will result in >> > > > > the reference undercount we are seeing. What we are doing is shifting >> > > > > the references away from the pp_frag_count to the page reference count >> > > > > in this case. If we set the pp_recycle flag then it would cause us to >> > > > > decrement pp_frag_count instead of the page reference count resulting >> > > > > in the underrun. >> > > > Couldn't leaving out the pp_recycle flag potentially lead to a case >> > > > where the last user of the page drops it via page_frag_free instead of >> > > > page_pool_return_skb_page? Is that valid? >> > > >> > > No. What will happen is that when the pp_frag_count is exhausted the >> > > page will be unmapped and evicted from the page pool. When the page is >> > > then finally freed it will end up going back to the page allocator >> > > instead of page pool. >> > > >> > > Basically the idea is that until pp_frag_count reaches 0 there will be >> > > at least 1 page reference held. >> > > >> > > > > > Is there any way I can make that part of the code work with the current >> > > > > > page pool frag implementation? >> > > > > >> > > > > The current code should work. Basically as long as the references are >> > > > > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run >> > > > > into this issue because the pp_frag_count will be dropped when the >> > > > > original skb is freed and the page reference count will be decremented >> > > > > when the new one is freed. >> > > > > >> > > > > For page pool page fragments the main thing to keep in mind is that if >> > > > > pp_recycle is set it will update the pp_frag_count and if it is not >> > > > > then it will just decrement the page reference count. >> > > > What takes care of DMA unmap and other cleanup if the last reference to >> > > > the page is dropped via page_frag_free? >> > > > >> > > > - Felix >> > > >> > > When the page is freed on the skb w/ pp_recycle set it will unmap the >> > > page and evict it from the page pool. Basically in these cases the page >> > > goes from the page pool back to the page allocator. >> > > >> > > The general idea with this is that if we are using fragments that there >> > > will be enough of them floating around that if one or two frags have a >> > > temporeary detour through a non-recycling path that hopefully by the >> > > time the last fragment is freed the other instances holding the >> > > additional page reference will have let them go. If not then the page >> > > will go back to the page allocator and it will have to be replaced in >> > > the page pool. >> > Thanks for the explanation, it makes sense to me now. Unfortunately it >> > also means that I have no idea what could cause this issue. I will >> > finish my mt76 patch rework which gets rid of the pp vs non-pp >> > allocation mix and re-run my tests to provide updated traces. >> Here's the updated mt76 page pool support commit: >> https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808 > > Yeah, so I don't see anything wrong with the patch in terms of page > pool. > >> And here is the trace that I'm getting with 6.1: >> https://nbd.name/p/a16957f2 >> >> If you have any debug patch you'd like me to test, please let me know. >> >> - Felix > > So looking at the traces I am assuming what we are seeing is the > deferred freeing from the TCP Rx path since I don't see a driver > anywhere between net_rx_action and napi_consume skb. So it seems like > the packets are likely making it all the way up the network stack. > > Is this the first wireless driver to add support for page pool? I'm > thinking we must be seeing something in the wireless path that is > causing an issue such as the function you called out earlier but I > can't see anything obvious. Yes, it's the first driver with page pool support. > One thing we need to be on the lookout for is cloned skbs. When an skb > is cloned the pp_recycle gets copied over. In that case the reference > is moved over to the skb dataref count. What comes to mind is something > like commit 1effe8ca4e34c ("skbuff: fix coalescing for page_pool > fragment recycling"). I suspect that the crash might be related to a bad interaction between the page reuse in A-MSDU rx + skb coalescing on TCP rx. If I change the A-MSDU code to copy data instead of reusing fragments, it doesn't crash anymore. I believe the issue must be specific to that codepath, since most received and processed packets are either not A-MSDU or A-MSDU decap has already been performed by the hardware. If I change my test to use 3 client mode interfaces instead of 4, the hardware is able to offload all A-MSDU rx processing and I don't see any crashes anymore. Could you please take another look at ieee80211_amsdu_to_8023s to see if there's anything in there that could cause these issues? - Felix
On 26.01.23 07:12, Felix Fietkau wrote: > On 25.01.23 23:14, Alexander H Duyck wrote: >> On Wed, 2023-01-25 at 20:40 +0100, Felix Fietkau wrote: >>> On 25.01.23 20:10, Felix Fietkau wrote: >>> > On 25.01.23 20:02, Alexander H Duyck wrote: >>> > > On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote: >>> > > > On 25.01.23 19:26, Alexander H Duyck wrote: >>> > > > > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote: >>> > > > > > On 25.01.23 18:11, Alexander H Duyck wrote: >>> > > > > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote: >>> > > > > > > > On 24.01.23 22:10, Alexander H Duyck wrote: >>> > > > > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote: >>> > > > > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote: >>> > > > > > > > > > > Hi Felix, >>> > > > > > > > > > > >>> > > > > > > > > > > ++cc Alexander and Yunsheng. >>> > > > > > > > > > > >>> > > > > > > > > > > Thanks for the report >>> > > > > > > > > > > >>> > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: >>> > > > > > > > > > > > >>> > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able >>> > > > > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with >>> > > > > > > > > > > > full-page page_pool allocation. >>> > > > > > > > > > > > It appears to me, that handling refcounting in two separate counters >>> > > > > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets >>> > > > > > > > > > > > incremented by code dealing with skb fragments directly, and >>> > > > > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment. >>> > > > > > > > > > > > >>> > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes >>> > > > > > > > > > > > these underflow issues and crashes go away. >>> > > > > > > > > > > > >>> > > > > > > > > > > >>> > > > > > > > > > > This has been discussed here [1]. TL;DR changing this to page >>> > > > > > > > > > > refcount might blow up in other colorful ways. Can we look closer and >>> > > > > > > > > > > figure out why the underflow happens? >>> > > > > > > > > > I don't see how the approch taken in my patch would blow up. From what I >>> > > > > > > > > > can tell, it should be fairly close to how refcount is handled in >>> > > > > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from >>> > > > > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs >>> > > > > > > > > > with corresponding get_page and page_pool_return_skb_page calls. >>> > > > > > > > > > >>> > > > > > > > > > - Felix >>> > > > > > > > > > >>> > > > > > > > > >>> > > > > > > > > Do you have the patch available to review as an RFC? From what I am >>> > > > > > > > > seeing it looks like you are underrunning on the pp_frag_count itself. >>> > > > > > > > > I would suspect the issue to be something like starting with a bad >>> > > > > > > > > count in terms of the total number of references, or deducing the wrong >>> > > > > > > > > amount when you finally free the page assuming you are tracking your >>> > > > > > > > > frag count using a non-atomic value in the driver. >>> > > > > > > > The driver patches for page pool are here: >>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/ >>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/ >>> > > > > > > > >>> > > > > > > > They are also applied in my mt76 tree at: >>> > > > > > > > https://github.com/nbd168/wireless >>> > > > > > > > >>> > > > > > > > - Felix >>> > > > > > > >>> > > > > > > So one thing I am thinking is that we may be seeing an issue where we >>> > > > > > > are somehow getting a mix of frag and non-frag based page pool pages. >>> > > > > > > That is the only case I can think of where we might be underflowing >>> > > > > > > negative. If you could add some additional debug info on the underflow >>> > > > > > > WARN_ON case in page_pool_defrag_page that might be useful. >>> > > > > > > Specifically I would be curious what the actual return value is. I'm >>> > > > > > > assuming we are only hitting negative 1, but I would want to verify we >>> > > > > > > aren't seeing something else. >>> > > > > > I'll try to run some more tests soon. However, I think I found the piece >>> > > > > > of code that is incompatible with using pp_frag_count. >>> > > > > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 >>> > > > > > packet), and it is not split by the hardware, a cfg80211 function >>> > > > > > extracts the individual MSDUs into separate skbs. In that case, a >>> > > > > > fragment can be shared across multiple skbs, and get_page is used to >>> > > > > > increase the refcount. >>> > > > > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and >>> > > > > > its helper functions). >>> > > > > >>> > > > > I'm not sure if it is problematic or not. Basically it is trading off >>> > > > > by copying over the frags, calling get_page on each frag, and then >>> > > > > using dev_kfree_skb to disassemble and release the pp_frag references. >>> > > > > There should be other paths in the kernel that are doing something >>> > > > > similar. >>> > > > > >>> > > > > > This code also has a bug where it doesn't set pp_recycle on the newly >>> > > > > > allocated skb if the previous one has it, but that's a separate matter >>> > > > > > and fixing it doesn't make the crash go away. >>> > > > > >>> > > > > Adding the recycle would cause this bug. So one thing we might be >>> > > > > seeing is something like that triggering this error. Specifically if >>> > > > > the page is taken via get_page when assembling the new skb then we >>> > > > > cannot set the recycle flag in the new skb otherwise it will result in >>> > > > > the reference undercount we are seeing. What we are doing is shifting >>> > > > > the references away from the pp_frag_count to the page reference count >>> > > > > in this case. If we set the pp_recycle flag then it would cause us to >>> > > > > decrement pp_frag_count instead of the page reference count resulting >>> > > > > in the underrun. >>> > > > Couldn't leaving out the pp_recycle flag potentially lead to a case >>> > > > where the last user of the page drops it via page_frag_free instead of >>> > > > page_pool_return_skb_page? Is that valid? >>> > > >>> > > No. What will happen is that when the pp_frag_count is exhausted the >>> > > page will be unmapped and evicted from the page pool. When the page is >>> > > then finally freed it will end up going back to the page allocator >>> > > instead of page pool. >>> > > >>> > > Basically the idea is that until pp_frag_count reaches 0 there will be >>> > > at least 1 page reference held. >>> > > >>> > > > > > Is there any way I can make that part of the code work with the current >>> > > > > > page pool frag implementation? >>> > > > > >>> > > > > The current code should work. Basically as long as the references are >>> > > > > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run >>> > > > > into this issue because the pp_frag_count will be dropped when the >>> > > > > original skb is freed and the page reference count will be decremented >>> > > > > when the new one is freed. >>> > > > > >>> > > > > For page pool page fragments the main thing to keep in mind is that if >>> > > > > pp_recycle is set it will update the pp_frag_count and if it is not >>> > > > > then it will just decrement the page reference count. >>> > > > What takes care of DMA unmap and other cleanup if the last reference to >>> > > > the page is dropped via page_frag_free? >>> > > > >>> > > > - Felix >>> > > >>> > > When the page is freed on the skb w/ pp_recycle set it will unmap the >>> > > page and evict it from the page pool. Basically in these cases the page >>> > > goes from the page pool back to the page allocator. >>> > > >>> > > The general idea with this is that if we are using fragments that there >>> > > will be enough of them floating around that if one or two frags have a >>> > > temporeary detour through a non-recycling path that hopefully by the >>> > > time the last fragment is freed the other instances holding the >>> > > additional page reference will have let them go. If not then the page >>> > > will go back to the page allocator and it will have to be replaced in >>> > > the page pool. >>> > Thanks for the explanation, it makes sense to me now. Unfortunately it >>> > also means that I have no idea what could cause this issue. I will >>> > finish my mt76 patch rework which gets rid of the pp vs non-pp >>> > allocation mix and re-run my tests to provide updated traces. >>> Here's the updated mt76 page pool support commit: >>> https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808 >> >> Yeah, so I don't see anything wrong with the patch in terms of page >> pool. >> >>> And here is the trace that I'm getting with 6.1: >>> https://nbd.name/p/a16957f2 >>> >>> If you have any debug patch you'd like me to test, please let me know. >>> >>> - Felix >> >> So looking at the traces I am assuming what we are seeing is the >> deferred freeing from the TCP Rx path since I don't see a driver >> anywhere between net_rx_action and napi_consume skb. So it seems like >> the packets are likely making it all the way up the network stack. >> >> Is this the first wireless driver to add support for page pool? I'm >> thinking we must be seeing something in the wireless path that is >> causing an issue such as the function you called out earlier but I >> can't see anything obvious. > Yes, it's the first driver with page pool support. > >> One thing we need to be on the lookout for is cloned skbs. When an skb >> is cloned the pp_recycle gets copied over. In that case the reference >> is moved over to the skb dataref count. What comes to mind is something >> like commit 1effe8ca4e34c ("skbuff: fix coalescing for page_pool >> fragment recycling"). > I suspect that the crash might be related to a bad interaction between > the page reuse in A-MSDU rx + skb coalescing on TCP rx. > If I change the A-MSDU code to copy data instead of reusing fragments, > it doesn't crash anymore. > I believe the issue must be specific to that codepath, since most > received and processed packets are either not A-MSDU or A-MSDU decap has > already been performed by the hardware. > If I change my test to use 3 client mode interfaces instead of 4, the > hardware is able to offload all A-MSDU rx processing and I don't see any > crashes anymore. > > Could you please take another look at ieee80211_amsdu_to_8023s to see if > there's anything in there that could cause these issues? Here are clues from a few more tests that I ran: - preventing the reuse of the last skb in ieee80211_amsdu_to_8023s does not prevent the crashes, so the issue is indeed related to taking page references and putting the pages in skb fragments. - if I return false in skb_try_coalesce, it still crashes: https://nbd.name/p/18cac078 - Felix
Hi Alexander, Sorry for being late to the party, was overloaded... On Tue, Jan 24, 2023 at 07:57:35AM -0800, Alexander H Duyck wrote: > On Tue, 2023-01-24 at 16:11 +0200, Ilias Apalodimas wrote: > > Hi Felix, > > > > ++cc Alexander and Yunsheng. > > > > Thanks for the report > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able > > > to reliably trigger page refcount underflow issues, which did not occur with > > > full-page page_pool allocation. > > > It appears to me, that handling refcounting in two separate counters > > > (page->pp_frag_count and page refcount) is racy when page refcount gets > > > incremented by code dealing with skb fragments directly, and > > > page_pool_return_skb_page is called multiple times for the same fragment. > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes > > > these underflow issues and crashes go away. > > > > > > > This has been discussed here [1]. TL;DR changing this to page > > refcount might blow up in other colorful ways. Can we look closer and > > figure out why the underflow happens? > > > > [1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/ > > > > Thanks > > /Ilias > > > > > > The logic should be safe in terms of the page pool itself as it should > be holding one reference to the page while the pp_frag_count is non- > zero. That one reference is what keeps the two halfs in sync as the > page shouldn't be able to be freed until we exhaust the pp_frag_count. Do you remember why we decided to go with the fragment counter instead of page references? > > To have an underflow there are two possible scenarios. One is that > either put_page or free_page is being called somewhere that the > page_pool freeing functions should be used. Wouldn't that affect the non fragmented path as well? IOW the driver that works with a full page would crash as well. > The other possibility is > that a pp_frag_count reference was taken somewhere a page reference > should have. > > Do we have a backtrace for the spots that are showing this underrun? If > nothing else we may want to look at tracking down the spots that are > freeing the page pool pages via put_page or free_page to determine what > paths these pages are taking. Thanks /Ilias
On Tue, Jan 24, 2023 at 06:22:54PM +0100, Felix Fietkau wrote: > On 24.01.23 15:11, Ilias Apalodimas wrote: > > Hi Felix, > > > > ++cc Alexander and Yunsheng. > > > > Thanks for the report > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able > > > to reliably trigger page refcount underflow issues, which did not occur with > > > full-page page_pool allocation. > > > It appears to me, that handling refcounting in two separate counters > > > (page->pp_frag_count and page refcount) is racy when page refcount gets > > > incremented by code dealing with skb fragments directly, and > > > page_pool_return_skb_page is called multiple times for the same fragment. > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes > > > these underflow issues and crashes go away. > > > > > > > This has been discussed here [1]. TL;DR changing this to page > > refcount might blow up in other colorful ways. Can we look closer and > > figure out why the underflow happens? > I don't see how the approch taken in my patch would blow up. From what I can > tell, it should be fairly close to how refcount is handled in > page_frag_alloc. The main improvement it adds is to prevent it from blowing > up if pool-allocated fragments get shared across multiple skbs with > corresponding get_page and page_pool_return_skb_page calls. > > - Felix > Yes sorry for the noise, that patch I referred to was doing a completely different thing, elevating the page refcnt to BIAS_MAX from the start Thanks /Ilias
On Thu, Jan 26, 2023 at 2:32 AM Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote: > > Hi Alexander, > > Sorry for being late to the party, was overloaded... > > On Tue, Jan 24, 2023 at 07:57:35AM -0800, Alexander H Duyck wrote: > > On Tue, 2023-01-24 at 16:11 +0200, Ilias Apalodimas wrote: > > > Hi Felix, > > > > > > ++cc Alexander and Yunsheng. > > > > > > Thanks for the report > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able > > > > to reliably trigger page refcount underflow issues, which did not occur with > > > > full-page page_pool allocation. > > > > It appears to me, that handling refcounting in two separate counters > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets > > > > incremented by code dealing with skb fragments directly, and > > > > page_pool_return_skb_page is called multiple times for the same fragment. > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes > > > > these underflow issues and crashes go away. > > > > > > > > > > This has been discussed here [1]. TL;DR changing this to page > > > refcount might blow up in other colorful ways. Can we look closer and > > > figure out why the underflow happens? > > > > > > [1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/ > > > > > > Thanks > > > /Ilias > > > > > > > > > > The logic should be safe in terms of the page pool itself as it should > > be holding one reference to the page while the pp_frag_count is non- > > zero. That one reference is what keeps the two halfs in sync as the > > page shouldn't be able to be freed until we exhaust the pp_frag_count. > > Do you remember why we decided to go with the fragment counter instead of > page references? The issue has to do with when to destroy the mappings. Basically with the fragment counter we destroy the mappings and remove the page from the pool when the count hits 0. The reference count is really used for the page allocator to do its tracking. If we end up trying to merge the two the problem becomes one of lifetimes as we wouldn't know when to destroy the DMA mappings as they would have to live the full life of the page. > > > > To have an underflow there are two possible scenarios. One is that > > either put_page or free_page is being called somewhere that the > > page_pool freeing functions should be used. > > Wouldn't that affect the non fragmented path as well? IOW the driver that > works with a full page would crash as well. The problem is the non-fragmented path doesn't get as noisy. Also there aren't currently any wireless drivers making use of the page pool, or at least that is my understanding. I'm suspecting something like the issue we saw in 1effe8ca4e34c ("skbuff: fix coalescing for page_pool fragment recycling"). We likely have some corner case where we should be taking a page reference and clearing a pp_recycle flag.
On Thu, Jan 26, 2023 at 07:41:15AM -0800, Alexander Duyck wrote: > On Thu, Jan 26, 2023 at 2:32 AM Ilias Apalodimas > <ilias.apalodimas@linaro.org> wrote: > > > > Hi Alexander, > > > > Sorry for being late to the party, was overloaded... > > > > On Tue, Jan 24, 2023 at 07:57:35AM -0800, Alexander H Duyck wrote: > > > On Tue, 2023-01-24 at 16:11 +0200, Ilias Apalodimas wrote: > > > > Hi Felix, > > > > > > > > ++cc Alexander and Yunsheng. > > > > > > > > Thanks for the report > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able > > > > > to reliably trigger page refcount underflow issues, which did not occur with > > > > > full-page page_pool allocation. > > > > > It appears to me, that handling refcounting in two separate counters > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets > > > > > incremented by code dealing with skb fragments directly, and > > > > > page_pool_return_skb_page is called multiple times for the same fragment. > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes > > > > > these underflow issues and crashes go away. > > > > > > > > > > > > > This has been discussed here [1]. TL;DR changing this to page > > > > refcount might blow up in other colorful ways. Can we look closer and > > > > figure out why the underflow happens? > > > > > > > > [1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/ > > > > > > > > Thanks > > > > /Ilias > > > > > > > > > > > > > > The logic should be safe in terms of the page pool itself as it should > > > be holding one reference to the page while the pp_frag_count is non- > > > zero. That one reference is what keeps the two halfs in sync as the > > > page shouldn't be able to be freed until we exhaust the pp_frag_count. > > > > Do you remember why we decided to go with the fragment counter instead of > > page references? > > The issue has to do with when to destroy the mappings. Basically with > the fragment counter we destroy the mappings and remove the page from > the pool when the count hits 0. The reference count is really used for > the page allocator to do its tracking. If we end up trying to merge > the two the problem becomes one of lifetimes as we wouldn't know when > to destroy the DMA mappings as they would have to live the full life > of the page. Ah yes thanks! We need that on a comment somewhere, I keep forgetting... Basically the pp_frag_count is our number of outstanding dma mappings. > > > > > > > To have an underflow there are two possible scenarios. One is that > > > either put_page or free_page is being called somewhere that the > > > page_pool freeing functions should be used. > > > > Wouldn't that affect the non fragmented path as well? IOW the driver that > > works with a full page would crash as well. > > The problem is the non-fragmented path doesn't get as noisy. Also > there aren't currently any wireless drivers making use of the page > pool, or at least that is my understanding. I'm suspecting something > like the issue we saw in 1effe8ca4e34c ("skbuff: fix coalescing for > page_pool fragment recycling"). We likely have some corner case where > we should be taking a page reference and clearing a pp_recycle flag. Yea, same thinking here. I'll have another closer look tomorrow, but looking at the wireless internals what happens is 1. They alloc a fragment 2. They create a new SKB, without the recycle bit and refer to the existing fragments. Since the recyle bit is off *that* skb will never try to decrease the frag counter. Instead it bumps the page refcnt which should be properly decremented one that SKB is freed. I guess somehow an SKB ends up with the recycle bit set, when it shouldn't. Regards /Ilias
On Thu, Jan 26, 2023 at 1:15 AM Felix Fietkau <nbd@nbd.name> wrote: > > On 26.01.23 07:12, Felix Fietkau wrote: > > On 25.01.23 23:14, Alexander H Duyck wrote: > >> On Wed, 2023-01-25 at 20:40 +0100, Felix Fietkau wrote: > >>> On 25.01.23 20:10, Felix Fietkau wrote: > >>> > On 25.01.23 20:02, Alexander H Duyck wrote: > >>> > > On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote: > >>> > > > On 25.01.23 19:26, Alexander H Duyck wrote: > >>> > > > > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote: > >>> > > > > > On 25.01.23 18:11, Alexander H Duyck wrote: > >>> > > > > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote: > >>> > > > > > > > On 24.01.23 22:10, Alexander H Duyck wrote: > >>> > > > > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote: > >>> > > > > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote: > >>> > > > > > > > > > > Hi Felix, > >>> > > > > > > > > > > > >>> > > > > > > > > > > ++cc Alexander and Yunsheng. > >>> > > > > > > > > > > > >>> > > > > > > > > > > Thanks for the report > >>> > > > > > > > > > > > >>> > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: > >>> > > > > > > > > > > > > >>> > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able > >>> > > > > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with > >>> > > > > > > > > > > > full-page page_pool allocation. > >>> > > > > > > > > > > > It appears to me, that handling refcounting in two separate counters > >>> > > > > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets > >>> > > > > > > > > > > > incremented by code dealing with skb fragments directly, and > >>> > > > > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment. > >>> > > > > > > > > > > > > >>> > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes > >>> > > > > > > > > > > > these underflow issues and crashes go away. > >>> > > > > > > > > > > > > >>> > > > > > > > > > > > >>> > > > > > > > > > > This has been discussed here [1]. TL;DR changing this to page > >>> > > > > > > > > > > refcount might blow up in other colorful ways. Can we look closer and > >>> > > > > > > > > > > figure out why the underflow happens? > >>> > > > > > > > > > I don't see how the approch taken in my patch would blow up. From what I > >>> > > > > > > > > > can tell, it should be fairly close to how refcount is handled in > >>> > > > > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from > >>> > > > > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs > >>> > > > > > > > > > with corresponding get_page and page_pool_return_skb_page calls. > >>> > > > > > > > > > > >>> > > > > > > > > > - Felix > >>> > > > > > > > > > > >>> > > > > > > > > > >>> > > > > > > > > Do you have the patch available to review as an RFC? From what I am > >>> > > > > > > > > seeing it looks like you are underrunning on the pp_frag_count itself. > >>> > > > > > > > > I would suspect the issue to be something like starting with a bad > >>> > > > > > > > > count in terms of the total number of references, or deducing the wrong > >>> > > > > > > > > amount when you finally free the page assuming you are tracking your > >>> > > > > > > > > frag count using a non-atomic value in the driver. > >>> > > > > > > > The driver patches for page pool are here: > >>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/ > >>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/ > >>> > > > > > > > > >>> > > > > > > > They are also applied in my mt76 tree at: > >>> > > > > > > > https://github.com/nbd168/wireless > >>> > > > > > > > > >>> > > > > > > > - Felix > >>> > > > > > > > >>> > > > > > > So one thing I am thinking is that we may be seeing an issue where we > >>> > > > > > > are somehow getting a mix of frag and non-frag based page pool pages. > >>> > > > > > > That is the only case I can think of where we might be underflowing > >>> > > > > > > negative. If you could add some additional debug info on the underflow > >>> > > > > > > WARN_ON case in page_pool_defrag_page that might be useful. > >>> > > > > > > Specifically I would be curious what the actual return value is. I'm > >>> > > > > > > assuming we are only hitting negative 1, but I would want to verify we > >>> > > > > > > aren't seeing something else. > >>> > > > > > I'll try to run some more tests soon. However, I think I found the piece > >>> > > > > > of code that is incompatible with using pp_frag_count. > >>> > > > > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 > >>> > > > > > packet), and it is not split by the hardware, a cfg80211 function > >>> > > > > > extracts the individual MSDUs into separate skbs. In that case, a > >>> > > > > > fragment can be shared across multiple skbs, and get_page is used to > >>> > > > > > increase the refcount. > >>> > > > > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and > >>> > > > > > its helper functions). > >>> > > > > > >>> > > > > I'm not sure if it is problematic or not. Basically it is trading off > >>> > > > > by copying over the frags, calling get_page on each frag, and then > >>> > > > > using dev_kfree_skb to disassemble and release the pp_frag references. > >>> > > > > There should be other paths in the kernel that are doing something > >>> > > > > similar. > >>> > > > > > >>> > > > > > This code also has a bug where it doesn't set pp_recycle on the newly > >>> > > > > > allocated skb if the previous one has it, but that's a separate matter > >>> > > > > > and fixing it doesn't make the crash go away. > >>> > > > > > >>> > > > > Adding the recycle would cause this bug. So one thing we might be > >>> > > > > seeing is something like that triggering this error. Specifically if > >>> > > > > the page is taken via get_page when assembling the new skb then we > >>> > > > > cannot set the recycle flag in the new skb otherwise it will result in > >>> > > > > the reference undercount we are seeing. What we are doing is shifting > >>> > > > > the references away from the pp_frag_count to the page reference count > >>> > > > > in this case. If we set the pp_recycle flag then it would cause us to > >>> > > > > decrement pp_frag_count instead of the page reference count resulting > >>> > > > > in the underrun. > >>> > > > Couldn't leaving out the pp_recycle flag potentially lead to a case > >>> > > > where the last user of the page drops it via page_frag_free instead of > >>> > > > page_pool_return_skb_page? Is that valid? > >>> > > > >>> > > No. What will happen is that when the pp_frag_count is exhausted the > >>> > > page will be unmapped and evicted from the page pool. When the page is > >>> > > then finally freed it will end up going back to the page allocator > >>> > > instead of page pool. > >>> > > > >>> > > Basically the idea is that until pp_frag_count reaches 0 there will be > >>> > > at least 1 page reference held. > >>> > > > >>> > > > > > Is there any way I can make that part of the code work with the current > >>> > > > > > page pool frag implementation? > >>> > > > > > >>> > > > > The current code should work. Basically as long as the references are > >>> > > > > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run > >>> > > > > into this issue because the pp_frag_count will be dropped when the > >>> > > > > original skb is freed and the page reference count will be decremented > >>> > > > > when the new one is freed. > >>> > > > > > >>> > > > > For page pool page fragments the main thing to keep in mind is that if > >>> > > > > pp_recycle is set it will update the pp_frag_count and if it is not > >>> > > > > then it will just decrement the page reference count. > >>> > > > What takes care of DMA unmap and other cleanup if the last reference to > >>> > > > the page is dropped via page_frag_free? > >>> > > > > >>> > > > - Felix > >>> > > > >>> > > When the page is freed on the skb w/ pp_recycle set it will unmap the > >>> > > page and evict it from the page pool. Basically in these cases the page > >>> > > goes from the page pool back to the page allocator. > >>> > > > >>> > > The general idea with this is that if we are using fragments that there > >>> > > will be enough of them floating around that if one or two frags have a > >>> > > temporeary detour through a non-recycling path that hopefully by the > >>> > > time the last fragment is freed the other instances holding the > >>> > > additional page reference will have let them go. If not then the page > >>> > > will go back to the page allocator and it will have to be replaced in > >>> > > the page pool. > >>> > Thanks for the explanation, it makes sense to me now. Unfortunately it > >>> > also means that I have no idea what could cause this issue. I will > >>> > finish my mt76 patch rework which gets rid of the pp vs non-pp > >>> > allocation mix and re-run my tests to provide updated traces. > >>> Here's the updated mt76 page pool support commit: > >>> https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808 > >> > >> Yeah, so I don't see anything wrong with the patch in terms of page > >> pool. > >> > >>> And here is the trace that I'm getting with 6.1: > >>> https://nbd.name/p/a16957f2 > >>> > >>> If you have any debug patch you'd like me to test, please let me know. > >>> > >>> - Felix > >> > >> So looking at the traces I am assuming what we are seeing is the > >> deferred freeing from the TCP Rx path since I don't see a driver > >> anywhere between net_rx_action and napi_consume skb. So it seems like > >> the packets are likely making it all the way up the network stack. > >> > >> Is this the first wireless driver to add support for page pool? I'm > >> thinking we must be seeing something in the wireless path that is > >> causing an issue such as the function you called out earlier but I > >> can't see anything obvious. > > Yes, it's the first driver with page pool support. > > > >> One thing we need to be on the lookout for is cloned skbs. When an skb > >> is cloned the pp_recycle gets copied over. In that case the reference > >> is moved over to the skb dataref count. What comes to mind is something > >> like commit 1effe8ca4e34c ("skbuff: fix coalescing for page_pool > >> fragment recycling"). > > I suspect that the crash might be related to a bad interaction between > > the page reuse in A-MSDU rx + skb coalescing on TCP rx. > > If I change the A-MSDU code to copy data instead of reusing fragments, > > it doesn't crash anymore. Which piece did you change? My main interest would be trying to narrow down the section of this function that is causing this. Did you modify __ieee80211_amsdu_copy or some other function within the setup? > > I believe the issue must be specific to that codepath, since most > > received and processed packets are either not A-MSDU or A-MSDU decap has > > already been performed by the hardware. > > If I change my test to use 3 client mode interfaces instead of 4, the > > hardware is able to offload all A-MSDU rx processing and I don't see any > > crashes anymore. > > > > Could you please take another look at ieee80211_amsdu_to_8023s to see if > > there's anything in there that could cause these issues? The thing is I am not sure it is the only cause for this. I am suspecting we are seeing this triggering an issue when combined with something else. If we could add some tracing to dump the skb and list buffers that might be helpful. We would want to verify the pp_recycle value, clone flag, and for the frags we would want to frag count and page reference counts. The expectation would be that the original skb should have the pp_recycle flag set and the frag counts consistent through the process, and the list skbs should all have taken page references w/ no pp_recycle on the skbs in the list. > Here are clues from a few more tests that I ran: > - preventing the reuse of the last skb in ieee80211_amsdu_to_8023s does > not prevent the crashes, so the issue is indeed related to taking page > references and putting the pages in skb fragments. You said in the first email it stops it and in the second "does not". I am assuming that is some sort of typo since you seem to be implying it does resolve it for you. Is that correct? > - if I return false in skb_try_coalesce, it still crashes: > https://nbd.name/p/18cac078 Yeah, I wasn't suspecting skb_try_coalesce since we have exercised the code. My thought was something like the function you mentioned above plus cloning or something else.
? On Thu, Jan 26, 2023 at 8:08 AM Alexander Duyck <alexander.duyck@gmail.com> wrote: > > On Thu, Jan 26, 2023 at 1:15 AM Felix Fietkau <nbd@nbd.name> wrote: > > ... > > - if I return false in skb_try_coalesce, it still crashes: > > https://nbd.name/p/18cac078 > > Yeah, I wasn't suspecting skb_try_coalesce since we have exercised the > code. My thought was something like the function you mentioned above > plus cloning or something else. One question I would have. Is GRO happening after the call to ieee80211_amsdu_to_8023s? If so we might want to try switching that off to see if it might be aggregating page pool frames and non-page pool frames together.
On 26.01.23 17:08, Alexander Duyck wrote: > On Thu, Jan 26, 2023 at 1:15 AM Felix Fietkau <nbd@nbd.name> wrote: >> >> On 26.01.23 07:12, Felix Fietkau wrote: >> > On 25.01.23 23:14, Alexander H Duyck wrote: >> >> On Wed, 2023-01-25 at 20:40 +0100, Felix Fietkau wrote: >> >>> On 25.01.23 20:10, Felix Fietkau wrote: >> >>> > On 25.01.23 20:02, Alexander H Duyck wrote: >> >>> > > On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote: >> >>> > > > On 25.01.23 19:26, Alexander H Duyck wrote: >> >>> > > > > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote: >> >>> > > > > > On 25.01.23 18:11, Alexander H Duyck wrote: >> >>> > > > > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote: >> >>> > > > > > > > On 24.01.23 22:10, Alexander H Duyck wrote: >> >>> > > > > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote: >> >>> > > > > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote: >> >>> > > > > > > > > > > Hi Felix, >> >>> > > > > > > > > > > >> >>> > > > > > > > > > > ++cc Alexander and Yunsheng. >> >>> > > > > > > > > > > >> >>> > > > > > > > > > > Thanks for the report >> >>> > > > > > > > > > > >> >>> > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote: >> >>> > > > > > > > > > > > >> >>> > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able >> >>> > > > > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with >> >>> > > > > > > > > > > > full-page page_pool allocation. >> >>> > > > > > > > > > > > It appears to me, that handling refcounting in two separate counters >> >>> > > > > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets >> >>> > > > > > > > > > > > incremented by code dealing with skb fragments directly, and >> >>> > > > > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment. >> >>> > > > > > > > > > > > >> >>> > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes >> >>> > > > > > > > > > > > these underflow issues and crashes go away. >> >>> > > > > > > > > > > > >> >>> > > > > > > > > > > >> >>> > > > > > > > > > > This has been discussed here [1]. TL;DR changing this to page >> >>> > > > > > > > > > > refcount might blow up in other colorful ways. Can we look closer and >> >>> > > > > > > > > > > figure out why the underflow happens? >> >>> > > > > > > > > > I don't see how the approch taken in my patch would blow up. From what I >> >>> > > > > > > > > > can tell, it should be fairly close to how refcount is handled in >> >>> > > > > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from >> >>> > > > > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs >> >>> > > > > > > > > > with corresponding get_page and page_pool_return_skb_page calls. >> >>> > > > > > > > > > >> >>> > > > > > > > > > - Felix >> >>> > > > > > > > > > >> >>> > > > > > > > > >> >>> > > > > > > > > Do you have the patch available to review as an RFC? From what I am >> >>> > > > > > > > > seeing it looks like you are underrunning on the pp_frag_count itself. >> >>> > > > > > > > > I would suspect the issue to be something like starting with a bad >> >>> > > > > > > > > count in terms of the total number of references, or deducing the wrong >> >>> > > > > > > > > amount when you finally free the page assuming you are tracking your >> >>> > > > > > > > > frag count using a non-atomic value in the driver. >> >>> > > > > > > > The driver patches for page pool are here: >> >>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/ >> >>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/ >> >>> > > > > > > > >> >>> > > > > > > > They are also applied in my mt76 tree at: >> >>> > > > > > > > https://github.com/nbd168/wireless >> >>> > > > > > > > >> >>> > > > > > > > - Felix >> >>> > > > > > > >> >>> > > > > > > So one thing I am thinking is that we may be seeing an issue where we >> >>> > > > > > > are somehow getting a mix of frag and non-frag based page pool pages. >> >>> > > > > > > That is the only case I can think of where we might be underflowing >> >>> > > > > > > negative. If you could add some additional debug info on the underflow >> >>> > > > > > > WARN_ON case in page_pool_defrag_page that might be useful. >> >>> > > > > > > Specifically I would be curious what the actual return value is. I'm >> >>> > > > > > > assuming we are only hitting negative 1, but I would want to verify we >> >>> > > > > > > aren't seeing something else. >> >>> > > > > > I'll try to run some more tests soon. However, I think I found the piece >> >>> > > > > > of code that is incompatible with using pp_frag_count. >> >>> > > > > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 >> >>> > > > > > packet), and it is not split by the hardware, a cfg80211 function >> >>> > > > > > extracts the individual MSDUs into separate skbs. In that case, a >> >>> > > > > > fragment can be shared across multiple skbs, and get_page is used to >> >>> > > > > > increase the refcount. >> >>> > > > > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and >> >>> > > > > > its helper functions). >> >>> > > > > >> >>> > > > > I'm not sure if it is problematic or not. Basically it is trading off >> >>> > > > > by copying over the frags, calling get_page on each frag, and then >> >>> > > > > using dev_kfree_skb to disassemble and release the pp_frag references. >> >>> > > > > There should be other paths in the kernel that are doing something >> >>> > > > > similar. >> >>> > > > > >> >>> > > > > > This code also has a bug where it doesn't set pp_recycle on the newly >> >>> > > > > > allocated skb if the previous one has it, but that's a separate matter >> >>> > > > > > and fixing it doesn't make the crash go away. >> >>> > > > > >> >>> > > > > Adding the recycle would cause this bug. So one thing we might be >> >>> > > > > seeing is something like that triggering this error. Specifically if >> >>> > > > > the page is taken via get_page when assembling the new skb then we >> >>> > > > > cannot set the recycle flag in the new skb otherwise it will result in >> >>> > > > > the reference undercount we are seeing. What we are doing is shifting >> >>> > > > > the references away from the pp_frag_count to the page reference count >> >>> > > > > in this case. If we set the pp_recycle flag then it would cause us to >> >>> > > > > decrement pp_frag_count instead of the page reference count resulting >> >>> > > > > in the underrun. >> >>> > > > Couldn't leaving out the pp_recycle flag potentially lead to a case >> >>> > > > where the last user of the page drops it via page_frag_free instead of >> >>> > > > page_pool_return_skb_page? Is that valid? >> >>> > > >> >>> > > No. What will happen is that when the pp_frag_count is exhausted the >> >>> > > page will be unmapped and evicted from the page pool. When the page is >> >>> > > then finally freed it will end up going back to the page allocator >> >>> > > instead of page pool. >> >>> > > >> >>> > > Basically the idea is that until pp_frag_count reaches 0 there will be >> >>> > > at least 1 page reference held. >> >>> > > >> >>> > > > > > Is there any way I can make that part of the code work with the current >> >>> > > > > > page pool frag implementation? >> >>> > > > > >> >>> > > > > The current code should work. Basically as long as the references are >> >>> > > > > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run >> >>> > > > > into this issue because the pp_frag_count will be dropped when the >> >>> > > > > original skb is freed and the page reference count will be decremented >> >>> > > > > when the new one is freed. >> >>> > > > > >> >>> > > > > For page pool page fragments the main thing to keep in mind is that if >> >>> > > > > pp_recycle is set it will update the pp_frag_count and if it is not >> >>> > > > > then it will just decrement the page reference count. >> >>> > > > What takes care of DMA unmap and other cleanup if the last reference to >> >>> > > > the page is dropped via page_frag_free? >> >>> > > > >> >>> > > > - Felix >> >>> > > >> >>> > > When the page is freed on the skb w/ pp_recycle set it will unmap the >> >>> > > page and evict it from the page pool. Basically in these cases the page >> >>> > > goes from the page pool back to the page allocator. >> >>> > > >> >>> > > The general idea with this is that if we are using fragments that there >> >>> > > will be enough of them floating around that if one or two frags have a >> >>> > > temporeary detour through a non-recycling path that hopefully by the >> >>> > > time the last fragment is freed the other instances holding the >> >>> > > additional page reference will have let them go. If not then the page >> >>> > > will go back to the page allocator and it will have to be replaced in >> >>> > > the page pool. >> >>> > Thanks for the explanation, it makes sense to me now. Unfortunately it >> >>> > also means that I have no idea what could cause this issue. I will >> >>> > finish my mt76 patch rework which gets rid of the pp vs non-pp >> >>> > allocation mix and re-run my tests to provide updated traces. >> >>> Here's the updated mt76 page pool support commit: >> >>> https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808 >> >> >> >> Yeah, so I don't see anything wrong with the patch in terms of page >> >> pool. >> >> >> >>> And here is the trace that I'm getting with 6.1: >> >>> https://nbd.name/p/a16957f2 >> >>> >> >>> If you have any debug patch you'd like me to test, please let me know. >> >>> >> >>> - Felix >> >> >> >> So looking at the traces I am assuming what we are seeing is the >> >> deferred freeing from the TCP Rx path since I don't see a driver >> >> anywhere between net_rx_action and napi_consume skb. So it seems like >> >> the packets are likely making it all the way up the network stack. >> >> >> >> Is this the first wireless driver to add support for page pool? I'm >> >> thinking we must be seeing something in the wireless path that is >> >> causing an issue such as the function you called out earlier but I >> >> can't see anything obvious. >> > Yes, it's the first driver with page pool support. >> > >> >> One thing we need to be on the lookout for is cloned skbs. When an skb >> >> is cloned the pp_recycle gets copied over. In that case the reference >> >> is moved over to the skb dataref count. What comes to mind is something >> >> like commit 1effe8ca4e34c ("skbuff: fix coalescing for page_pool >> >> fragment recycling"). >> > I suspect that the crash might be related to a bad interaction between >> > the page reuse in A-MSDU rx + skb coalescing on TCP rx. >> > If I change the A-MSDU code to copy data instead of reusing fragments, >> > it doesn't crash anymore. > > Which piece did you change? My main interest would be trying to narrow > down the section of this function that is causing this. Did you modify > __ieee80211_amsdu_copy or some other function within the setup? I replaced this line: bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb); with: bool reuse_frag = false; >> > I believe the issue must be specific to that codepath, since most >> > received and processed packets are either not A-MSDU or A-MSDU decap has >> > already been performed by the hardware. >> > If I change my test to use 3 client mode interfaces instead of 4, the >> > hardware is able to offload all A-MSDU rx processing and I don't see any >> > crashes anymore. >> > >> > Could you please take another look at ieee80211_amsdu_to_8023s to see if >> > there's anything in there that could cause these issues? > > The thing is I am not sure it is the only cause for this. I am > suspecting we are seeing this triggering an issue when combined with > something else. > > If we could add some tracing to dump the skb and list buffers that > might be helpful. We would want to verify the pp_recycle value, clone > flag, and for the frags we would want to frag count and page reference > counts. The expectation would be that the original skb should have the > pp_recycle flag set and the frag counts consistent through the > process, and the list skbs should all have taken page references w/ no > pp_recycle on the skbs in the list. > >> Here are clues from a few more tests that I ran: >> - preventing the reuse of the last skb in ieee80211_amsdu_to_8023s does >> not prevent the crashes, so the issue is indeed related to taking page >> references and putting the pages in skb fragments. > > You said in the first email it stops it and in the second "does not". > I am assuming that is some sort of typo since you seem to be implying > it does resolve it for you. Is that correct? For everything except for the last subframe, the function pulls fragments out of the original skb and puts them into a new skb allocated with dev_alloc_skb. Additionally, the last skb is reused for the final subframe. What I meant was: In order to figure out if the skb-reuse is problematic, I prevented it from happening, ensuring that all subframes are allocated and get the fragments from the skb. All I meant to say is that since that led to the same crash, the skb-reuse is not the problem. Regarding the question from your other mail: without GRO there is no crash and it looks stable. - Felix
> > > Which piece did you change? My main interest would be trying to narrow > > down the section of this function that is causing this. Did you modify > > __ieee80211_amsdu_copy or some other function within the setup? > I replaced this line: > bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb); > with: > bool reuse_frag = false; I see. So essentially everything is copied into the new buffers. > > > > I believe the issue must be specific to that codepath, since most > > > > received and processed packets are either not A-MSDU or A-MSDU decap has > > > > already been performed by the hardware. > > > > If I change my test to use 3 client mode interfaces instead of 4, the > > > > hardware is able to offload all A-MSDU rx processing and I don't see any > > > > crashes anymore. > > > > > > > > Could you please take another look at ieee80211_amsdu_to_8023s to see if > > > > there's anything in there that could cause these issues? > > > > The thing is I am not sure it is the only cause for this. I am > > suspecting we are seeing this triggering an issue when combined with > > something else. > > > > If we could add some tracing to dump the skb and list buffers that > > might be helpful. We would want to verify the pp_recycle value, clone > > flag, and for the frags we would want to frag count and page reference > > counts. The expectation would be that the original skb should have the > > pp_recycle flag set and the frag counts consistent through the > > process, and the list skbs should all have taken page references w/ no > > pp_recycle on the skbs in the list. > > > > > Here are clues from a few more tests that I ran: > > > - preventing the reuse of the last skb in ieee80211_amsdu_to_8023s does > > > not prevent the crashes, so the issue is indeed related to taking page > > > references and putting the pages in skb fragments. > > > > You said in the first email it stops it and in the second "does not". > > I am assuming that is some sort of typo since you seem to be implying > > it does resolve it for you. Is that correct? > For everything except for the last subframe, the function pulls > fragments out of the original skb and puts them into a new skb allocated > with dev_alloc_skb. Additionally, the last skb is reused for the final > subframe. What I meant was: In order to figure out if the skb-reuse is > problematic, I prevented it from happening, ensuring that all subframes > are allocated and get the fragments from the skb. > All I meant to say is that since that led to the same crash, the > skb-reuse is not the problem. > > Regarding the question from your other mail: without GRO there is no > crash and it looks stable. > > - Felix > Okay, I think that tells me exactly what is going on. Can you give the change below a try and see if it solves the problem for you. I think what is happening is that after you are reassigning the frags they are getting merged into GRO frames where the head may have pp_recycle set. As a result I think the pages are getting recycled when they should be just freed via put_page. I'm suspecting this wasn't an issue up until now as I don't believe there are any that are running in a mixed mode where they have both pp_recycle and non-pp_recycle skbs coming from the same device. diff --git a/net/core/gro.c b/net/core/gro.c index 506f83d715f8..4bac7ea6e025 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -162,6 +162,15 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) struct sk_buff *lp; int segs; + /* Do not splice page pool based packets w/ non-page pool + * packets. This can result in reference count issues as page + * pool pages will not decrement the reference count and will + * instead be immediately returned to the pool or have frag + * count decremented. + */ + if (p->pp_recycle != skb->pp_recycle) + return -ETOOMANYREFS; + /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ gro_max_size = READ_ONCE(p->dev->gro_max_size);
On 26.01.23 19:38, Alexander H Duyck wrote: > Okay, I think that tells me exactly what is going on. Can you give the > change below a try and see if it solves the problem for you. > > I think what is happening is that after you are reassigning the frags > they are getting merged into GRO frames where the head may have > pp_recycle set. As a result I think the pages are getting recycled when > they should be just freed via put_page. > > I'm suspecting this wasn't an issue up until now as I don't believe > there are any that are running in a mixed mode where they have both > pp_recycle and non-pp_recycle skbs coming from the same device. > > diff --git a/net/core/gro.c b/net/core/gro.c > index 506f83d715f8..4bac7ea6e025 100644 > --- a/net/core/gro.c > +++ b/net/core/gro.c > @@ -162,6 +162,15 @@ int skb_gro_receive(struct sk_buff *p, struct > sk_buff *skb) > struct sk_buff *lp; > int segs; > > + /* Do not splice page pool based packets w/ non-page pool > + * packets. This can result in reference count issues as page > + * pool pages will not decrement the reference count and will > + * instead be immediately returned to the pool or have frag > + * count decremented. > + */ > + if (p->pp_recycle != skb->pp_recycle) > + return -ETOOMANYREFS; > + > /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ > gro_max_size = READ_ONCE(p->dev->gro_max_size); > That works, thanks! - Felix
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9757067c3053..96ec3b19a86d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -125,18 +125,11 @@ struct page { struct page_pool *pp; unsigned long _pp_mapping_pad; unsigned long dma_addr; - union { - /** - * dma_addr_upper: might require a 64-bit - * value on 32-bit architectures. - */ - unsigned long dma_addr_upper; - /** - * For frag page support, not supported in - * 32-bit architectures with 64-bit DMA. - */ - atomic_long_t pp_frag_count; - }; + /** + * dma_addr_upper: might require a 64-bit + * value on 32-bit architectures. + */ + unsigned long dma_addr_upper; }; struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 813c93499f20..28e1fdbdcd53 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -279,14 +279,14 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, static inline void page_pool_fragment_page(struct page *page, long nr) { - atomic_long_set(&page->pp_frag_count, nr); + page_ref_add(page, nr); } static inline long page_pool_defrag_page(struct page *page, long nr) { long ret; - /* If nr == pp_frag_count then we have cleared all remaining + /* If nr == page_ref_count then we have cleared all remaining * references to the page. No need to actually overwrite it, instead * we can leave this to be overwritten by the calling function. * @@ -295,22 +295,14 @@ static inline long page_pool_defrag_page(struct page *page, long nr) * especially when dealing with a page that may be partitioned * into only 2 or 3 pieces. */ - if (atomic_long_read(&page->pp_frag_count) == nr) + if (page_ref_count(page) == nr) return 0; - ret = atomic_long_sub_return(nr, &page->pp_frag_count); + ret = page_ref_sub_return(page, nr); WARN_ON(ret < 0); return ret; } -static inline bool page_pool_is_last_frag(struct page_pool *pool, - struct page *page) -{ - /* If fragments aren't enabled or count is 0 we were the last user */ - return !(pool->p.flags & PP_FLAG_PAGE_FRAG) || - (page_pool_defrag_page(page, 1) == 0); -} - static inline void page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, @@ -320,9 +312,6 @@ static inline void page_pool_put_page(struct page_pool *pool, * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - if (!page_pool_is_last_frag(pool, page)) - return; - page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct); #endif } diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 9b203d8660e4..0defcadae225 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -25,7 +25,7 @@ #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) -#define BIAS_MAX LONG_MAX +#define BIAS_MAX(pool) (PAGE_SIZE << ((pool)->p.order)) #ifdef CONFIG_PAGE_POOL_STATS /* alloc_stat_inc is intended to be used in softirq context */ @@ -619,10 +619,6 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, for (i = 0; i < count; i++) { struct page *page = virt_to_head_page(data[i]); - /* It is not the last user for the page frag case */ - if (!page_pool_is_last_frag(pool, page)) - continue; - page = __page_pool_put_page(pool, page, -1, false); /* Approved for bulk recycling in ptr_ring cache */ if (page) @@ -659,7 +655,7 @@ EXPORT_SYMBOL(page_pool_put_page_bulk); static struct page *page_pool_drain_frag(struct page_pool *pool, struct page *page) { - long drain_count = BIAS_MAX - pool->frag_users; + long drain_count = BIAS_MAX(pool) - pool->frag_users; /* Some user is still using the page frag */ if (likely(page_pool_defrag_page(page, drain_count))) @@ -678,7 +674,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool, static void page_pool_free_frag(struct page_pool *pool) { - long drain_count = BIAS_MAX - pool->frag_users; + long drain_count = BIAS_MAX(pool) - pool->frag_users; struct page *page = pool->frag_page; pool->frag_page = NULL; @@ -724,7 +720,7 @@ struct page *page_pool_alloc_frag(struct page_pool *pool, pool->frag_users = 1; *offset = 0; pool->frag_offset = size; - page_pool_fragment_page(page, BIAS_MAX); + page_pool_fragment_page(page, BIAS_MAX(pool)); return page; }
While testing fragmented page_pool allocation in the mt76 driver, I was able to reliably trigger page refcount underflow issues, which did not occur with full-page page_pool allocation. It appears to me, that handling refcounting in two separate counters (page->pp_frag_count and page refcount) is racy when page refcount gets incremented by code dealing with skb fragments directly, and page_pool_return_skb_page is called multiple times for the same fragment. Dropping page->pp_frag_count and relying entirely on the page refcount makes these underflow issues and crashes go away. Cc: Lorenzo Bianconi <lorenzo@kernel.org> Signed-off-by: Felix Fietkau <nbd@nbd.name> --- include/linux/mm_types.h | 17 +++++------------ include/net/page_pool.h | 19 ++++--------------- net/core/page_pool.c | 12 ++++-------- 3 files changed, 13 insertions(+), 35 deletions(-)