Message ID | 20250401-page-pool-track-dma-v6-2-8b83474870d4@redhat.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Fix late DMA unmap crash for page pool | expand |
On 4/1/25 10:27, Toke Høiland-Jørgensen wrote: ... > Reported-by: Yonglong Liu <liuyonglong@huawei.com> > Closes: https://lore.kernel.org/r/8743264a-9700-4227-a556-5f931c720211@huawei.com > Fixes: ff7d6b27f894 ("page_pool: refurbish version of page_pool code") > Suggested-by: Mina Almasry <almasrymina@google.com> > Reviewed-by: Mina Almasry <almasrymina@google.com> > Reviewed-by: Jesper Dangaard Brouer <hawk@kernel.org> > Tested-by: Jesper Dangaard Brouer <hawk@kernel.org> > Tested-by: Qiuling Ren <qren@redhat.com> > Tested-by: Yuying Ma <yuma@redhat.com> > Tested-by: Yonglong Liu <liuyonglong@huawei.com> > Acked-by: Jesper Dangaard Brouer <hawk@kernel.org> > Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com> I haven't looked into the bit carving, but the rest looks good to me. A few nits below, ... > diff --git a/net/core/page_pool.c b/net/core/page_pool.c > index 7745ad924ae2d801580a6760eba9393e1cf67b01..52b5ddab7ecb405066fd55b8d61abfd4186b9dcf 100644 > --- a/net/core/page_pool.c > +++ b/net/core/page_pool.c > @@ -227,6 +227,8 @@ static int page_pool_init(struct page_pool *pool, > return -EINVAL; > > pool->dma_map = true; > + > + xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); nit: might be better to init/destroy unconditionally, it doesn't allocate any memory. > } > > if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { > @@ -276,9 +278,6 @@ static int page_pool_init(struct page_pool *pool, > /* Driver calling page_pool_create() also call page_pool_destroy() */ > refcount_set(&pool->user_cnt, 1); > > - if (pool->dma_map) > - get_device(pool->p.dev); > - > if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { > netdev_assert_locked(pool->slow.netdev); > rxq = __netif_get_rx_queue(pool->slow.netdev, > @@ -322,7 +321,7 @@ static void page_pool_uninit(struct page_pool *pool) > ptr_ring_cleanup(&pool->ring, NULL); > > if (pool->dma_map) > - put_device(pool->p.dev); > + xa_destroy(&pool->dma_mapped); > > #ifdef CONFIG_PAGE_POOL_STATS > if (!pool->system) > @@ -463,13 +462,21 @@ page_pool_dma_sync_for_device(const struct page_pool *pool, > netmem_ref netmem, > u32 dma_sync_size) > { > - if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) > - __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); > + if (READ_ONCE(pool->dma_sync) && dma_dev_need_sync(pool->p.dev)) { > + rcu_read_lock(); > + /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ > + if (READ_ONCE(pool->dma_sync)) > + __page_pool_dma_sync_for_device(pool, netmem, > + dma_sync_size); > + rcu_read_unlock(); > + } > } > > -static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) > +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) > { > dma_addr_t dma; > + int err; > + u32 id; > > /* Setup DMA mapping: use 'struct page' area for storing DMA-addr > * since dma_addr_t can be either 32 or 64 bits and does not always fit > @@ -483,15 +490,28 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) > if (dma_mapping_error(pool->p.dev, dma)) > return false; > > - if (page_pool_set_dma_addr_netmem(netmem, dma)) > + if (in_softirq()) > + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), > + PP_DMA_INDEX_LIMIT, gfp); > + else > + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), > + PP_DMA_INDEX_LIMIT, gfp); Is it an optimisation? bh disable should be reentrable and could just be xa_alloc_bh(). KERN_{NOTICE,INFO} Maybe? > + if (err) { > + WARN_ONCE(1, "couldn't track DMA mapping, please report to netdev@"); That can happen with enough memory pressure, I don't think it should be a warning. Maybe some pr_info? > goto unmap_failed; > + } > > + if (page_pool_set_dma_addr_netmem(netmem, dma)) { > + WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); > + goto unmap_failed; > + } > + > + netmem_set_dma_index(netmem, id); > page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); > > return true;
Pavel Begunkov <asml.silence@gmail.com> writes: > On 4/1/25 10:27, Toke Høiland-Jørgensen wrote: > ... >> Reported-by: Yonglong Liu <liuyonglong@huawei.com> >> Closes: https://lore.kernel.org/r/8743264a-9700-4227-a556-5f931c720211@huawei.com >> Fixes: ff7d6b27f894 ("page_pool: refurbish version of page_pool code") >> Suggested-by: Mina Almasry <almasrymina@google.com> >> Reviewed-by: Mina Almasry <almasrymina@google.com> >> Reviewed-by: Jesper Dangaard Brouer <hawk@kernel.org> >> Tested-by: Jesper Dangaard Brouer <hawk@kernel.org> >> Tested-by: Qiuling Ren <qren@redhat.com> >> Tested-by: Yuying Ma <yuma@redhat.com> >> Tested-by: Yonglong Liu <liuyonglong@huawei.com> >> Acked-by: Jesper Dangaard Brouer <hawk@kernel.org> >> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com> > > I haven't looked into the bit carving, but the rest looks > good to me. A few nits below, > > ... >> diff --git a/net/core/page_pool.c b/net/core/page_pool.c >> index 7745ad924ae2d801580a6760eba9393e1cf67b01..52b5ddab7ecb405066fd55b8d61abfd4186b9dcf 100644 >> --- a/net/core/page_pool.c >> +++ b/net/core/page_pool.c >> @@ -227,6 +227,8 @@ static int page_pool_init(struct page_pool *pool, >> return -EINVAL; >> >> pool->dma_map = true; >> + >> + xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); > > nit: might be better to init/destroy unconditionally, it doesn't > allocate any memory. Hmm, yeah, suppose both could work; I do think this makes it clearer that it's tied to DMA mapping, but I won't insist. Not sure it's worth respinning just for this, though (see below). >> } >> >> if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { >> @@ -276,9 +278,6 @@ static int page_pool_init(struct page_pool *pool, >> /* Driver calling page_pool_create() also call page_pool_destroy() */ >> refcount_set(&pool->user_cnt, 1); >> >> - if (pool->dma_map) >> - get_device(pool->p.dev); >> - >> if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { >> netdev_assert_locked(pool->slow.netdev); >> rxq = __netif_get_rx_queue(pool->slow.netdev, >> @@ -322,7 +321,7 @@ static void page_pool_uninit(struct page_pool *pool) >> ptr_ring_cleanup(&pool->ring, NULL); >> >> if (pool->dma_map) >> - put_device(pool->p.dev); >> + xa_destroy(&pool->dma_mapped); >> >> #ifdef CONFIG_PAGE_POOL_STATS >> if (!pool->system) >> @@ -463,13 +462,21 @@ page_pool_dma_sync_for_device(const struct page_pool *pool, >> netmem_ref netmem, >> u32 dma_sync_size) >> { >> - if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) >> - __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); >> + if (READ_ONCE(pool->dma_sync) && dma_dev_need_sync(pool->p.dev)) { >> + rcu_read_lock(); >> + /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ >> + if (READ_ONCE(pool->dma_sync)) >> + __page_pool_dma_sync_for_device(pool, netmem, >> + dma_sync_size); >> + rcu_read_unlock(); >> + } >> } >> >> -static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) >> +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) >> { >> dma_addr_t dma; >> + int err; >> + u32 id; >> >> /* Setup DMA mapping: use 'struct page' area for storing DMA-addr >> * since dma_addr_t can be either 32 or 64 bits and does not always fit >> @@ -483,15 +490,28 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) >> if (dma_mapping_error(pool->p.dev, dma)) >> return false; >> >> - if (page_pool_set_dma_addr_netmem(netmem, dma)) >> + if (in_softirq()) >> + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), >> + PP_DMA_INDEX_LIMIT, gfp); >> + else >> + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), >> + PP_DMA_INDEX_LIMIT, gfp); > > Is it an optimisation? bh disable should be reentrable and could > just be xa_alloc_bh(). Yeah, it's an optimisation. We do the same thing in page_pool_recycle_in_ring(), so I just kept the same pattern. > KERN_{NOTICE,INFO} Maybe? Erm? Was this supposed to be part of the comment below? >> + if (err) { >> + WARN_ONCE(1, "couldn't track DMA mapping, please report to netdev@"); > > That can happen with enough memory pressure, I don't think > it should be a warning. Maybe some pr_info? So my reasoning here was that this code is only called in the alloc path, so if we're under memory pressure, the page allocation itself should fail before the xarray alloc does. And if it doesn't (i.e., if the use of xarray itself causes allocation failures), we really want to know about it so we can change things. Hence the loud warning. @maintainers, given the comments above I'm not going to respin for this unless you tell me to, but let me know! :) -Toke
On 4/2/25 12:10, Toke Høiland-Jørgensen wrote: > Pavel Begunkov <asml.silence@gmail.com> writes: > >> On 4/1/25 10:27, Toke Høiland-Jørgensen wrote: >> ... >>> Reported-by: Yonglong Liu <liuyonglong@huawei.com> >>> Closes: https://lore.kernel.org/r/8743264a-9700-4227-a556-5f931c720211@huawei.com >>> Fixes: ff7d6b27f894 ("page_pool: refurbish version of page_pool code") >>> Suggested-by: Mina Almasry <almasrymina@google.com> >>> Reviewed-by: Mina Almasry <almasrymina@google.com> >>> Reviewed-by: Jesper Dangaard Brouer <hawk@kernel.org> >>> Tested-by: Jesper Dangaard Brouer <hawk@kernel.org> >>> Tested-by: Qiuling Ren <qren@redhat.com> >>> Tested-by: Yuying Ma <yuma@redhat.com> >>> Tested-by: Yonglong Liu <liuyonglong@huawei.com> >>> Acked-by: Jesper Dangaard Brouer <hawk@kernel.org> >>> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com> >> >> I haven't looked into the bit carving, but the rest looks >> good to me. A few nits below, >> >> ... >>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c >>> index 7745ad924ae2d801580a6760eba9393e1cf67b01..52b5ddab7ecb405066fd55b8d61abfd4186b9dcf 100644 >>> --- a/net/core/page_pool.c >>> +++ b/net/core/page_pool.c >>> @@ -227,6 +227,8 @@ static int page_pool_init(struct page_pool *pool, >>> return -EINVAL; >>> >>> pool->dma_map = true; >>> + >>> + xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); >> >> nit: might be better to init/destroy unconditionally, it doesn't >> allocate any memory. > > Hmm, yeah, suppose both could work; I do think this makes it clearer > that it's tied to DMA mapping, but I won't insist. Not sure it's worth > respinning just for this, though (see below). That's a somewhat safer way, but yes, I agree, it's not worth of a respin. > >>> } >>> >>> if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { >>> @@ -276,9 +278,6 @@ static int page_pool_init(struct page_pool *pool, >>> /* Driver calling page_pool_create() also call page_pool_destroy() */ >>> refcount_set(&pool->user_cnt, 1); >>> >>> - if (pool->dma_map) >>> - get_device(pool->p.dev); >>> - >>> if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { >>> netdev_assert_locked(pool->slow.netdev); >>> rxq = __netif_get_rx_queue(pool->slow.netdev, >>> @@ -322,7 +321,7 @@ static void page_pool_uninit(struct page_pool *pool) >>> ptr_ring_cleanup(&pool->ring, NULL); >>> >>> if (pool->dma_map) >>> - put_device(pool->p.dev); >>> + xa_destroy(&pool->dma_mapped); >>> >>> #ifdef CONFIG_PAGE_POOL_STATS >>> if (!pool->system) >>> @@ -463,13 +462,21 @@ page_pool_dma_sync_for_device(const struct page_pool *pool, >>> netmem_ref netmem, >>> u32 dma_sync_size) >>> { >>> - if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) >>> - __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); >>> + if (READ_ONCE(pool->dma_sync) && dma_dev_need_sync(pool->p.dev)) { >>> + rcu_read_lock(); >>> + /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ >>> + if (READ_ONCE(pool->dma_sync)) >>> + __page_pool_dma_sync_for_device(pool, netmem, >>> + dma_sync_size); >>> + rcu_read_unlock(); >>> + } >>> } >>> >>> -static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) >>> +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) >>> { >>> dma_addr_t dma; >>> + int err; >>> + u32 id; >>> >>> /* Setup DMA mapping: use 'struct page' area for storing DMA-addr >>> * since dma_addr_t can be either 32 or 64 bits and does not always fit >>> @@ -483,15 +490,28 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) >>> if (dma_mapping_error(pool->p.dev, dma)) >>> return false; >>> >>> - if (page_pool_set_dma_addr_netmem(netmem, dma)) >>> + if (in_softirq()) >>> + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), >>> + PP_DMA_INDEX_LIMIT, gfp); >>> + else >>> + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), >>> + PP_DMA_INDEX_LIMIT, gfp); >> >> Is it an optimisation? bh disable should be reentrable and could >> just be xa_alloc_bh(). > > Yeah, it's an optimisation. We do the same thing in > page_pool_recycle_in_ring(), so I just kept the same pattern. Got it > >> KERN_{NOTICE,INFO} Maybe? > > Erm? Was this supposed to be part of the comment below? Oops, yes, that's for the warning below >>> + if (err) { >>> + WARN_ONCE(1, "couldn't track DMA mapping, please report to netdev@"); >> >> That can happen with enough memory pressure, I don't think >> it should be a warning. Maybe some pr_info? > > So my reasoning here was that this code is only called in the alloc > path, so if we're under memory pressure, the page allocation itself > should fail before the xarray alloc does. And if it doesn't (i.e., if > the use of xarray itself causes allocation failures), we really want to > know about it so we can change things. Hence the loud warning. There is a gap between allocations, one doesn't guarantee another. I'd say the mental test here is whether we can reasonably cause it from user space (including by abusive users), because crash on warning setups exist, and it'll let you know about itself too loudly, when it could've been tolerated just fine. Not going to insist though. > @maintainers, given the comments above I'm not going to respin for this > unless you tell me to, but let me know! :) > > -Toke >
Pavel Begunkov <asml.silence@gmail.com> writes: >>>> + if (err) { >>>> + WARN_ONCE(1, "couldn't track DMA mapping, please report to netdev@"); >>> >>> That can happen with enough memory pressure, I don't think >>> it should be a warning. Maybe some pr_info? >> >> So my reasoning here was that this code is only called in the alloc >> path, so if we're under memory pressure, the page allocation itself >> should fail before the xarray alloc does. And if it doesn't (i.e., if >> the use of xarray itself causes allocation failures), we really want to >> know about it so we can change things. Hence the loud warning. > > There is a gap between allocations, one doesn't guarantee > another. I'd say the mental test here is whether we can reasonably > cause it from user space (including by abusive users), because crash > on warning setups exist, and it'll let you know about itself too > loudly, when it could've been tolerated just fine. Not going to > insist though. Right, I do see what you mean - it's not guaranteed to be coupled. However, my feeling is nonetheless that it's better for this to be loud to weed out any new issues that may arise from this, so I'm inclined to keep it as-is. -Toke
diff --git a/include/linux/poison.h b/include/linux/poison.h index 331a9a996fa8746626afa63ea462b85ca3e5938b..5351efd710d5e21cc341f7bb533b1aeea4a0808a 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -70,6 +70,10 @@ #define KEY_DESTROY 0xbd /********** net/core/page_pool.c **********/ +/* + * page_pool uses additional free bits within this value to store data, see the + * definition of PP_DMA_INDEX_MASK in include/net/page_pool/types.h + */ #define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) /********** net/core/skbuff.c **********/ diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index df0d3c1608929605224feb26173135ff37951ef8..5835d359ecd0ac75dd737736926914ef7dd60646 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -6,6 +6,7 @@ #include <linux/dma-direction.h> #include <linux/ptr_ring.h> #include <linux/types.h> +#include <linux/xarray.h> #include <net/netmem.h> #define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA @@ -54,13 +55,51 @@ struct pp_alloc_cache { netmem_ref cache[PP_ALLOC_CACHE_SIZE]; }; +/* + * DMA mapping IDs + * + * When DMA-mapping a page, we allocate an ID (from an xarray) and stash this in + * the upper bits of page->pp_magic. We always want to be able to unambiguously + * identify page pool pages (using page_pool_page_is_pp()). Non-PP pages can + * have arbitrary kernel pointers stored in the same field as pp_magic (since it + * overlaps with page->lru.next), so we must ensure that we cannot mistake a + * valid kernel pointer with any of the values we write into this field. + * + * On architectures that set POISON_POINTER_DELTA, this is already ensured, + * since this value becomes part of PP_SIGNATURE; meaning we can just use the + * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the + * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is + * 0, we make sure that we leave the two topmost bits empty, as that guarantees + * we won't mistake a valid kernel pointer for a value we set, regardless of the + * VMSPLIT setting. + * + * Altogether, this means that the number of bits available is constrained by + * the size of an unsigned long (at the upper end, subtracting two bits per the + * above), and the definition of PP_SIGNATURE (with or without + * POISON_POINTER_DELTA). + */ +#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) +#if POISON_POINTER_DELTA > 0 +/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA + * index to not overlap with that if set + */ +#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) +#else +/* Always leave out the topmost two; see above. */ +#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2) +#endif + +#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ + PP_DMA_INDEX_SHIFT) +#define PP_DMA_INDEX_LIMIT XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1) + /* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for - * the head page of compound page and bit 1 for pfmemalloc page. - * page_is_pfmemalloc() is checked in __page_pool_put_page() to avoid recycling - * the pfmemalloc page. + * the head page of compound page and bit 1 for pfmemalloc page, as well as the + * bits used for the DMA index. page_is_pfmemalloc() is checked in + * __page_pool_put_page() to avoid recycling the pfmemalloc page. */ -#define PP_MAGIC_MASK ~0x3UL +#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) /** * struct page_pool_params - page pool parameters @@ -173,10 +212,10 @@ struct page_pool { int cpuid; u32 pages_state_hold_cnt; - bool has_init_callback:1; /* slow::init_callback is set */ + bool dma_sync; /* Perform DMA sync for device */ bool dma_map:1; /* Perform DMA mapping */ - bool dma_sync:1; /* Perform DMA sync for device */ bool dma_sync_for_cpu:1; /* Perform DMA sync for cpu */ + bool has_init_callback:1; /* slow::init_callback is set */ #ifdef CONFIG_PAGE_POOL_STATS bool system:1; /* This is a global percpu pool */ #endif @@ -229,6 +268,8 @@ struct page_pool { void *mp_priv; const struct memory_provider_ops *mp_ops; + struct xarray dma_mapped; + #ifdef CONFIG_PAGE_POOL_STATS /* recycle stats are per-cpu to avoid locking */ struct page_pool_recycle_stats __percpu *recycle_stats; diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index f33162fd281c23e109273ba09950c5d0a2829bc9..cd95394399b40c3604934ba7898eeeeacb8aee99 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -5,7 +5,7 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) { - return __netmem_clear_lsb(netmem)->pp_magic; + return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) @@ -15,6 +15,8 @@ static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) static inline void netmem_clear_pp_magic(netmem_ref netmem) { + WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK); + __netmem_clear_lsb(netmem)->pp_magic = 0; } @@ -33,4 +35,28 @@ static inline void netmem_set_dma_addr(netmem_ref netmem, { __netmem_clear_lsb(netmem)->dma_addr = dma_addr; } + +static inline unsigned long netmem_get_dma_index(netmem_ref netmem) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return 0; + + magic = __netmem_clear_lsb(netmem)->pp_magic; + + return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT; +} + +static inline void netmem_set_dma_index(netmem_ref netmem, + unsigned long id) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return; + + magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT); + __netmem_clear_lsb(netmem)->pp_magic = magic; +} #endif diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 7745ad924ae2d801580a6760eba9393e1cf67b01..52b5ddab7ecb405066fd55b8d61abfd4186b9dcf 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -227,6 +227,8 @@ static int page_pool_init(struct page_pool *pool, return -EINVAL; pool->dma_map = true; + + xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); } if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { @@ -276,9 +278,6 @@ static int page_pool_init(struct page_pool *pool, /* Driver calling page_pool_create() also call page_pool_destroy() */ refcount_set(&pool->user_cnt, 1); - if (pool->dma_map) - get_device(pool->p.dev); - if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { netdev_assert_locked(pool->slow.netdev); rxq = __netif_get_rx_queue(pool->slow.netdev, @@ -322,7 +321,7 @@ static void page_pool_uninit(struct page_pool *pool) ptr_ring_cleanup(&pool->ring, NULL); if (pool->dma_map) - put_device(pool->p.dev); + xa_destroy(&pool->dma_mapped); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) @@ -463,13 +462,21 @@ page_pool_dma_sync_for_device(const struct page_pool *pool, netmem_ref netmem, u32 dma_sync_size) { - if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) - __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); + if (READ_ONCE(pool->dma_sync) && dma_dev_need_sync(pool->p.dev)) { + rcu_read_lock(); + /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ + if (READ_ONCE(pool->dma_sync)) + __page_pool_dma_sync_for_device(pool, netmem, + dma_sync_size); + rcu_read_unlock(); + } } -static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; + int err; + u32 id; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit @@ -483,15 +490,28 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) if (dma_mapping_error(pool->p.dev, dma)) return false; - if (page_pool_set_dma_addr_netmem(netmem, dma)) + if (in_softirq()) + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + else + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + if (err) { + WARN_ONCE(1, "couldn't track DMA mapping, please report to netdev@"); goto unmap_failed; + } + if (page_pool_set_dma_addr_netmem(netmem, dma)) { + WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); + goto unmap_failed; + } + + netmem_set_dma_index(netmem, id); page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; unmap_failed: - WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); @@ -508,7 +528,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, if (unlikely(!page)) return NULL; - if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) { + if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { put_page(page); return NULL; } @@ -554,7 +574,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, */ for (i = 0; i < nr_pages; i++) { netmem = pool->alloc.cache[i]; - if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) { + if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { put_page(netmem_to_page(netmem)); continue; } @@ -656,6 +676,8 @@ void page_pool_clear_pp_info(netmem_ref netmem) static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, netmem_ref netmem) { + struct page *old, *page = netmem_to_page(netmem); + unsigned long id; dma_addr_t dma; if (!pool->dma_map) @@ -664,6 +686,17 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, */ return; + id = netmem_get_dma_index(netmem); + if (!id) + return; + + if (in_softirq()) + old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); + else + old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); + if (old != page) + return; + dma = page_pool_get_dma_addr_netmem(netmem); /* When page is unmapped, it cannot be returned to our pool */ @@ -671,6 +704,7 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr_netmem(netmem, 0); + netmem_set_dma_index(netmem, 0); } /* Disconnects a page (from a page_pool). API users can have a need @@ -1080,8 +1114,32 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool) static void page_pool_scrub(struct page_pool *pool) { + unsigned long id; + void *ptr; + page_pool_empty_alloc_cache_once(pool); - pool->destroy_cnt++; + if (!pool->destroy_cnt++ && pool->dma_map) { + if (pool->dma_sync) { + /* paired with READ_ONCE in + * page_pool_dma_sync_for_device() and + * __page_pool_dma_sync_for_cpu() + */ + WRITE_ONCE(pool->dma_sync, false); + + /* Make sure all concurrent returns that may see the old + * value of dma_sync (and thus perform a sync) have + * finished before doing the unmapping below. Skip the + * wait if the device doesn't actually need syncing, or + * if there are no outstanding mapped pages. + */ + if (dma_dev_need_sync(pool->p.dev) && + !xa_empty(&pool->dma_mapped)) + synchronize_net(); + } + + xa_for_each(&pool->dma_mapped, id, ptr) + __page_pool_release_page_dma(pool, page_to_netmem(ptr)); + } /* No more consumers should exist, but producers could still * be in-flight.