Message ID | 20250217123127.3674033-1-linyunsheng@huawei.com (mailing list archive) |
---|---|
State | Handled Elsewhere |
Series | [RFC] mm: alloc_pages_bulk: remove assumption of populating only NULL elements |
On Mon, 2025-02-17 at 20:31 +0800, Yunsheng Lin wrote: > As mentioned in [1], it seems odd to check NULL elements in > the middle of page bulk allocating, and it seems caller can > do a better job of bulk allocating pages into a whole array > sequentially without checking NULL elements first before > doing the page bulk allocation. > > Remove the above checking also enable the caller to not > zero the array before calling the page bulk allocating API, > which has about 1~2 ns performance improvement for the test > case of time_bench_page_pool03_slow() for page_pool in a > x86 vm system, this reduces some performance impact of > fixing the DMA API misuse problem in [2], performance > improves from 87.886 ns to 86.429 ns. > > 1. https://lore.kernel.org/all/bd8c2f5c-464d-44ab-b607-390a87ea4cd5@huawei.com/ > 2. https://lore.kernel.org/all/20250212092552.1779679-1-linyunsheng@huawei.com/ > CC: Jesper Dangaard Brouer <hawk@kernel.org> > CC: Luiz Capitulino <luizcap@redhat.com> > CC: Mel Gorman <mgorman@techsingularity.net> > Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com> > --- > drivers/vfio/pci/virtio/migrate.c | 2 -- > fs/btrfs/extent_io.c | 8 +++++--- > fs/erofs/zutil.c | 12 ++++++------ > fs/xfs/xfs_buf.c | 9 +++++---- > mm/page_alloc.c | 32 +++++-------------------------- > net/core/page_pool.c | 3 --- > net/sunrpc/svc_xprt.c | 9 +++++---- > 7 files changed, 26 insertions(+), 49 deletions(-) > > diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c > index ba92bb4e9af9..9f003a237dec 100644 > --- a/drivers/vfio/pci/virtio/migrate.c > +++ b/drivers/vfio/pci/virtio/migrate.c > @@ -91,8 +91,6 @@ static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf, > if (ret) > goto err_append; > buf->allocated_length += filled * PAGE_SIZE; > - /* clean input for another bulk allocation */ > - memset(page_list, 0, filled * sizeof(*page_list)); > to_fill = min_t(unsigned int, to_alloc, > PAGE_SIZE / sizeof(*page_list)); > } while (to_alloc > 0); > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c > index b2fae67f8fa3..d0466d795cca 100644 > --- a/fs/btrfs/extent_io.c > +++ b/fs/btrfs/extent_io.c > @@ -626,10 +626,12 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, > unsigned int allocated; > > for (allocated = 0; allocated < nr_pages;) { > - unsigned int last = allocated; > + unsigned int new_allocated; > > - allocated = alloc_pages_bulk(gfp, nr_pages, page_array); > - if (unlikely(allocated == last)) { > + new_allocated = alloc_pages_bulk(gfp, nr_pages - allocated, > + page_array + allocated); > + allocated += new_allocated; > + if (unlikely(!new_allocated)) { > /* No progress, fail and do cleanup. 
*/ > for (int i = 0; i < allocated; i++) { > __free_page(page_array[i]); > diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c > index 55ff2ab5128e..1c50b5e27371 100644 > --- a/fs/erofs/zutil.c > +++ b/fs/erofs/zutil.c > @@ -85,13 +85,13 @@ int z_erofs_gbuf_growsize(unsigned int nrpages) > > for (j = 0; j < gbuf->nrpages; ++j) > tmp_pages[j] = gbuf->pages[j]; > - do { > - last = j; > - j = alloc_pages_bulk(GFP_KERNEL, nrpages, > - tmp_pages); > - if (last == j) > + > + for (last = j; last < nrpages; last += j) { > + j = alloc_pages_bulk(GFP_KERNEL, nrpages - last, > + tmp_pages + last); > + if (!j) > goto out; > - } while (j != nrpages); > + } > > ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL); > if (!ptr) > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c > index 15bb790359f8..9e1ce0ab9c35 100644 > --- a/fs/xfs/xfs_buf.c > +++ b/fs/xfs/xfs_buf.c > @@ -377,16 +377,17 @@ xfs_buf_alloc_pages( > * least one extra page. > */ > for (;;) { > - long last = filled; > + long alloc; > > - filled = alloc_pages_bulk(gfp_mask, bp->b_page_count, > - bp->b_pages); > + alloc = alloc_pages_bulk(gfp_mask, bp->b_page_count - refill, > + bp->b_pages + refill); > + refill += alloc; > if (filled == bp->b_page_count) { > XFS_STATS_INC(bp->b_mount, xb_page_found); > break; > } > > - if (filled != last) > + if (alloc) > continue; > > if (flags & XBF_READ_AHEAD) { > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 579789600a3c..e0309532b6c4 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -4541,9 +4541,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, > * This is a batched version of the page allocator that attempts to > * allocate nr_pages quickly. Pages are added to the page_array. > * > - * Note that only NULL elements are populated with pages and nr_pages > - * is the maximum number of pages that will be stored in the array. > - * > * Returns the number of pages in the array. > */ > unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, > @@ -4559,29 +4556,18 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, > struct alloc_context ac; > gfp_t alloc_gfp; > unsigned int alloc_flags = ALLOC_WMARK_LOW; > - int nr_populated = 0, nr_account = 0; > - > - /* > - * Skip populated array elements to determine if any pages need > - * to be allocated before disabling IRQs. > - */ > - while (nr_populated < nr_pages && page_array[nr_populated]) > - nr_populated++; > + int nr_populated = 0; > > /* No pages requested? */ > if (unlikely(nr_pages <= 0)) > goto out; > > - /* Already populated array? */ > - if (unlikely(nr_pages - nr_populated == 0)) > - goto out; > - > /* Bulk allocator does not support memcg accounting. */ > if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT)) > goto failed; > > /* Use the single page allocator for one page. 
*/ > - if (nr_pages - nr_populated == 1) > + if (nr_pages == 1) > goto failed; > > #ifdef CONFIG_PAGE_OWNER > @@ -4653,24 +4639,16 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, > /* Attempt the batch allocation */ > pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; > while (nr_populated < nr_pages) { > - > - /* Skip existing pages */ > - if (page_array[nr_populated]) { > - nr_populated++; > - continue; > - } > - > page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, > pcp, pcp_list); > if (unlikely(!page)) { > /* Try and allocate at least one page */ > - if (!nr_account) { > + if (!nr_populated) { > pcp_spin_unlock(pcp); > goto failed_irq; > } > break; > } > - nr_account++; > > prep_new_page(page, 0, gfp, 0); > set_page_refcounted(page); > @@ -4680,8 +4658,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, > pcp_spin_unlock(pcp); > pcp_trylock_finish(UP_flags); > > - __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); > - zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account); > + __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_populated); > + zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_populated); > > out: > return nr_populated; > diff --git a/net/core/page_pool.c b/net/core/page_pool.c > index f5e908c9e7ad..ae9e8c78e4bb 100644 > --- a/net/core/page_pool.c > +++ b/net/core/page_pool.c > @@ -536,9 +536,6 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, > if (unlikely(pool->alloc.count > 0)) > return pool->alloc.cache[--pool->alloc.count]; > > - /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ > - memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); > - > nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, > (struct page **)pool->alloc.cache); > if (unlikely(!nr_pages)) > diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c > index ae25405d8bd2..6321a4d2f2be 100644 > --- a/net/sunrpc/svc_xprt.c > +++ b/net/sunrpc/svc_xprt.c > @@ -663,9 +663,10 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) > pages = RPCSVC_MAXPAGES; > } > > - for (filled = 0; filled < pages; filled = ret) { > - ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages); > - if (ret > filled) > + for (filled = 0; filled < pages; filled += ret) { > + ret = alloc_pages_bulk(GFP_KERNEL, pages - filled, > + rqstp->rq_pages + filled); > + if (ret) > /* Made progress, don't sleep yet */ > continue; > > @@ -674,7 +675,7 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) > set_current_state(TASK_RUNNING); > return false; > } > - trace_svc_alloc_arg_err(pages, ret); > + trace_svc_alloc_arg_err(pages, filled); > memalloc_retry_wait(GFP_KERNEL); > } > rqstp->rq_page_end = &rqstp->rq_pages[pages]; I agree that the API is cumbersome and weird as it is today. For the sunrpc parts: Acked-by: Jeff Layton <jlayton@kernel.org>
On 2/17/25 7:31 AM, Yunsheng Lin wrote: > As mentioned in [1], it seems odd to check NULL elements in > the middle of page bulk allocating, I think I requested that check to be added to the bulk page allocator. When sending an RPC reply, NFSD might release pages in the middle of the rq_pages array, marking each of those array entries with a NULL pointer. We want to ensure that the array is refilled completely in this case. > and it seems caller can > do a better job of bulk allocating pages into a whole array > sequentially without checking NULL elements first before > doing the page bulk allocation. > > Remove the above checking also enable the caller to not > zero the array before calling the page bulk allocating API, > which has about 1~2 ns performance improvement for the test > case of time_bench_page_pool03_slow() for page_pool in a > x86 vm system, this reduces some performance impact of > fixing the DMA API misuse problem in [2], performance > improves from 87.886 ns to 86.429 ns. > > 1. https://lore.kernel.org/all/bd8c2f5c-464d-44ab-b607-390a87ea4cd5@huawei.com/ > 2. https://lore.kernel.org/all/20250212092552.1779679-1-linyunsheng@huawei.com/ > CC: Jesper Dangaard Brouer <hawk@kernel.org> > CC: Luiz Capitulino <luizcap@redhat.com> > CC: Mel Gorman <mgorman@techsingularity.net> > Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com> > --- > drivers/vfio/pci/virtio/migrate.c | 2 -- > fs/btrfs/extent_io.c | 8 +++++--- > fs/erofs/zutil.c | 12 ++++++------ > fs/xfs/xfs_buf.c | 9 +++++---- > mm/page_alloc.c | 32 +++++-------------------------- > net/core/page_pool.c | 3 --- > net/sunrpc/svc_xprt.c | 9 +++++---- > 7 files changed, 26 insertions(+), 49 deletions(-) > > diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c > index ba92bb4e9af9..9f003a237dec 100644 > --- a/drivers/vfio/pci/virtio/migrate.c > +++ b/drivers/vfio/pci/virtio/migrate.c > @@ -91,8 +91,6 @@ static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf, > if (ret) > goto err_append; > buf->allocated_length += filled * PAGE_SIZE; > - /* clean input for another bulk allocation */ > - memset(page_list, 0, filled * sizeof(*page_list)); > to_fill = min_t(unsigned int, to_alloc, > PAGE_SIZE / sizeof(*page_list)); > } while (to_alloc > 0); > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c > index b2fae67f8fa3..d0466d795cca 100644 > --- a/fs/btrfs/extent_io.c > +++ b/fs/btrfs/extent_io.c > @@ -626,10 +626,12 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, > unsigned int allocated; > > for (allocated = 0; allocated < nr_pages;) { > - unsigned int last = allocated; > + unsigned int new_allocated; > > - allocated = alloc_pages_bulk(gfp, nr_pages, page_array); > - if (unlikely(allocated == last)) { > + new_allocated = alloc_pages_bulk(gfp, nr_pages - allocated, > + page_array + allocated); > + allocated += new_allocated; > + if (unlikely(!new_allocated)) { > /* No progress, fail and do cleanup. 
*/ > for (int i = 0; i < allocated; i++) { > __free_page(page_array[i]); > diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c > index 55ff2ab5128e..1c50b5e27371 100644 > --- a/fs/erofs/zutil.c > +++ b/fs/erofs/zutil.c > @@ -85,13 +85,13 @@ int z_erofs_gbuf_growsize(unsigned int nrpages) > > for (j = 0; j < gbuf->nrpages; ++j) > tmp_pages[j] = gbuf->pages[j]; > - do { > - last = j; > - j = alloc_pages_bulk(GFP_KERNEL, nrpages, > - tmp_pages); > - if (last == j) > + > + for (last = j; last < nrpages; last += j) { > + j = alloc_pages_bulk(GFP_KERNEL, nrpages - last, > + tmp_pages + last); > + if (!j) > goto out; > - } while (j != nrpages); > + } > > ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL); > if (!ptr) > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c > index 15bb790359f8..9e1ce0ab9c35 100644 > --- a/fs/xfs/xfs_buf.c > +++ b/fs/xfs/xfs_buf.c > @@ -377,16 +377,17 @@ xfs_buf_alloc_pages( > * least one extra page. > */ > for (;;) { > - long last = filled; > + long alloc; > > - filled = alloc_pages_bulk(gfp_mask, bp->b_page_count, > - bp->b_pages); > + alloc = alloc_pages_bulk(gfp_mask, bp->b_page_count - refill, > + bp->b_pages + refill); > + refill += alloc; > if (filled == bp->b_page_count) { > XFS_STATS_INC(bp->b_mount, xb_page_found); > break; > } > > - if (filled != last) > + if (alloc) > continue; > > if (flags & XBF_READ_AHEAD) { > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 579789600a3c..e0309532b6c4 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -4541,9 +4541,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, > * This is a batched version of the page allocator that attempts to > * allocate nr_pages quickly. Pages are added to the page_array. > * > - * Note that only NULL elements are populated with pages and nr_pages > - * is the maximum number of pages that will be stored in the array. > - * > * Returns the number of pages in the array. > */ > unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, > @@ -4559,29 +4556,18 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, > struct alloc_context ac; > gfp_t alloc_gfp; > unsigned int alloc_flags = ALLOC_WMARK_LOW; > - int nr_populated = 0, nr_account = 0; > - > - /* > - * Skip populated array elements to determine if any pages need > - * to be allocated before disabling IRQs. > - */ > - while (nr_populated < nr_pages && page_array[nr_populated]) > - nr_populated++; > + int nr_populated = 0; > > /* No pages requested? */ > if (unlikely(nr_pages <= 0)) > goto out; > > - /* Already populated array? */ > - if (unlikely(nr_pages - nr_populated == 0)) > - goto out; > - > /* Bulk allocator does not support memcg accounting. */ > if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT)) > goto failed; > > /* Use the single page allocator for one page. 
*/ > - if (nr_pages - nr_populated == 1) > + if (nr_pages == 1) > goto failed; > > #ifdef CONFIG_PAGE_OWNER > @@ -4653,24 +4639,16 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, > /* Attempt the batch allocation */ > pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; > while (nr_populated < nr_pages) { > - > - /* Skip existing pages */ > - if (page_array[nr_populated]) { > - nr_populated++; > - continue; > - } > - > page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, > pcp, pcp_list); > if (unlikely(!page)) { > /* Try and allocate at least one page */ > - if (!nr_account) { > + if (!nr_populated) { > pcp_spin_unlock(pcp); > goto failed_irq; > } > break; > } > - nr_account++; > > prep_new_page(page, 0, gfp, 0); > set_page_refcounted(page); > @@ -4680,8 +4658,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, > pcp_spin_unlock(pcp); > pcp_trylock_finish(UP_flags); > > - __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); > - zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account); > + __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_populated); > + zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_populated); > > out: > return nr_populated; > diff --git a/net/core/page_pool.c b/net/core/page_pool.c > index f5e908c9e7ad..ae9e8c78e4bb 100644 > --- a/net/core/page_pool.c > +++ b/net/core/page_pool.c > @@ -536,9 +536,6 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, > if (unlikely(pool->alloc.count > 0)) > return pool->alloc.cache[--pool->alloc.count]; > > - /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ > - memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); > - > nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, > (struct page **)pool->alloc.cache); > if (unlikely(!nr_pages)) > diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c > index ae25405d8bd2..6321a4d2f2be 100644 > --- a/net/sunrpc/svc_xprt.c > +++ b/net/sunrpc/svc_xprt.c > @@ -663,9 +663,10 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) > pages = RPCSVC_MAXPAGES; > } > > - for (filled = 0; filled < pages; filled = ret) { > - ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages); > - if (ret > filled) > + for (filled = 0; filled < pages; filled += ret) { > + ret = alloc_pages_bulk(GFP_KERNEL, pages - filled, > + rqstp->rq_pages + filled); > + if (ret) > /* Made progress, don't sleep yet */ > continue; > > @@ -674,7 +675,7 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) > set_current_state(TASK_RUNNING); > return false; > } > - trace_svc_alloc_arg_err(pages, ret); > + trace_svc_alloc_arg_err(pages, filled); > memalloc_retry_wait(GFP_KERNEL); > } > rqstp->rq_page_end = &rqstp->rq_pages[pages];
On Mon, Feb 17, 2025 at 08:31:23PM +0800, Yunsheng Lin wrote: > As mentioned in [1], it seems odd to check NULL elements in > the middle of page bulk allocating, and it seems caller can > do a better job of bulk allocating pages into a whole array > sequentially without checking NULL elements first before > doing the page bulk allocation. .... IMO, the new API is a poor one, and you've demonstrated it clearly in this patch. ..... > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c > index 15bb790359f8..9e1ce0ab9c35 100644 > --- a/fs/xfs/xfs_buf.c > +++ b/fs/xfs/xfs_buf.c > @@ -377,16 +377,17 @@ xfs_buf_alloc_pages( > * least one extra page. > */ > for (;;) { > - long last = filled; > + long alloc; > > - filled = alloc_pages_bulk(gfp_mask, bp->b_page_count, > - bp->b_pages); > + alloc = alloc_pages_bulk(gfp_mask, bp->b_page_count - refill, > + bp->b_pages + refill); > + refill += alloc; > if (filled == bp->b_page_count) { > XFS_STATS_INC(bp->b_mount, xb_page_found); > break; > } > > - if (filled != last) > + if (alloc) > continue; You didn't even compile this code - refill is not defined anywhere. Even if it did complile, you clearly didn't test it. The logic is broken (what updates filled?) and will result in the first allocation attempt succeeding and then falling into an endless retry loop. i.e. you stepped on the API landmine of your own creation where it is impossible to tell the difference between alloc_pages_bulk() returning "memory allocation failed, you need to retry" and it returning "array is full, nothing more to allocate". Both these cases now return 0. The existing code returns nr_populated in both cases, so it doesn't matter why alloc_pages_bulk() returns with nr_populated != full, it is very clear that we still need to allocate more memory to fill it. The whole point of the existing API is to prevent callers from making stupid, hard to spot logic mistakes like this. Forcing callers to track both empty slots and how full the array is itself, whilst also constraining where in the array empty slots can occur greatly reduces both the safety and functionality that alloc_pages_bulk() provides. Anyone that has code that wants to steal a random page from the array and then refill it now has a heap more complex code to add to their allocator wrapper. IOWs, you just demonstrated why the existing API is more desirable than a highly constrained, slightly faster API that requires callers to get every detail right. i.e. it's hard to get it wrong with the existing API, yet it's so easy to make mistakes with the proposed API that the patch proposing the change has serious bugs in it. -Dave.
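To make the landmine concrete, here is a caller-side sketch of the two return-value conventions in question (illustrative only; 'want', 'filled' and 'alloc' are stand-in names, not code from the patch):

        /* Existing semantics: calling with an already-full array is harmless;
         * the return value is the total number of populated slots, so it
         * simply reports "full". */
        filled = alloc_pages_bulk(gfp, want, pages);    /* returns want */

        /* Proposed semantics: the same situation asks for zero pages and gets
         * 0 back -- the same value as an outright allocation failure -- so the
         * caller has to track fullness itself to tell the two cases apart. */
        alloc = alloc_pages_bulk(gfp, want - filled, pages + filled);  /* 0 */
        filled += alloc;
        if (!alloc)
                /* allocation failure ... or a caller bookkeeping bug? */;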
On 2025/2/17 22:20, Chuck Lever wrote: > On 2/17/25 7:31 AM, Yunsheng Lin wrote: >> As mentioned in [1], it seems odd to check NULL elements in >> the middle of page bulk allocating, > > I think I requested that check to be added to the bulk page allocator. > > When sending an RPC reply, NFSD might release pages in the middle of It seems there is no usage of the page bulk allocation API in fs/nfsd/ or fs/nfs/, which specific fs the above 'NFSD' is referring to? > the rq_pages array, marking each of those array entries with a NULL > pointer. We want to ensure that the array is refilled completely in this > case. > I did some researching, it seems you requested that in [1]? It seems the 'holes are always at the start' for the case in that discussion too, I am not sure if the case is referring to the caller in net/sunrpc/svc_xprt.c? If yes, it seems caller can do a better job of bulk allocating pages into a whole array sequentially without checking NULL elements first before doing the page bulk allocation as something below: +++ b/net/sunrpc/svc_xprt.c @@ -663,9 +663,10 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) pages = RPCSVC_MAXPAGES; } - for (filled = 0; filled < pages; filled = ret) { - ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages); - if (ret > filled) + for (filled = 0; filled < pages; filled += ret) { + ret = alloc_pages_bulk(GFP_KERNEL, pages - filled, + rqstp->rq_pages + filled); + if (ret) /* Made progress, don't sleep yet */ continue; @@ -674,7 +675,7 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) set_current_state(TASK_RUNNING); return false; } - trace_svc_alloc_arg_err(pages, ret); + trace_svc_alloc_arg_err(pages, filled); memalloc_retry_wait(GFP_KERNEL); } rqstp->rq_page_end = &rqstp->rq_pages[pages]; 1. https://lkml.iu.edu/hypermail/linux/kernel/2103.2/09060.html
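For context, the behaviour being discussed can be sketched as follows (illustrative only): the current API refills NULL slots wherever they sit in the array, while the proposed sequential API fills only a contiguous range and leaves hole handling to the caller, for example with plain alloc_page():

        /* current API: NULL slots anywhere in rq_pages are refilled in place */
        alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages);

        /* proposed API fills only a contiguous tail, so holes in the middle
         * would need caller-side handling along these lines: */
        for (i = 0; i < pages; i++) {
                if (!rqstp->rq_pages[i]) {
                        rqstp->rq_pages[i] = alloc_page(GFP_KERNEL);
                        if (!rqstp->rq_pages[i])
                                break;  /* back off and retry as before */
                }
        }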
On 2025/2/18 5:31, Dave Chinner wrote: ... > ..... > >> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c >> index 15bb790359f8..9e1ce0ab9c35 100644 >> --- a/fs/xfs/xfs_buf.c >> +++ b/fs/xfs/xfs_buf.c >> @@ -377,16 +377,17 @@ xfs_buf_alloc_pages( >> * least one extra page. >> */ >> for (;;) { >> - long last = filled; >> + long alloc; >> >> - filled = alloc_pages_bulk(gfp_mask, bp->b_page_count, >> - bp->b_pages); >> + alloc = alloc_pages_bulk(gfp_mask, bp->b_page_count - refill, >> + bp->b_pages + refill); >> + refill += alloc; >> if (filled == bp->b_page_count) { >> XFS_STATS_INC(bp->b_mount, xb_page_found); >> break; >> } >> >> - if (filled != last) >> + if (alloc) >> continue; > > You didn't even compile this code - refill is not defined > anywhere. > > Even if it did complile, you clearly didn't test it. The logic is > broken (what updates filled?) and will result in the first > allocation attempt succeeding and then falling into an endless retry > loop. Ah, the 'refill' is a typo, it should be 'filled' instead of 'refill'. The below should fix the compile error: --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -379,9 +379,9 @@ xfs_buf_alloc_pages( for (;;) { long alloc; - alloc = alloc_pages_bulk(gfp_mask, bp->b_page_count - refill, - bp->b_pages + refill); - refill += alloc; + alloc = alloc_pages_bulk(gfp_mask, bp->b_page_count - filled, + bp->b_pages + filled); + filled += alloc; if (filled == bp->b_page_count) { XFS_STATS_INC(bp->b_mount, xb_page_found); break; > > i.e. you stepped on the API landmine of your own creation where > it is impossible to tell the difference between alloc_pages_bulk() > returning "memory allocation failed, you need to retry" and > it returning "array is full, nothing more to allocate". Both these > cases now return 0. As my understanding, alloc_pages_bulk() will not be called when "array is full" as the above 'filled == bp->b_page_count' checking has ensured that if the array is not passed in with holes in the middle for xfs. > > The existing code returns nr_populated in both cases, so it doesn't > matter why alloc_pages_bulk() returns with nr_populated != full, it > is very clear that we still need to allocate more memory to fill it. I am not sure if the array will be passed in with holes in the middle for the xfs fs as mentioned above, if not, it seems to be a typical use case like the one in mempolicy.c as below: https://elixir.bootlin.com/linux/v6.14-rc1/source/mm/mempolicy.c#L2525 > > The whole point of the existing API is to prevent callers from > making stupid, hard to spot logic mistakes like this. Forcing > callers to track both empty slots and how full the array is itself, > whilst also constraining where in the array empty slots can occur > greatly reduces both the safety and functionality that > alloc_pages_bulk() provides. Anyone that has code that wants to > steal a random page from the array and then refill it now has a heap > more complex code to add to their allocator wrapper. Yes, I am agreed that it might be better to provide a common API or wrapper if there is some clear use case that need to pass in an array with holes in the middle by adding a new API like refill_pages_bulk() as below. > > IOWs, you just demonstrated why the existing API is more desirable > than a highly constrained, slightly faster API that requires callers > to get every detail right. i.e. it's hard to get it wrong with the > existing API, yet it's so easy to make mistakes with the proposed > API that the patch proposing the change has serious bugs in it. 
IMHO, if the API is about refilling only the NULL elements, it seems better to add an API like refill_pages_bulk() for that, as the current API seems prone to the error of callers forgetting to initialize the array to zero before calling alloc_pages_bulk(). > > -Dave.
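A rough sketch of what such a helper could look like, layered on top of the sequential bulk allocator; the name refill_pages_bulk() and its exact signature are hypothetical here, not part of the posted patch:

        /* Populate only the NULL slots of @page_array, preserving existing
         * entries.  Returns the number of pages newly allocated. */
        static unsigned long refill_pages_bulk(gfp_t gfp, unsigned long nr_pages,
                                               struct page **page_array)
        {
                unsigned long i = 0, start, got, total = 0;

                while (i < nr_pages) {
                        /* skip slots that already hold a page */
                        if (page_array[i]) {
                                i++;
                                continue;
                        }
                        /* measure the run of NULL slots starting at i ... */
                        for (start = i; i < nr_pages && !page_array[i]; i++)
                                ;
                        /* ... and fill it with the sequential bulk allocator */
                        got = alloc_pages_bulk(gfp, i - start, page_array + start);
                        total += got;
                        if (got < i - start)
                                break;  /* allocation failure, stop early */
                }
                return total;
        }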
On 2/18/25 4:16 AM, Yunsheng Lin wrote: > On 2025/2/17 22:20, Chuck Lever wrote: >> On 2/17/25 7:31 AM, Yunsheng Lin wrote: >>> As mentioned in [1], it seems odd to check NULL elements in >>> the middle of page bulk allocating, >> >> I think I requested that check to be added to the bulk page allocator. >> >> When sending an RPC reply, NFSD might release pages in the middle of > > It seems there is no usage of the page bulk allocation API in fs/nfsd/ > or fs/nfs/, which specific fs the above 'NFSD' is referring to? NFSD is in fs/nfsd/, and it is the major consumer of net/sunrpc/svc_xprt.c. >> the rq_pages array, marking each of those array entries with a NULL >> pointer. We want to ensure that the array is refilled completely in this >> case. >> > > I did some researching, it seems you requested that in [1]? > It seems the 'holes are always at the start' for the case in that > discussion too, I am not sure if the case is referring to the caller > in net/sunrpc/svc_xprt.c? If yes, it seems caller can do a better > job of bulk allocating pages into a whole array sequentially without > checking NULL elements first before doing the page bulk allocation > as something below: > > +++ b/net/sunrpc/svc_xprt.c > @@ -663,9 +663,10 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) > pages = RPCSVC_MAXPAGES; > } > > - for (filled = 0; filled < pages; filled = ret) { > - ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages); > - if (ret > filled) > + for (filled = 0; filled < pages; filled += ret) { > + ret = alloc_pages_bulk(GFP_KERNEL, pages - filled, > + rqstp->rq_pages + filled); > + if (ret) > /* Made progress, don't sleep yet */ > continue; > > @@ -674,7 +675,7 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) > set_current_state(TASK_RUNNING); > return false; > } > - trace_svc_alloc_arg_err(pages, ret); > + trace_svc_alloc_arg_err(pages, filled); > memalloc_retry_wait(GFP_KERNEL); > } > rqstp->rq_page_end = &rqstp->rq_pages[pages]; > > > 1. https://lkml.iu.edu/hypermail/linux/kernel/2103.2/09060.html I still don't see what is broken about the current API. Anyway, any changes in svc_alloc_arg() will need to be run through the upstream NFSD CI suite before they are merged.
On Tue, Feb 18, 2025 at 05:21:27PM +0800, Yunsheng Lin wrote: > On 2025/2/18 5:31, Dave Chinner wrote: > > ... > > > ..... > > > >> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c > >> index 15bb790359f8..9e1ce0ab9c35 100644 > >> --- a/fs/xfs/xfs_buf.c > >> +++ b/fs/xfs/xfs_buf.c > >> @@ -377,16 +377,17 @@ xfs_buf_alloc_pages( > >> * least one extra page. > >> */ > >> for (;;) { > >> - long last = filled; > >> + long alloc; > >> > >> - filled = alloc_pages_bulk(gfp_mask, bp->b_page_count, > >> - bp->b_pages); > >> + alloc = alloc_pages_bulk(gfp_mask, bp->b_page_count - refill, > >> + bp->b_pages + refill); > >> + refill += alloc; > >> if (filled == bp->b_page_count) { > >> XFS_STATS_INC(bp->b_mount, xb_page_found); > >> break; > >> } > >> > >> - if (filled != last) > >> + if (alloc) > >> continue; > > > > You didn't even compile this code - refill is not defined > > anywhere. > > > > Even if it did complile, you clearly didn't test it. The logic is > > broken (what updates filled?) and will result in the first > > allocation attempt succeeding and then falling into an endless retry > > loop. > > Ah, the 'refill' is a typo, it should be 'filled' instead of 'refill'. > The below should fix the compile error: > --- a/fs/xfs/xfs_buf.c > +++ b/fs/xfs/xfs_buf.c > @@ -379,9 +379,9 @@ xfs_buf_alloc_pages( > for (;;) { > long alloc; > > - alloc = alloc_pages_bulk(gfp_mask, bp->b_page_count - refill, > - bp->b_pages + refill); > - refill += alloc; > + alloc = alloc_pages_bulk(gfp_mask, bp->b_page_count - filled, > + bp->b_pages + filled); > + filled += alloc; > if (filled == bp->b_page_count) { > XFS_STATS_INC(bp->b_mount, xb_page_found); > break; > > > > > i.e. you stepped on the API landmine of your own creation where > > it is impossible to tell the difference between alloc_pages_bulk() > > returning "memory allocation failed, you need to retry" and > > it returning "array is full, nothing more to allocate". Both these > > cases now return 0. > > As my understanding, alloc_pages_bulk() will not be called when > "array is full" as the above 'filled == bp->b_page_count' checking > has ensured that if the array is not passed in with holes in the > middle for xfs. You miss the point entirely. Previously, alloc_pages_bulk() would return a value that would tell us the array is full, even if we call it with a full array to begin with. Now it fails to tell us that the array is full, and we have to track that precisely ourselves - it is impossible to tell the difference between "array is full" and "allocation failed". Not being able to determine from the allocation return value whether the array is ready for use or whether another go-around to fill it is needed is a very poor API choice, regardless of anything else. You've already demonstrated this: tracking array usage in every caller is error-prone and much harder to get right than just having alloc_pages_bulk() do everything for us. > > The existing code returns nr_populated in both cases, so it doesn't > > matter why alloc_pages_bulk() returns with nr_populated != full, it > > is very clear that we still need to allocate more memory to fill it. > > I am not sure if the array will be passed in with holes in the > middle for the xfs fs as mentioned above, if not, it seems to be > a typical use case like the one in mempolicy.c as below: > > https://elixir.bootlin.com/linux/v6.14-rc1/source/mm/mempolicy.c#L2525 That's not "typical" usage. 
That is implementing "try alloc" fast path that avoids memory reclaim with a slow path fallback to fill the rest of the array when the fast path fails. No other users of alloc_pages_bulk() is trying to do this. Indeed, it looks somewhat pointless to do this here (i.e. premature optimisation!), because the only caller of alloc_pages_bulk_mempolicy_noprof() has it's own fallback slowpath for when alloc_pages_bulk() can't fill the entire request. > > IOWs, you just demonstrated why the existing API is more desirable > > than a highly constrained, slightly faster API that requires callers > > to get every detail right. i.e. it's hard to get it wrong with the > > existing API, yet it's so easy to make mistakes with the proposed > > API that the patch proposing the change has serious bugs in it. > > IMHO, if the API is about refilling pages for the only NULL elements, > it seems better to add a API like refill_pages_bulk() for that, as > the current API seems to be prone to error of not initializing the > array to zero before calling alloc_pages_bulk(). How is requiring a well defined initial state for API parameters "error prone"? What code is failing to do the well known, defined initialisation before calling alloc_pages_bulk()? Allowing uninitialised structures in an API (i.e. unknown initial conditions) means we cannot make assumptions about the structure contents within the API implementation. We cannot assume that all variables are zero on the first use, nor can we assume that anything that is zero has a valid state. Again, this is poor API design - structures passed to interfaces -should- have a well defined initial state, either set by a *_init() function or by defining the initial state to be all zeros (i.e. via memset, kzalloc, etc). Performance and speed is not an excuse for writing fragile, easy to break code and APIs. -Dave.
On 2025/2/19 5:14, Dave Chinner wrote: > On Tue, Feb 18, 2025 at 05:21:27PM +0800, Yunsheng Lin wrote: >> On 2025/2/18 5:31, Dave Chinner wrote: >> >> ... >> >>> ..... >>> >>>> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c >>>> index 15bb790359f8..9e1ce0ab9c35 100644 >>>> --- a/fs/xfs/xfs_buf.c >>>> +++ b/fs/xfs/xfs_buf.c >>>> @@ -377,16 +377,17 @@ xfs_buf_alloc_pages( >>>> * least one extra page. >>>> */ >>>> for (;;) { >>>> - long last = filled; >>>> + long alloc; >>>> >>>> - filled = alloc_pages_bulk(gfp_mask, bp->b_page_count, >>>> - bp->b_pages); >>>> + alloc = alloc_pages_bulk(gfp_mask, bp->b_page_count - refill, >>>> + bp->b_pages + refill); >>>> + refill += alloc; >>>> if (filled == bp->b_page_count) { >>>> XFS_STATS_INC(bp->b_mount, xb_page_found); >>>> break; >>>> } >>>> >>>> - if (filled != last) >>>> + if (alloc) >>>> continue; >>> >>> You didn't even compile this code - refill is not defined >>> anywhere. >>> >>> Even if it did complile, you clearly didn't test it. The logic is >>> broken (what updates filled?) and will result in the first >>> allocation attempt succeeding and then falling into an endless retry >>> loop. >> >> Ah, the 'refill' is a typo, it should be 'filled' instead of 'refill'. >> The below should fix the compile error: >> --- a/fs/xfs/xfs_buf.c >> +++ b/fs/xfs/xfs_buf.c >> @@ -379,9 +379,9 @@ xfs_buf_alloc_pages( >> for (;;) { >> long alloc; >> >> - alloc = alloc_pages_bulk(gfp_mask, bp->b_page_count - refill, >> - bp->b_pages + refill); >> - refill += alloc; >> + alloc = alloc_pages_bulk(gfp_mask, bp->b_page_count - filled, >> + bp->b_pages + filled); >> + filled += alloc; >> if (filled == bp->b_page_count) { >> XFS_STATS_INC(bp->b_mount, xb_page_found); >> break; >> >>> >>> i.e. you stepped on the API landmine of your own creation where >>> it is impossible to tell the difference between alloc_pages_bulk() >>> returning "memory allocation failed, you need to retry" and >>> it returning "array is full, nothing more to allocate". Both these >>> cases now return 0. >> >> As my understanding, alloc_pages_bulk() will not be called when >> "array is full" as the above 'filled == bp->b_page_count' checking >> has ensured that if the array is not passed in with holes in the >> middle for xfs. > > You miss the point entirely. Previously, alloc_pages_bulk() would > return a value that would tell us the array is full, even if we > call it with a full array to begin with. > > Now it fails to tell us that the array is full, and we have to track > that precisely ourselves - it is impossible to tell the difference > between "array is full" and "allocation failed". Not being able to > determine from the allocation return value whether the array is > ready for use or whether another go-around to fill it is needed is a > very poor API choice, regardless of anything else. > > You've already demonstrated this: tracking array usage in every > caller is error-prone and much harder to get right than just having > alloc_pages_bulk() do everything for us. While I am agreed that it might be hard to track array usage in every caller to see if removing assumption of populating only NULL elements cause problem for them, I still think the page bulk alloc API before this patch have some space for improvement from performance and easy-to-use perspective as the most existing calllers of page bulk alloc API are trying to bulk allocate the page for the whole array sequentially. 
> >>> The existing code returns nr_populated in both cases, so it doesn't >>> matter why alloc_pages_bulk() returns with nr_populated != full, it >>> is very clear that we still need to allocate more memory to fill it. >> >> I am not sure if the array will be passed in with holes in the >> middle for the xfs fs as mentioned above, if not, it seems to be >> a typical use case like the one in mempolicy.c as below: >> >> https://elixir.bootlin.com/linux/v6.14-rc1/source/mm/mempolicy.c#L2525 > > That's not "typical" usage. That is implementing "try alloc" fast > path that avoids memory reclaim with a slow path fallback to fill > the rest of the array when the fast path fails. > > No other users of alloc_pages_bulk() is trying to do this. What I meant by "typical" usage is the 'page_array + nr_allocated' trick that avoids the NULL checking when page bulk allocation API is used in mm/mempolicy.c, most of existing callers for page bulk allocation in other places seems likely to be changed to do the similar trick as this patch does. > > Indeed, it looks somewhat pointless to do this here (i.e. premature > optimisation!), because the only caller of > alloc_pages_bulk_mempolicy_noprof() has it's own fallback slowpath > for when alloc_pages_bulk() can't fill the entire request. > >>> IOWs, you just demonstrated why the existing API is more desirable >>> than a highly constrained, slightly faster API that requires callers >>> to get every detail right. i.e. it's hard to get it wrong with the >>> existing API, yet it's so easy to make mistakes with the proposed >>> API that the patch proposing the change has serious bugs in it. >> >> IMHO, if the API is about refilling pages for the only NULL elements, >> it seems better to add a API like refill_pages_bulk() for that, as >> the current API seems to be prone to error of not initializing the >> array to zero before calling alloc_pages_bulk(). > > How is requiring a well defined initial state for API parameters > "error prone"? What code is failing to do the well known, defined > initialisation before calling alloc_pages_bulk()? > > Allowing uninitialised structures in an API (i.e. unknown initial > conditions) means we cannot make assumptions about the structure > contents within the API implementation. We cannot assume that all > variables are zero on the first use, nor can we assume that anything > that is zero has a valid state. It seems the above is the main differenece we see from the API perspective, as I see the array as output parameter and you seems to treat the array as both input and output parameter? The kmem_cache_alloc_bulk() API related API seems to treat the array as output parameter too as this patch does, the difference from this patch is that if there is no enough memory, it will free the allocated memory and return 0 to the caller while this patch returns already allocated memory to its caller even when there is no enough memory. > > Again, this is poor API design - structures passed to interfaces > -should- have a well defined initial state, either set by a *_init() > function or by defining the initial state to be all zeros (i.e. via > memset, kzalloc, etc). > > Performance and speed is not an excuse for writing fragile, easy to > break code and APIs. > > -Dave.
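For reference, a caller-side sketch of the two failure conventions being compared here (illustrative; 'cache', 'n', 'objs' and 'pages' are placeholders and error handling is elided):

        /* kmem_cache_alloc_bulk(): all-or-nothing.  On failure it frees
         * whatever it managed to allocate and returns 0, so the array is
         * only meaningful when the call fully succeeds. */
        if (kmem_cache_alloc_bulk(cache, GFP_KERNEL, n, objs) != n)
                return -ENOMEM;

        /* alloc_pages_bulk() as proposed in this patch: partial success.
         * The return value is the number of leading slots now populated;
         * the caller keeps them and decides whether to retry or clean up. */
        got = alloc_pages_bulk(GFP_KERNEL, n, pages);
        if (got < n)
                /* retry for the rest at pages + got, or free and bail out */;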
diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c index ba92bb4e9af9..9f003a237dec 100644 --- a/drivers/vfio/pci/virtio/migrate.c +++ b/drivers/vfio/pci/virtio/migrate.c @@ -91,8 +91,6 @@ static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf, if (ret) goto err_append; buf->allocated_length += filled * PAGE_SIZE; - /* clean input for another bulk allocation */ - memset(page_list, 0, filled * sizeof(*page_list)); to_fill = min_t(unsigned int, to_alloc, PAGE_SIZE / sizeof(*page_list)); } while (to_alloc > 0); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b2fae67f8fa3..d0466d795cca 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -626,10 +626,12 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, unsigned int allocated; for (allocated = 0; allocated < nr_pages;) { - unsigned int last = allocated; + unsigned int new_allocated; - allocated = alloc_pages_bulk(gfp, nr_pages, page_array); - if (unlikely(allocated == last)) { + new_allocated = alloc_pages_bulk(gfp, nr_pages - allocated, + page_array + allocated); + allocated += new_allocated; + if (unlikely(!new_allocated)) { /* No progress, fail and do cleanup. */ for (int i = 0; i < allocated; i++) { __free_page(page_array[i]); diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 55ff2ab5128e..1c50b5e27371 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -85,13 +85,13 @@ int z_erofs_gbuf_growsize(unsigned int nrpages) for (j = 0; j < gbuf->nrpages; ++j) tmp_pages[j] = gbuf->pages[j]; - do { - last = j; - j = alloc_pages_bulk(GFP_KERNEL, nrpages, - tmp_pages); - if (last == j) + + for (last = j; last < nrpages; last += j) { + j = alloc_pages_bulk(GFP_KERNEL, nrpages - last, + tmp_pages + last); + if (!j) goto out; - } while (j != nrpages); + } ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL); if (!ptr) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 15bb790359f8..9e1ce0ab9c35 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -377,16 +377,17 @@ xfs_buf_alloc_pages( * least one extra page. */ for (;;) { - long last = filled; + long alloc; - filled = alloc_pages_bulk(gfp_mask, bp->b_page_count, - bp->b_pages); + alloc = alloc_pages_bulk(gfp_mask, bp->b_page_count - refill, + bp->b_pages + refill); + refill += alloc; if (filled == bp->b_page_count) { XFS_STATS_INC(bp->b_mount, xb_page_found); break; } - if (filled != last) + if (alloc) continue; if (flags & XBF_READ_AHEAD) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 579789600a3c..e0309532b6c4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4541,9 +4541,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, * This is a batched version of the page allocator that attempts to * allocate nr_pages quickly. Pages are added to the page_array. * - * Note that only NULL elements are populated with pages and nr_pages - * is the maximum number of pages that will be stored in the array. - * * Returns the number of pages in the array. */ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, @@ -4559,29 +4556,18 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, struct alloc_context ac; gfp_t alloc_gfp; unsigned int alloc_flags = ALLOC_WMARK_LOW; - int nr_populated = 0, nr_account = 0; - - /* - * Skip populated array elements to determine if any pages need - * to be allocated before disabling IRQs. 
- */ - while (nr_populated < nr_pages && page_array[nr_populated]) - nr_populated++; + int nr_populated = 0; /* No pages requested? */ if (unlikely(nr_pages <= 0)) goto out; - /* Already populated array? */ - if (unlikely(nr_pages - nr_populated == 0)) - goto out; - /* Bulk allocator does not support memcg accounting. */ if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT)) goto failed; /* Use the single page allocator for one page. */ - if (nr_pages - nr_populated == 1) + if (nr_pages == 1) goto failed; #ifdef CONFIG_PAGE_OWNER @@ -4653,24 +4639,16 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, /* Attempt the batch allocation */ pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; while (nr_populated < nr_pages) { - - /* Skip existing pages */ - if (page_array[nr_populated]) { - nr_populated++; - continue; - } - page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, pcp, pcp_list); if (unlikely(!page)) { /* Try and allocate at least one page */ - if (!nr_account) { + if (!nr_populated) { pcp_spin_unlock(pcp); goto failed_irq; } break; } - nr_account++; prep_new_page(page, 0, gfp, 0); set_page_refcounted(page); @@ -4680,8 +4658,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); - __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); - zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account); + __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_populated); + zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_populated); out: return nr_populated; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index f5e908c9e7ad..ae9e8c78e4bb 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -536,9 +536,6 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, if (unlikely(pool->alloc.count > 0)) return pool->alloc.cache[--pool->alloc.count]; - /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ - memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); - nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, (struct page **)pool->alloc.cache); if (unlikely(!nr_pages)) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index ae25405d8bd2..6321a4d2f2be 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -663,9 +663,10 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) pages = RPCSVC_MAXPAGES; } - for (filled = 0; filled < pages; filled = ret) { - ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages); - if (ret > filled) + for (filled = 0; filled < pages; filled += ret) { + ret = alloc_pages_bulk(GFP_KERNEL, pages - filled, + rqstp->rq_pages + filled); + if (ret) /* Made progress, don't sleep yet */ continue; @@ -674,7 +675,7 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) set_current_state(TASK_RUNNING); return false; } - trace_svc_alloc_arg_err(pages, ret); + trace_svc_alloc_arg_err(pages, filled); memalloc_retry_wait(GFP_KERNEL); } rqstp->rq_page_end = &rqstp->rq_pages[pages];
As mentioned in [1], it seems odd to check for NULL elements in the middle of bulk page allocation, and it seems the caller can do a better job of bulk allocating pages into a whole array sequentially, without first checking for NULL elements before doing the bulk allocation.

Removing the above check also enables the caller to skip zeroing the array before calling the bulk page allocation API, which gives about a 1~2 ns performance improvement for the time_bench_page_pool03_slow() test case for page_pool on an x86 VM system. This reduces some of the performance impact of fixing the DMA API misuse problem in [2]; performance improves from 87.886 ns to 86.429 ns.

1. https://lore.kernel.org/all/bd8c2f5c-464d-44ab-b607-390a87ea4cd5@huawei.com/
2. https://lore.kernel.org/all/20250212092552.1779679-1-linyunsheng@huawei.com/

CC: Jesper Dangaard Brouer <hawk@kernel.org>
CC: Luiz Capitulino <luizcap@redhat.com>
CC: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 drivers/vfio/pci/virtio/migrate.c |  2 --
 fs/btrfs/extent_io.c              |  8 +++++---
 fs/erofs/zutil.c                  | 12 ++++++------
 fs/xfs/xfs_buf.c                  |  9 +++++----
 mm/page_alloc.c                   | 32 +++++---------------------------
 net/core/page_pool.c              |  3 ---
 net/sunrpc/svc_xprt.c             |  9 +++++----
 7 files changed, 26 insertions(+), 49 deletions(-)
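The caller-side conversion is essentially the same in every file touched: instead of passing a zeroed array and relying on NULL-slot skipping, the caller requests only the remaining tail at an offset. Roughly (a sketch distilled from the hunks above; variable names are illustrative):

        /* before: array must be zeroed first; the return value is the total
         * number of populated slots */
        filled = alloc_pages_bulk(gfp, nr_pages, page_array);

        /* after: fill only the remaining tail; the return value is what
         * this particular call allocated */
        while (filled < nr_pages) {
                got = alloc_pages_bulk(gfp, nr_pages - filled,
                                       page_array + filled);
                if (!got)
                        break;  /* no progress: back off, retry or fail */
                filled += got;
        }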