From patchwork Fri Dec 1 12:02:02 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yunsheng Lin X-Patchwork-Id: 13475720 X-Patchwork-Delegate: kuba@kernel.org Received: from szxga01-in.huawei.com (szxga01-in.huawei.com [45.249.212.187]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id EDA64D48; Fri, 1 Dec 2023 04:02:17 -0800 (PST) Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.55]) by szxga01-in.huawei.com (SkyGuard) with ESMTP id 4ShWmV5SyqzsRSL; Fri, 1 Dec 2023 19:58:34 +0800 (CST) Received: from localhost.localdomain (10.69.192.56) by dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Fri, 1 Dec 2023 20:02:16 +0800 From: Yunsheng Lin To: , , CC: , , Yunsheng Lin , Alexander Duyck , Andrew Morton , Eric Dumazet , Subject: [PATCH RFC 1/6] mm/page_alloc: modify page_frag_alloc_align() to accept align as an argument Date: Fri, 1 Dec 2023 20:02:02 +0800 Message-ID: <20231201120208.15080-2-linyunsheng@huawei.com> X-Mailer: git-send-email 2.33.0 In-Reply-To: <20231201120208.15080-1-linyunsheng@huawei.com> References: <20231201120208.15080-1-linyunsheng@huawei.com> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: dggems703-chm.china.huawei.com (10.3.19.180) To dggpemm500005.china.huawei.com (7.185.36.74) X-CFilter-Loop: Reflected X-Patchwork-Delegate: kuba@kernel.org X-Patchwork-State: RFC napi_alloc_frag_align() and netdev_alloc_frag_align() accept align as an argument, and they are thin wrappers around the __napi_alloc_frag_align() and __netdev_alloc_frag_align() APIs, doing the align to align_mask conversion before calling page_frag_alloc_align(). As __napi_alloc_frag_align() and __netdev_alloc_frag_align() are only used by the above thin wrappers, it makes more sense to remove the align to align_mask conversion and call page_frag_alloc_align() directly. Doing that also avoids the confusion between napi_alloc_frag_align() accepting align as an argument and page_frag_alloc_align() accepting align_mask as an argument, when both have the 'align' suffix.
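As a side note (an editorial sketch, not code from this patch): for a power-of-2 align, -align is exactly the align_mask that the old wrappers passed down, and align == 1 reproduces the old ~0u "no alignment" mask, so moving the 'offset &= -align' step into page_frag_alloc_align() keeps the existing behaviour:

	/* Standalone illustration; compiles with any C compiler. */
	#include <assert.h>

	int main(void)
	{
		unsigned int offset = 100;

		assert((unsigned int)-64 == ~(64u - 1));  /* -align is the same mask as align_mask */
		assert((unsigned int)-1 == ~0u);          /* align == 1 matches the old "no align" mask */
		assert((offset & -64u) == 64);            /* 100 rounded down to a 64-byte boundary */
		assert((offset & -1u) == offset);         /* align == 1 leaves the offset untouched */
		return 0;
	}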
Signed-off-by: Yunsheng Lin CC: Alexander Duyck --- include/linux/gfp.h | 4 ++-- include/linux/skbuff.h | 22 ++++------------------ mm/page_alloc.c | 6 ++++-- net/core/skbuff.c | 14 +++++++------- 4 files changed, 17 insertions(+), 29 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index de292a007138..bbd75976541e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -314,12 +314,12 @@ struct page_frag_cache; extern void __page_frag_cache_drain(struct page *page, unsigned int count); extern void *page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, - unsigned int align_mask); + unsigned int align); static inline void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask) { - return page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u); + return page_frag_alloc_align(nc, fragsz, gfp_mask, 1); } extern void page_frag_free(void *addr); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 27998f73183e..c27ed5ab6557 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3182,7 +3182,7 @@ static inline void skb_queue_purge(struct sk_buff_head *list) unsigned int skb_rbtree_purge(struct rb_root *root); void skb_errqueue_purge(struct sk_buff_head *list); -void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask); +void *netdev_alloc_frag_align(unsigned int fragsz, unsigned int align); /** * netdev_alloc_frag - allocate a page fragment @@ -3193,14 +3193,7 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask); */ static inline void *netdev_alloc_frag(unsigned int fragsz) { - return __netdev_alloc_frag_align(fragsz, ~0u); -} - -static inline void *netdev_alloc_frag_align(unsigned int fragsz, - unsigned int align) -{ - WARN_ON_ONCE(!is_power_of_2(align)); - return __netdev_alloc_frag_align(fragsz, -align); + return netdev_alloc_frag_align(fragsz, 1); } struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, @@ -3260,18 +3253,11 @@ static inline void skb_free_frag(void *addr) page_frag_free(addr); } -void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask); +void *napi_alloc_frag_align(unsigned int fragsz, unsigned int align); static inline void *napi_alloc_frag(unsigned int fragsz) { - return __napi_alloc_frag_align(fragsz, ~0u); -} - -static inline void *napi_alloc_frag_align(unsigned int fragsz, - unsigned int align) -{ - WARN_ON_ONCE(!is_power_of_2(align)); - return __napi_alloc_frag_align(fragsz, -align); + return napi_alloc_frag_align(fragsz, 1); } struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 37ca4f4b62bf..9a16305cf985 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4718,12 +4718,14 @@ EXPORT_SYMBOL(__page_frag_cache_drain); void *page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, - unsigned int align_mask) + unsigned int align) { unsigned int size = PAGE_SIZE; struct page *page; int offset; + WARN_ON_ONCE(!is_power_of_2(align)); + if (unlikely(!nc->va)) { refill: page = __page_frag_cache_refill(nc, gfp_mask); @@ -4782,7 +4784,7 @@ void *page_frag_alloc_align(struct page_frag_cache *nc, } nc->pagecnt_bias--; - offset &= align_mask; + offset &= -align; nc->offset = offset; return nc->va + offset; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b157efea5dea..b98d1da4004a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -291,17 +291,17 @@ void napi_get_frags_check(struct 
napi_struct *napi) local_bh_enable(); } -void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) +void *napi_alloc_frag_align(unsigned int fragsz, unsigned int align) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); fragsz = SKB_DATA_ALIGN(fragsz); - return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); + return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align); } -EXPORT_SYMBOL(__napi_alloc_frag_align); +EXPORT_SYMBOL(napi_alloc_frag_align); -void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) +void *netdev_alloc_frag_align(unsigned int fragsz, unsigned int align) { void *data; @@ -309,18 +309,18 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) if (in_hardirq() || irqs_disabled()) { struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); - data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); + data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align); } else { struct napi_alloc_cache *nc; local_bh_disable(); nc = this_cpu_ptr(&napi_alloc_cache); - data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); + data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align); local_bh_enable(); } return data; } -EXPORT_SYMBOL(__netdev_alloc_frag_align); +EXPORT_SYMBOL(netdev_alloc_frag_align); static struct sk_buff *napi_skb_cache_get(void) { From patchwork Fri Dec 1 12:02:03 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yunsheng Lin X-Patchwork-Id: 13475721 X-Patchwork-Delegate: kuba@kernel.org Received: from szxga02-in.huawei.com (szxga02-in.huawei.com [45.249.212.188]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 7A03A1B4; Fri, 1 Dec 2023 04:02:20 -0800 (PST) Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.54]) by szxga02-in.huawei.com (SkyGuard) with ESMTP id 4ShWlq02qgzShJS; Fri, 1 Dec 2023 19:57:59 +0800 (CST) Received: from localhost.localdomain (10.69.192.56) by dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Fri, 1 Dec 2023 20:02:18 +0800 From: Yunsheng Lin To: , , CC: , , Yunsheng Lin , Alexander Duyck , "Michael S. Tsirkin" , Jason Wang , Andrew Morton , Eric Dumazet , , , Subject: [PATCH RFC 2/6] page_frag: unify gfp bit for order 3 page allocation Date: Fri, 1 Dec 2023 20:02:03 +0800 Message-ID: <20231201120208.15080-3-linyunsheng@huawei.com> X-Mailer: git-send-email 2.33.0 In-Reply-To: <20231201120208.15080-1-linyunsheng@huawei.com> References: <20231201120208.15080-1-linyunsheng@huawei.com> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: dggems703-chm.china.huawei.com (10.3.19.180) To dggpemm500005.china.huawei.com (7.185.36.74) X-CFilter-Loop: Reflected X-Patchwork-Delegate: kuba@kernel.org X-Patchwork-State: RFC Currently there are three page frag implementations which all try to allocate an order 3 page; if that fails, they fall back to allocating an order 0 page, and each of them allows the order 3 page allocation to fail under certain conditions by using specific gfp bits.
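For reference, the combined set of gfp bits that this patch converges on for the order 3 allocation attempt is sketched below (an editorial sketch in kernel style; the helper name is invented, but the flag combination is the one used in the hunks that follow):

	#include <linux/gfp.h>

	/* Hypothetical helper, for illustration only. */
	static inline gfp_t frag_refill_gfp(gfp_t gfp)
	{
		/* Avoid direct reclaim but allow kswapd to wake, stay out of
		 * the emergency reserves and give up early: the caller can
		 * always fall back to an order 0 allocation.
		 */
		return (gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN |
		       __GFP_NORETRY | __GFP_NOMEMALLOC;
	}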
The gfp bits for order 3 page allocation differ between the implementations: __GFP_NOMEMALLOC is OR'd in to forbid access to the emergency memory reserves in __page_frag_cache_refill(), but not in the other implementations, while __GFP_DIRECT_RECLAIM is masked off to avoid direct reclaim in skb_page_frag_refill(), but not in __page_frag_cache_refill(). This patch unifies the gfp bits used by the different implementations by OR'ing in __GFP_NOMEMALLOC and masking off __GFP_DIRECT_RECLAIM for the order 3 page allocation, to avoid possible pressure on the mm subsystem. Signed-off-by: Yunsheng Lin CC: Alexander Duyck --- drivers/vhost/net.c | 2 +- mm/page_alloc.c | 4 ++-- net/core/sock.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index f2ed7167c848..e574e21cc0ca 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz, /* Avoid direct reclaim but allow kswapd to wake */ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | - __GFP_NORETRY, + __GFP_NORETRY | __GFP_NOMEMALLOC, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9a16305cf985..1f0b36dd81b5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4693,8 +4693,8 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, gfp_t gfp = gfp_mask; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) - gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC; + gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | + __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC; page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, PAGE_FRAG_CACHE_MAX_ORDER); nc->size = page ?
PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; diff --git a/net/core/sock.c b/net/core/sock.c index fef349dd72fa..4efa9cae4b0d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2904,7 +2904,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) /* Avoid direct reclaim but allow kswapd to wake */ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | - __GFP_NORETRY, + __GFP_NORETRY | __GFP_NOMEMALLOC, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; From patchwork Fri Dec 1 12:02:04 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yunsheng Lin X-Patchwork-Id: 13475722 Received: from szxga02-in.huawei.com (szxga02-in.huawei.com [45.249.212.188]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 2529A170E; Fri, 1 Dec 2023 04:02:22 -0800 (PST) Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.56]) by szxga02-in.huawei.com (SkyGuard) with ESMTP id 4ShWqx2WS9zWhx4; Fri, 1 Dec 2023 20:01:33 +0800 (CST) Received: from localhost.localdomain (10.69.192.56) by dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Fri, 1 Dec 2023 20:02:20 +0800 From: Yunsheng Lin To: , , CC: , , Yunsheng Lin , Alexander Duyck , Andrew Morton , Subject: [PATCH RFC 3/6] mm/page_alloc: use initial zero offset for page_frag_alloc_align() Date: Fri, 1 Dec 2023 20:02:04 +0800 Message-ID: <20231201120208.15080-4-linyunsheng@huawei.com> X-Mailer: git-send-email 2.33.0 In-Reply-To: <20231201120208.15080-1-linyunsheng@huawei.com> References: <20231201120208.15080-1-linyunsheng@huawei.com> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: dggems703-chm.china.huawei.com (10.3.19.180) To dggpemm500005.china.huawei.com (7.185.36.74) X-CFilter-Loop: Reflected X-Patchwork-State: RFC The next patch is about to use page_frag_alloc_align() to replace vhost_net_page_frag_refill(); the main difference between those two page frag implementations is whether an initial zero offset is used or not. It seems more natural to use an initial zero offset, as it may enable more effective cache prefetching and skb frag coalescing in the networking stack, so change page_frag_alloc_align() to use an initial zero offset. Signed-off-by: Yunsheng Lin CC: Alexander Duyck --- mm/page_alloc.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1f0b36dd81b5..083e0c38fb62 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4720,7 +4720,7 @@ void *page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align) { - unsigned int size = PAGE_SIZE; + unsigned int size; struct page *page; int offset; @@ -4732,10 +4732,6 @@ void *page_frag_alloc_align(struct page_frag_cache *nc, if (!page) return NULL; -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) - /* if size can vary use size else just use PAGE_SIZE */ - size = nc->size; -#endif /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users.
*/ @@ -4744,11 +4740,18 @@ void *page_frag_alloc_align(struct page_frag_cache *nc, /* reset page count bias and offset to start of new frag */ nc->pfmemalloc = page_is_pfmemalloc(page); nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; - nc->offset = size; + nc->offset = 0; } - offset = nc->offset - fragsz; - if (unlikely(offset < 0)) { +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + /* if size can vary use size else just use PAGE_SIZE */ + size = nc->size; +#else + size = PAGE_SIZE; +#endif + + offset = ALIGN(nc->offset, align); + if (unlikely(offset + fragsz > size)) { page = virt_to_page(nc->va); if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) @@ -4759,17 +4762,13 @@ void *page_frag_alloc_align(struct page_frag_cache *nc, goto refill; } -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) - /* if size can vary use size else just use PAGE_SIZE */ - size = nc->size; -#endif /* OK, page count is 0, we can safely set it */ set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; - offset = size - fragsz; - if (unlikely(offset < 0)) { + offset = 0; + if (unlikely(fragsz > size)) { /* * The caller is trying to allocate a fragment * with fragsz > PAGE_SIZE but the cache isn't big @@ -4784,8 +4783,7 @@ void *page_frag_alloc_align(struct page_frag_cache *nc, } nc->pagecnt_bias--; - offset &= -align; - nc->offset = offset; + nc->offset = offset + fragsz; return nc->va + offset; } From patchwork Fri Dec 1 12:02:05 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yunsheng Lin X-Patchwork-Id: 13475703 Received: from szxga03-in.huawei.com (szxga03-in.huawei.com [45.249.212.189]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 876AA171B; Fri, 1 Dec 2023 04:02:24 -0800 (PST) Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.55]) by szxga03-in.huawei.com (SkyGuard) with ESMTP id 4ShWlF1GCdzMnPl; Fri, 1 Dec 2023 19:57:29 +0800 (CST) Received: from localhost.localdomain (10.69.192.56) by dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Fri, 1 Dec 2023 20:02:22 +0800 From: Yunsheng Lin To: , , CC: , , Yunsheng Lin , "Michael S. Tsirkin" , Jason Wang , Alexei Starovoitov , Daniel Borkmann , Jesper Dangaard Brouer , John Fastabend , , , Subject: [PATCH RFC 4/6] vhost/net: remove vhost_net_page_frag_refill() Date: Fri, 1 Dec 2023 20:02:05 +0800 Message-ID: <20231201120208.15080-5-linyunsheng@huawei.com> X-Mailer: git-send-email 2.33.0 In-Reply-To: <20231201120208.15080-1-linyunsheng@huawei.com> References: <20231201120208.15080-1-linyunsheng@huawei.com> Precedence: bulk X-Mailing-List: bpf@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: dggems703-chm.china.huawei.com (10.3.19.180) To dggpemm500005.china.huawei.com (7.185.36.74) X-CFilter-Loop: Reflected X-Patchwork-State: RFC The page frag in vhost_net_page_frag_refill() uses the 'struct page_frag' from skb_page_frag_refill(), but its implementation is now similar to page_frag_alloc_align(). This patch removes vhost_net_page_frag_refill() by using 'struct page_frag_cache' instead of 'struct page_frag', and by allocating frags using page_frag_alloc_align().
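Condensed from the diff below (an editorial summary, with error handling and unrelated context trimmed), the allocation in vhost_net_build_xdp() changes roughly as follows:

	/* before: refill a private 'struct page_frag' and track a refcount bias */
	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
	if (unlikely(!vhost_net_page_frag_refill(net, buflen, alloc_frag, GFP_KERNEL)))
		return -ENOMEM;
	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;

	/* after: let the page frag cache handle alignment and refcounting */
	buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
				    SMP_CACHE_BYTES);
	if (unlikely(!buf))
		return -ENOMEM;
	/* ... and on a later failure the fragment is simply returned with: */
	page_frag_free(buf);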
The added benefit is that this not only unifies the page frag implementations a little, but also gives about a 0.5% performance boost in testing with the vhost_net_test introduced in the last patch. Signed-off-by: Yunsheng Lin --- drivers/vhost/net.c | 93 ++++++++++++++------------------------------- 1 file changed, 29 insertions(+), 64 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index e574e21cc0ca..805e11d598e4 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -141,10 +141,8 @@ struct vhost_net { unsigned tx_zcopy_err; /* Flush in progress. Protected by tx vq lock. */ bool tx_flush; - /* Private page frag */ - struct page_frag page_frag; - /* Refcount bias of page frag */ - int refcnt_bias; + /* Private page frag cache */ + struct page_frag_cache pf_cache; }; static unsigned vhost_net_zcopy_mask __read_mostly; @@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len) !vhost_vq_avail_empty(vq->dev, vq); } -static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz, - struct page_frag *pfrag, gfp_t gfp) -{ - if (pfrag->page) { - if (pfrag->offset + sz <= pfrag->size) - return true; - __page_frag_cache_drain(pfrag->page, net->refcnt_bias); - } - - pfrag->offset = 0; - net->refcnt_bias = 0; - if (SKB_FRAG_PAGE_ORDER) { - /* Avoid direct reclaim but allow kswapd to wake */ - pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | - __GFP_COMP | __GFP_NOWARN | - __GFP_NORETRY | __GFP_NOMEMALLOC, - SKB_FRAG_PAGE_ORDER); - if (likely(pfrag->page)) { - pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; - goto done; - } - } - pfrag->page = alloc_page(gfp); - if (likely(pfrag->page)) { - pfrag->size = PAGE_SIZE; - goto done; - } - return false; - -done: - net->refcnt_bias = USHRT_MAX; - page_ref_add(pfrag->page, USHRT_MAX - 1); - return true; -} - #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, @@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); struct socket *sock = vhost_vq_get_backend(vq); - struct page_frag *alloc_frag = &net->page_frag; struct virtio_net_hdr *gso; struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp]; struct tun_xdp_hdr *hdr; @@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, int sock_hlen = nvq->sock_hlen; void *buf; int copied; + int ret; if (unlikely(len < nvq->sock_hlen)) return -EFAULT; @@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, return -ENOSPC; buflen += SKB_DATA_ALIGN(len + pad); - alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); - if (unlikely(!vhost_net_page_frag_refill(net, buflen, - alloc_frag, GFP_KERNEL))) + buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL, + SMP_CACHE_BYTES); + if (unlikely(!buf)) return -ENOMEM; - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; - copied = copy_page_from_iter(alloc_frag->page, - alloc_frag->offset + - offsetof(struct tun_xdp_hdr, gso), - sock_hlen, from); - if (copied != sock_hlen) - return -EFAULT; + copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso), + sock_hlen, from); + if (copied != sock_hlen) { + ret = -EFAULT; + goto err; + } hdr = buf; gso = &hdr->gso; @@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, vhost16_to_cpu(vq, gso->csum_start) + vhost16_to_cpu(vq, gso->csum_offset) + 2); - if
(vhost16_to_cpu(vq, gso->hdr_len) > len) - return -EINVAL; + if (vhost16_to_cpu(vq, gso->hdr_len) > len) { + ret = -EINVAL; + goto err; + } } len -= sock_hlen; - copied = copy_page_from_iter(alloc_frag->page, - alloc_frag->offset + pad, - len, from); - if (copied != len) - return -EFAULT; + copied = copy_from_iter(buf + pad, len, from); + if (copied != len) { + ret = -EFAULT; + goto err; + } xdp_init_buff(xdp, buflen, NULL); xdp_prepare_buff(xdp, buf, pad, len, true); hdr->buflen = buflen; - --net->refcnt_bias; - alloc_frag->offset += buflen; - ++nvq->batched_xdp; return 0; + +err: + page_frag_free(buf); + return ret; } static void handle_tx_copy(struct vhost_net *net, struct socket *sock) @@ -1353,8 +1318,7 @@ static int vhost_net_open(struct inode *inode, struct file *f) vqs[VHOST_NET_VQ_RX]); f->private_data = n; - n->page_frag.page = NULL; - n->refcnt_bias = 0; + n->pf_cache.va = NULL; return 0; } @@ -1422,8 +1386,9 @@ static int vhost_net_release(struct inode *inode, struct file *f) kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue); kfree(n->vqs[VHOST_NET_VQ_TX].xdp); kfree(n->dev.vqs); - if (n->page_frag.page) - __page_frag_cache_drain(n->page_frag.page, n->refcnt_bias); + if (n->pf_cache.va) + __page_frag_cache_drain(virt_to_head_page(n->pf_cache.va), + n->pf_cache.pagecnt_bias); kvfree(n); return 0; } From patchwork Fri Dec 1 12:02:06 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yunsheng Lin X-Patchwork-Id: 13475723 X-Patchwork-Delegate: kuba@kernel.org Received: from szxga08-in.huawei.com (szxga08-in.huawei.com [45.249.212.255]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 1C6B41720; Fri, 1 Dec 2023 04:02:32 -0800 (PST) Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.54]) by szxga08-in.huawei.com (SkyGuard) with ESMTP id 4ShWmn704Gz1P91G; Fri, 1 Dec 2023 19:58:49 +0800 (CST) Received: from localhost.localdomain (10.69.192.56) by dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Fri, 1 Dec 2023 20:02:30 +0800 From: Yunsheng Lin To: , , CC: , , Yunsheng Lin , Jeroen de Borst , Praveen Kaligineedi , Shailend Chand , Eric Dumazet , Felix Fietkau , John Crispin , Sean Wang , Mark Lee , Lorenzo Bianconi , Matthias Brugger , AngeloGioacchino Del Regno , Keith Busch , Jens Axboe , Christoph Hellwig , Sagi Grimberg , Chaitanya Kulkarni , "Michael S. Tsirkin" , Jason Wang , Andrew Morton , , , , , , Subject: [PATCH RFC 5/6] net: introduce page_frag_cache_drain() Date: Fri, 1 Dec 2023 20:02:06 +0800 Message-ID: <20231201120208.15080-6-linyunsheng@huawei.com> X-Mailer: git-send-email 2.33.0 In-Reply-To: <20231201120208.15080-1-linyunsheng@huawei.com> References: <20231201120208.15080-1-linyunsheng@huawei.com> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: dggems703-chm.china.huawei.com (10.3.19.180) To dggpemm500005.china.huawei.com (7.185.36.74) X-CFilter-Loop: Reflected X-Patchwork-Delegate: kuba@kernel.org X-Patchwork-State: RFC When draining a page_frag_cache, most users are doing similar steps, so introduce an API to avoid code duplication.
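Concretely, the helper added to mm/page_alloc.c by this patch (quoted here from the hunk below for quick reference) is:

	void page_frag_cache_drain(struct page_frag_cache *nc)
	{
		if (!nc->va)
			return;

		__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
		nc->va = NULL;
	}
	EXPORT_SYMBOL(page_frag_cache_drain);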
Signed-off-by: Yunsheng Lin --- drivers/net/ethernet/google/gve/gve_main.c | 11 ++--------- drivers/net/ethernet/mediatek/mtk_wed_wo.c | 17 ++--------------- drivers/nvme/host/tcp.c | 7 +------ drivers/nvme/target/tcp.c | 4 +--- drivers/vhost/net.c | 4 +--- include/linux/gfp.h | 2 ++ mm/page_alloc.c | 10 ++++++++++ 7 files changed, 19 insertions(+), 36 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 619bf63ec935..d976190b0f4d 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1278,17 +1278,10 @@ static void gve_unreg_xdp_info(struct gve_priv *priv) static void gve_drain_page_cache(struct gve_priv *priv) { - struct page_frag_cache *nc; int i; - for (i = 0; i < priv->rx_cfg.num_queues; i++) { - nc = &priv->rx[i].page_cache; - if (nc->va) { - __page_frag_cache_drain(virt_to_page(nc->va), - nc->pagecnt_bias); - nc->va = NULL; - } - } + for (i = 0; i < priv->rx_cfg.num_queues; i++) + page_frag_cache_drain(&priv->rx[i].page_cache); } static int gve_open(struct net_device *dev) diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.c b/drivers/net/ethernet/mediatek/mtk_wed_wo.c index 7ffbd4fca881..df0a3ceaf59b 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed_wo.c +++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.c @@ -286,7 +286,6 @@ mtk_wed_wo_queue_free(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q) static void mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q) { - struct page *page; int i; for (i = 0; i < q->n_desc; i++) { @@ -298,19 +297,12 @@ mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q) entry->buf = NULL; } - if (!q->cache.va) - return; - - page = virt_to_page(q->cache.va); - __page_frag_cache_drain(page, q->cache.pagecnt_bias); - memset(&q->cache, 0, sizeof(q->cache)); + page_frag_cache_drain(&q->cache); } static void mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q) { - struct page *page; - for (;;) { void *buf = mtk_wed_wo_dequeue(wo, q, NULL, true); @@ -320,12 +312,7 @@ mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q) skb_free_frag(buf); } - if (!q->cache.va) - return; - - page = virt_to_page(q->cache.va); - __page_frag_cache_drain(page, q->cache.pagecnt_bias); - memset(&q->cache, 0, sizeof(q->cache)); + page_frag_cache_drain(&q->cache); } static void diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 89661a9cf850..8d4f4a06f9d9 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1338,7 +1338,6 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl) static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) { - struct page *page; struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; unsigned int noreclaim_flag; @@ -1349,11 +1348,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) if (queue->hdr_digest || queue->data_digest) nvme_tcp_free_crypto(queue); - if (queue->pf_cache.va) { - page = virt_to_head_page(queue->pf_cache.va); - __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); - queue->pf_cache.va = NULL; - } + page_frag_cache_drain(&queue->pf_cache); noreclaim_flag = memalloc_noreclaim_save(); /* ->sock will be released by fput() */ diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 92b74d0b8686..f9a553d70a61 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1576,7 
+1576,6 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue) static void nvmet_tcp_release_queue_work(struct work_struct *w) { - struct page *page; struct nvmet_tcp_queue *queue = container_of(w, struct nvmet_tcp_queue, release_work); @@ -1600,8 +1599,7 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w) if (queue->hdr_digest || queue->data_digest) nvmet_tcp_free_crypto(queue); ida_free(&nvmet_tcp_queue_ida, queue->idx); - page = virt_to_head_page(queue->pf_cache.va); - __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); + page_frag_cache_drain(&queue->pf_cache); kfree(queue); } diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 805e11d598e4..4b2fcb228a0a 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1386,9 +1386,7 @@ static int vhost_net_release(struct inode *inode, struct file *f) kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue); kfree(n->vqs[VHOST_NET_VQ_TX].xdp); kfree(n->dev.vqs); - if (n->pf_cache.va) - __page_frag_cache_drain(virt_to_head_page(n->pf_cache.va), - n->pf_cache.pagecnt_bias); + page_frag_cache_drain(&n->pf_cache); kvfree(n); return 0; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index bbd75976541e..03ba079655d3 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -316,6 +316,8 @@ extern void *page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align); +void page_frag_cache_drain(struct page_frag_cache *nc); + static inline void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 083e0c38fb62..5a0e68edcb05 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4716,6 +4716,16 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) } EXPORT_SYMBOL(__page_frag_cache_drain); +void page_frag_cache_drain(struct page_frag_cache *nc) +{ + if (!nc->va) + return; + + __page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias); + nc->va = NULL; +} +EXPORT_SYMBOL(page_frag_cache_drain); + void *page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align) From patchwork Fri Dec 1 12:02:07 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yunsheng Lin X-Patchwork-Id: 13475724 Received: from szxga02-in.huawei.com (szxga02-in.huawei.com [45.249.212.188]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id BF0F11712; Fri, 1 Dec 2023 04:02:36 -0800 (PST) Received: from dggpemm500005.china.huawei.com (unknown [172.30.72.55]) by szxga02-in.huawei.com (SkyGuard) with ESMTP id 4ShWrC6DqNzWhy8; Fri, 1 Dec 2023 20:01:47 +0800 (CST) Received: from localhost.localdomain (10.69.192.56) by dggpemm500005.china.huawei.com (7.185.36.74) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Fri, 1 Dec 2023 20:02:34 +0800 From: Yunsheng Lin To: , , CC: , , Yunsheng Lin , "Michael S. 
Tsirkin" , Jason Wang , Xuan Zhuo , Subject: [PATCH RFC 6/6] tools: virtio: introduce vhost_net_test Date: Fri, 1 Dec 2023 20:02:07 +0800 Message-ID: <20231201120208.15080-7-linyunsheng@huawei.com> X-Mailer: git-send-email 2.33.0 In-Reply-To: <20231201120208.15080-1-linyunsheng@huawei.com> References: <20231201120208.15080-1-linyunsheng@huawei.com> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: dggems703-chm.china.huawei.com (10.3.19.180) To dggpemm500005.china.huawei.com (7.185.36.74) X-CFilter-Loop: Reflected X-Patchwork-State: RFC introduce vhost_net_test basing on virtio_test to test vhost_net changing in the kernel. Signed-off-by: Yunsheng Lin --- tools/virtio/Makefile | 8 +- tools/virtio/vhost_net_test.c | 441 ++++++++++++++++++++++++++++++++++ 2 files changed, 446 insertions(+), 3 deletions(-) create mode 100644 tools/virtio/vhost_net_test.c diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile index d128925980e0..e25e99c1c3b7 100644 --- a/tools/virtio/Makefile +++ b/tools/virtio/Makefile @@ -1,8 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 all: test mod -test: virtio_test vringh_test +test: virtio_test vringh_test vhost_net_test virtio_test: virtio_ring.o virtio_test.o vringh_test: vringh_test.o vringh.o virtio_ring.o +vhost_net_test: virtio_ring.o vhost_net_test.o try-run = $(shell set -e; \ if ($(1)) >/dev/null 2>&1; \ @@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=clean .PHONY: all test mod clean vhost oot oot-clean oot-build clean: - ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \ - vhost_test/Module.symvers vhost_test/modules.order *.d + ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \ + vhost_test/.*.cmd vhost_test/Module.symvers \ + vhost_test/modules.order *.d -include *.d diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c new file mode 100644 index 000000000000..7e7b7aba3668 --- /dev/null +++ b/tools/virtio/vhost_net_test.c @@ -0,0 +1,441 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RANDOM_BATCH -1 + +static int tun_alloc(void) +{ + struct ifreq ifr; + int fd, e; + + fd = open("/dev/net/tun", O_RDWR); + if (fd < 0) { + perror("Cannot open /dev/net/tun"); + return fd; + } + + memset(&ifr, 0, sizeof(ifr)); + + ifr.ifr_flags = IFF_TUN | IFF_NO_PI; + strncpy(ifr.ifr_name, "tun0", IFNAMSIZ); + + e = ioctl(fd, TUNSETIFF, (void *) &ifr); + if (e < 0) { + perror("ioctl[TUNSETIFF]"); + close(fd); + return e; + } + + return fd; +} + +/* Unused */ +void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; + +struct vq_info { + int kick; + int call; + int num; + int idx; + void *ring; + /* copy used for control */ + struct vring vring; + struct virtqueue *vq; +}; + +struct vdev_info { + struct virtio_device vdev; + int control; + struct pollfd fds[1]; + struct vq_info vqs[1]; + int nvqs; + void *buf; + size_t buf_size; + struct vhost_memory *mem; +}; + +static struct vhost_vring_file no_backend = { .index = 1, .fd = -1 }, + backend = { .index = 1, .fd = 1 }; +static const struct vhost_vring_state null_state = {}; + +bool vq_notify(struct virtqueue *vq) +{ + struct vq_info *info = vq->priv; + unsigned long long v = 1; + int r; + r = write(info->kick, &v, sizeof v); + assert(r == sizeof v); + return true; +} + +void 
vq_callback(struct virtqueue *vq) +{ +} + + +void vhost_vq_setup(struct vdev_info *dev, struct vq_info *info) +{ + struct vhost_vring_state state = { .index = info->idx }; + struct vhost_vring_file file = { .index = info->idx }; + unsigned long long features = dev->vdev.features; + struct vhost_vring_addr addr = { + .index = info->idx, + .desc_user_addr = (uint64_t)(unsigned long)info->vring.desc, + .avail_user_addr = (uint64_t)(unsigned long)info->vring.avail, + .used_user_addr = (uint64_t)(unsigned long)info->vring.used, + }; + int r; + r = ioctl(dev->control, VHOST_SET_FEATURES, &features); + assert(r >= 0); + state.num = info->vring.num; + r = ioctl(dev->control, VHOST_SET_VRING_NUM, &state); + assert(r >= 0); + state.num = 0; + r = ioctl(dev->control, VHOST_SET_VRING_BASE, &state); + assert(r >= 0); + r = ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr); + assert(r >= 0); + file.fd = info->kick; + r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file); + assert(r >= 0); + file.fd = info->call; + r = ioctl(dev->control, VHOST_SET_VRING_CALL, &file); + assert(r >= 0); +} + +static void vq_reset(struct vq_info *info, int num, struct virtio_device *vdev) +{ + if (info->vq) + vring_del_virtqueue(info->vq); + + memset(info->ring, 0, vring_size(num, 4096)); + vring_init(&info->vring, num, info->ring, 4096); + info->vq = vring_new_virtqueue(info->idx, num, 4096, vdev, true, false, + info->ring, vq_notify, vq_callback, "test"); + assert(info->vq); + info->vq->priv = info; +} + +static void vq_info_add(struct vdev_info *dev, int num) +{ + struct vq_info *info = &dev->vqs[dev->nvqs]; + int r; + + /* use VHOST_NET_VQ_TX for testing */ + info->idx = 1; + info->kick = eventfd(0, EFD_NONBLOCK); + info->call = eventfd(0, EFD_NONBLOCK); + r = posix_memalign(&info->ring, 4096, vring_size(num, 4096)); + assert(r >= 0); + vq_reset(info, num, &dev->vdev); + vhost_vq_setup(dev, info); + dev->fds[0].fd = info->call; + dev->fds[0].events = POLLIN; + dev->nvqs++; +} + +static void vdev_info_init(struct vdev_info* dev, unsigned long long features) +{ + int r; + memset(dev, 0, sizeof *dev); + dev->vdev.features = features; + INIT_LIST_HEAD(&dev->vdev.vqs); + spin_lock_init(&dev->vdev.vqs_list_lock); + dev->buf_size = 1024; + dev->buf = malloc(dev->buf_size); + assert(dev->buf); + dev->control = open("/dev/vhost-net", O_RDWR); + assert(dev->control >= 0); + r = ioctl(dev->control, VHOST_SET_OWNER, NULL); + assert(r >= 0); + dev->mem = malloc(offsetof(struct vhost_memory, regions) + + sizeof dev->mem->regions[0]); + assert(dev->mem); + memset(dev->mem, 0, offsetof(struct vhost_memory, regions) + + sizeof dev->mem->regions[0]); + dev->mem->nregions = 1; + dev->mem->regions[0].guest_phys_addr = (long)dev->buf; + dev->mem->regions[0].userspace_addr = (long)dev->buf; + dev->mem->regions[0].memory_size = dev->buf_size; + r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem); + assert(r >= 0); +} + +/* TODO: this is pretty bad: we get a cache line bounce + * for the wait queue on poll and another one on read, + * plus the read which is there just to clear the + * current state. 
*/ +static void wait_for_interrupt(struct vdev_info *dev) +{ + int i; + unsigned long long val; + poll(dev->fds, dev->nvqs, -1); + for (i = 0; i < dev->nvqs; ++i) + if (dev->fds[i].revents & POLLIN) { + read(dev->fds[i].fd, &val, sizeof val); + } +} + +static void run_test(struct vdev_info *dev, struct vq_info *vq, + bool delayed, int batch, int reset_n, int bufs) +{ + struct scatterlist sl; + long started = 0, completed = 0, next_reset = reset_n; + long completed_before, started_before; + int r; + unsigned int len; + long long spurious = 0; + const bool random_batch = batch == RANDOM_BATCH; + + r = ioctl(dev->control, VHOST_NET_SET_BACKEND, &backend); + assert(!r); + + if (!reset_n) { + next_reset = INT_MAX; + } + + for (;;) { + virtqueue_disable_cb(vq->vq); + completed_before = completed; + started_before = started; + do { + const bool reset = completed > next_reset; + if (random_batch) + batch = (random() % vq->vring.num) + 1; + + while (started < bufs && + (started - completed) < batch) { + sg_init_one(&sl, dev->buf, dev->buf_size); + r = virtqueue_add_outbuf(vq->vq, &sl, 1, + dev->buf + started, + GFP_ATOMIC); + if (unlikely(r != 0)) { + if (r == -ENOSPC && + started > started_before) + r = 0; + else + r = -1; + break; + } + + ++started; + + if (unlikely(!virtqueue_kick(vq->vq))) { + r = -1; + break; + } + } + + if (started >= bufs) + r = -1; + + if (reset) { + r = ioctl(dev->control, VHOST_NET_SET_BACKEND, + &no_backend); + assert(!r); + } + + /* Flush out completed bufs if any */ + while (virtqueue_get_buf(vq->vq, &len)) { + ++completed; + r = 0; + } + + if (reset) { + struct vhost_vring_state s = { .index = 0 }; + + vq_reset(vq, vq->vring.num, &dev->vdev); + + r = ioctl(dev->control, VHOST_GET_VRING_BASE, + &s); + assert(!r); + + s.num = 0; + r = ioctl(dev->control, VHOST_SET_VRING_BASE, + &null_state); + assert(!r); + + r = ioctl(dev->control, VHOST_NET_SET_BACKEND, + &backend); + assert(!r); + + started = completed; + while (completed > next_reset) + next_reset += completed; + } + } while (r == 0); + if (completed == completed_before && started == started_before) + ++spurious; + assert(completed <= bufs); + assert(started <= bufs); + if (completed == bufs) + break; + if (delayed) { + if (virtqueue_enable_cb_delayed(vq->vq)) + wait_for_interrupt(dev); + } else { + if (virtqueue_enable_cb(vq->vq)) + wait_for_interrupt(dev); + } + } + fprintf(stderr, + "spurious wakeups: 0x%llx started=0x%lx completed=0x%lx\n", + spurious, started, completed); +} + +const char optstring[] = "h"; +const struct option longopts[] = { + { + .name = "help", + .val = 'h', + }, + { + .name = "event-idx", + .val = 'E', + }, + { + .name = "no-event-idx", + .val = 'e', + }, + { + .name = "indirect", + .val = 'I', + }, + { + .name = "no-indirect", + .val = 'i', + }, + { + .name = "virtio-1", + .val = '1', + }, + { + .name = "no-virtio-1", + .val = '0', + }, + { + .name = "delayed-interrupt", + .val = 'D', + }, + { + .name = "no-delayed-interrupt", + .val = 'd', + }, + { + .name = "buf-num", + .val = 'n', + .has_arg = required_argument, + }, + { + .name = "batch", + .val = 'b', + .has_arg = required_argument, + }, + { + .name = "reset", + .val = 'r', + .has_arg = optional_argument, + }, + { + } +}; + +static void help(int status) +{ + fprintf(stderr, "Usage: virtio_test [--help]" + " [--no-indirect]" + " [--no-event-idx]" + " [--no-virtio-1]" + " [--delayed-interrupt]" + " [--batch=random/N]" + " [--reset=N]" + "\n"); + + exit(status); +} + +int main(int argc, char **argv) +{ + struct vdev_info dev; + unsigned 
long long features = (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | + (1ULL << VIRTIO_RING_F_EVENT_IDX) | (1ULL << VIRTIO_F_VERSION_1); + long batch = 1, reset = 0, nbufs = 0x100000; + int o; + bool delayed = false; + + for (;;) { + o = getopt_long(argc, argv, optstring, longopts, NULL); + switch (o) { + case -1: + goto done; + case '?': + help(2); + case 'e': + features &= ~(1ULL << VIRTIO_RING_F_EVENT_IDX); + break; + case 'h': + help(0); + case 'i': + features &= ~(1ULL << VIRTIO_RING_F_INDIRECT_DESC); + break; + case '0': + features &= ~(1ULL << VIRTIO_F_VERSION_1); + break; + case 'D': + delayed = true; + break; + case 'b': + if (0 == strcmp(optarg, "random")) { + batch = RANDOM_BATCH; + } else { + batch = strtol(optarg, NULL, 10); + assert(batch > 0); + assert(batch < (long)INT_MAX + 1); + } + break; + case 'r': + if (!optarg) { + reset = 1; + } else { + reset = strtol(optarg, NULL, 10); + assert(reset > 0); + assert(reset < (long)INT_MAX + 1); + } + break; + case 'n': + nbufs = strtol(optarg, NULL, 10); + assert(nbufs > 0); + break; + default: + assert(0); + break; + } + } + +done: + backend.fd = tun_alloc(); + assert(backend.fd >= 0); + vdev_info_init(&dev, features); + vq_info_add(&dev, 256); + run_test(&dev, &dev.vqs[0], delayed, batch, reset, nbufs); + return 0; +}
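A possible way to build and run the new test, inferred from the Makefile change and the option table above rather than documented in the patch itself: run 'make vhost_net_test' under tools/virtio and then execute './vhost_net_test' (optionally with --batch=random/N, --buf-num=N or --reset[=N]) as root, since the test opens /dev/net/tun and /dev/vhost-net directly.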