From patchwork Wed Dec 4 17:21:40 2024
X-Patchwork-Submitter: David Wei
X-Patchwork-Id: 13894112
From: David Wei
To: io-uring@vger.kernel.org, netdev@vger.kernel.org
Cc: Jens Axboe, Pavel Begunkov, Jakub Kicinski, Paolo Abeni, "David S. Miller", Eric Dumazet, Jesper Dangaard Brouer, David Ahern, Mina Almasry, Stanislav Fomichev, Joe Damato, Pedro Tammela
Subject: [PATCH net-next v8 01/17] net: prefix devmem specific helpers
Date: Wed, 4 Dec 2024 09:21:40 -0800
Message-ID: <20241204172204.4180482-2-dw@davidwei.uk>
In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk>
References: <20241204172204.4180482-1-dw@davidwei.uk>

From: Pavel Begunkov

Add prefixes to all helpers that are specific to devmem TCP, i.e.
net_iov_binding[_id].

Signed-off-by: Pavel Begunkov
Signed-off-by: David Wei
Reviewed-by: Mina Almasry
---
 net/core/devmem.c |  2 +-
 net/core/devmem.h | 14 +++++++-------
 net/ipv4/tcp.c    |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/core/devmem.c b/net/core/devmem.c
index 11b91c12ee11..858982858f81 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -93,7 +93,7 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
 
 void net_devmem_free_dmabuf(struct net_iov *niov)
 {
-	struct net_devmem_dmabuf_binding *binding = net_iov_binding(niov);
+	struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov);
 	unsigned long dma_addr = net_devmem_get_dma_addr(niov);
 
 	if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr,
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 76099ef9c482..99782ddeca40 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -86,11 +86,16 @@ static inline unsigned int net_iov_idx(const struct net_iov *niov)
 }
 
 static inline struct net_devmem_dmabuf_binding *
-net_iov_binding(const struct net_iov *niov)
+net_devmem_iov_binding(const struct net_iov *niov)
 {
 	return net_iov_owner(niov)->binding;
 }
 
+static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
+{
+	return net_devmem_iov_binding(niov)->id;
+}
+
 static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
 {
 	struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov);
@@ -99,11 +104,6 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
 		((unsigned long)net_iov_idx(niov) << PAGE_SHIFT);
 }
 
-static inline u32 net_iov_binding_id(const struct net_iov *niov)
-{
-	return net_iov_owner(niov)->binding->id;
-}
-
 static inline void
 net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding)
 {
@@ -171,7 +171,7 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
 	return 0;
 }
 
-static inline u32 net_iov_binding_id(const struct net_iov *niov)
+static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
 {
 	return 0;
 }
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0d704bda6c41..b872de9a8271 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2494,7 +2494,7 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 
 				/* Will perform the exchange later */
 				dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx];
-				dmabuf_cmsg.dmabuf_id = net_iov_binding_id(niov);
+				dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov);
 
 				offset += copy;
 				remaining_len -= copy;

From patchwork Wed Dec 4 17:21:41 2024
X-Patchwork-Submitter: David Wei
X-Patchwork-Id: 13894113
From: David Wei
To: io-uring@vger.kernel.org, netdev@vger.kernel.org
Cc: Jens Axboe, Pavel Begunkov, Jakub Kicinski, Paolo Abeni, "David S. Miller", Eric Dumazet, Jesper Dangaard Brouer, David Ahern, Mina Almasry, Stanislav Fomichev, Joe Damato, Pedro Tammela
Subject: [PATCH net-next v8 02/17] net: generalise net_iov chunk owners
Date: Wed, 4 Dec 2024 09:21:41 -0800
Message-ID: <20241204172204.4180482-3-dw@davidwei.uk>
In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk>
References: <20241204172204.4180482-1-dw@davidwei.uk>

From: Pavel Begunkov

Currently net_iov stores a pointer to struct dmabuf_genpool_chunk_owner,
which serves as a useful abstraction to share data and provide a
context. However, it's too devmem specific, and we want to reuse it for
other memory providers; for that we need to decouple net_iov from
devmem. Make net_iov point to a new base structure called net_iov_area,
which dmabuf_genpool_chunk_owner extends.

Signed-off-by: Pavel Begunkov
Signed-off-by: David Wei
Reviewed-by: Mina Almasry
---
 include/net/netmem.h | 21 ++++++++++++++++++++-
 net/core/devmem.c    | 25 +++++++++++++------------
 net/core/devmem.h    | 25 +++++++++----------------
 3 files changed, 42 insertions(+), 29 deletions(-)

diff --git a/include/net/netmem.h b/include/net/netmem.h
index 8a6e20be4b9d..3795ded30d2c 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -24,11 +24,20 @@ struct net_iov {
 	unsigned long __unused_padding;
 	unsigned long pp_magic;
 	struct page_pool *pp;
-	struct dmabuf_genpool_chunk_owner *owner;
+	struct net_iov_area *owner;
 	unsigned long dma_addr;
 	atomic_long_t pp_ref_count;
 };
 
+struct net_iov_area {
+	/* Array of net_iovs for this area. */
+	struct net_iov *niovs;
+	size_t num_niovs;
+
+	/* Offset into the dma-buf where this chunk starts. */
+	unsigned long base_virtual;
+};
+
 /* These fields in struct page are used by the page_pool and net stack:
  *
  *	struct {
@@ -54,6 +63,16 @@ NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr);
 NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count);
 #undef NET_IOV_ASSERT_OFFSET
 
+static inline struct net_iov_area *net_iov_owner(const struct net_iov *niov)
+{
+	return niov->owner;
+}
+
+static inline unsigned int net_iov_idx(const struct net_iov *niov)
+{
+	return niov - net_iov_owner(niov)->niovs;
+}
+
 /* netmem */
 
 /**
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 858982858f81..5c10cf0e2a18 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -32,14 +32,15 @@ static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
 {
 	struct dmabuf_genpool_chunk_owner *owner = chunk->owner;
 
-	kvfree(owner->niovs);
+	kvfree(owner->area.niovs);
 	kfree(owner);
 }
 
 static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
 {
-	struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov);
+	struct dmabuf_genpool_chunk_owner *owner;
 
+	owner = net_devmem_iov_to_chunk_owner(niov);
 	return owner->base_dma_addr +
 	       ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
 }
@@ -82,7 +83,7 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
 
 	offset = dma_addr - owner->base_dma_addr;
 	index = offset / PAGE_SIZE;
-	niov = &owner->niovs[index];
+	niov = &owner->area.niovs[index];
 
 	niov->pp_magic = 0;
 	niov->pp = NULL;
@@ -250,9 +251,9 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
 			goto err_free_chunks;
 		}
 
-		owner->base_virtual = virtual;
+		owner->area.base_virtual = virtual;
 		owner->base_dma_addr = dma_addr;
-		owner->num_niovs = len / PAGE_SIZE;
+		owner->area.num_niovs = len / PAGE_SIZE;
 		owner->binding = binding;
 
 		err = gen_pool_add_owner(binding->chunk_pool, dma_addr,
@@ -264,17 +265,17 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
 			goto err_free_chunks;
 		}
 
-		owner->niovs = kvmalloc_array(owner->num_niovs,
-					      sizeof(*owner->niovs),
-					      GFP_KERNEL);
-		if (!owner->niovs) {
+		owner->area.niovs = kvmalloc_array(owner->area.num_niovs,
+						   sizeof(*owner->area.niovs),
+						   GFP_KERNEL);
+		if (!owner->area.niovs) {
 			err = -ENOMEM;
 			goto err_free_chunks;
 		}
 
-		for (i = 0; i < owner->num_niovs; i++) {
-			niov = &owner->niovs[i];
-			niov->owner = owner;
+		for (i = 0; i < owner->area.num_niovs; i++) {
+			niov = &owner->area.niovs[i];
+			niov->owner = &owner->area;
 			page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
 						      net_devmem_get_dma_addr(niov));
 		}
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 99782ddeca40..a2b9913e9a17 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -10,6 +10,8 @@
 #ifndef _NET_DEVMEM_H
 #define _NET_DEVMEM_H
 
+#include 
+
 struct netlink_ext_ack;
 
 struct net_devmem_dmabuf_binding {
@@ -51,17 +53,11 @@ struct net_devmem_dmabuf_binding {
  * allocations from this chunk.
  */
 struct dmabuf_genpool_chunk_owner {
-	/* Offset into the dma-buf where this chunk starts. */
-	unsigned long base_virtual;
+	struct net_iov_area area;
+	struct net_devmem_dmabuf_binding *binding;
 
 	/* dma_addr of the start of the chunk. */
 	dma_addr_t base_dma_addr;
-
-	/* Array of net_iovs for this chunk. */
-	struct net_iov *niovs;
-	size_t num_niovs;
-
-	struct net_devmem_dmabuf_binding *binding;
 };
 
 void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding);
@@ -75,20 +71,17 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
 void dev_dmabuf_uninstall(struct net_device *dev);
 
 static inline struct dmabuf_genpool_chunk_owner *
-net_iov_owner(const struct net_iov *niov)
+net_devmem_iov_to_chunk_owner(const struct net_iov *niov)
 {
-	return niov->owner;
-}
+	struct net_iov_area *owner = net_iov_owner(niov);
 
-static inline unsigned int net_iov_idx(const struct net_iov *niov)
-{
-	return niov - net_iov_owner(niov)->niovs;
+	return container_of(owner, struct dmabuf_genpool_chunk_owner, area);
 }
 
 static inline struct net_devmem_dmabuf_binding *
 net_devmem_iov_binding(const struct net_iov *niov)
 {
-	return net_iov_owner(niov)->binding;
+	return net_devmem_iov_to_chunk_owner(niov)->binding;
 }
 
 static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
@@ -98,7 +91,7 @@ static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
 
 static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
 {
-	struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov);
+	struct net_iov_area *owner = net_iov_owner(niov);
 
 	return owner->base_virtual +
 	       ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT);
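
The owner rework in this patch is the usual container_of embedding pattern: a net_iov only knows about the generic net_iov_area, and devmem recovers its dmabuf_genpool_chunk_owner because the area is embedded as the owner's first member. A minimal user-space sketch of the same idea (illustrative names only, not kernel code):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct area {			/* stands in for net_iov_area */
	size_t num_items;
};

struct chunk_owner {		/* stands in for dmabuf_genpool_chunk_owner */
	struct area area;	/* generic part, embedded */
	int binding_id;		/* provider-specific part */
};

int main(void)
{
	struct chunk_owner owner = { .area = { .num_items = 4 }, .binding_id = 7 };
	struct area *base = &owner.area;	/* what a net_iov would point at */

	/* recover the provider-specific wrapper from the generic pointer */
	struct chunk_owner *back = container_of(base, struct chunk_owner, area);

	printf("%zu items, binding %d\n", base->num_items, back->binding_id);
	return 0;
}
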
From patchwork Wed Dec 4 17:21:42 2024
X-Patchwork-Submitter: David Wei
X-Patchwork-Id: 13894114
From: David Wei
To: io-uring@vger.kernel.org, netdev@vger.kernel.org
Cc: Jens Axboe, Pavel Begunkov, Jakub Kicinski, Paolo Abeni, "David S. Miller", Eric Dumazet, Jesper Dangaard Brouer, David Ahern, Mina Almasry, Stanislav Fomichev, Joe Damato, Pedro Tammela
Subject: [PATCH net-next v8 03/17] net: page_pool: create hooks for custom page providers
Date: Wed, 4 Dec 2024 09:21:42 -0800
Message-ID: <20241204172204.4180482-4-dw@davidwei.uk>
In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk>
References: <20241204172204.4180482-1-dw@davidwei.uk>

From: Jakub Kicinski

Page providers which try to reuse the same pages will need to hold onto
the ref even after the page is released from the pool: releasing the
page from the pp just transfers the "ownership" reference from the pp
to the provider, and the provider will wait for other references to be
gone before feeding the page back into the pool.

Signed-off-by: Jakub Kicinski
[Pavel] Rebased, renamed callback, +converted devmem
Signed-off-by: Pavel Begunkov
Signed-off-by: David Wei
---
 include/net/page_pool/types.h |  9 +++++++++
 net/core/devmem.c             | 14 +++++++++++++-
 net/core/page_pool.c          | 17 +++++++++--------
 3 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index c022c410abe3..8a35fe474adb 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -152,8 +152,16 @@ struct page_pool_stats {
  */
 #define PAGE_POOL_FRAG_GROUP_ALIGN	(4 * sizeof(long))
 
+struct memory_provider_ops {
+	netmem_ref (*alloc_netmems)(struct page_pool *pool, gfp_t gfp);
+	bool (*release_netmem)(struct page_pool *pool, netmem_ref netmem);
+	int (*init)(struct page_pool *pool);
+	void (*destroy)(struct page_pool *pool);
+};
+
 struct pp_memory_provider_params {
 	void *mp_priv;
+	const struct memory_provider_ops *mp_ops;
 };
 
 struct page_pool {
@@ -215,6 +223,7 @@ struct page_pool {
 	struct ptr_ring ring;
 
 	void *mp_priv;
+	const struct memory_provider_ops *mp_ops;
 
 #ifdef CONFIG_PAGE_POOL_STATS
 	/* recycle stats are per-cpu to avoid locking */
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 858982858f81..01738029e35c 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -26,6 +26,8 @@
 /* Protected by rtnl_lock() */
 static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1);
 
+static const struct memory_provider_ops dmabuf_devmem_ops;
+
 static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
 					       struct gen_pool_chunk *chunk,
 					       void *not_used)
@@ -117,6 +119,7 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
 		WARN_ON(rxq->mp_params.mp_priv != binding);
 
 		rxq->mp_params.mp_priv = NULL;
+		rxq->mp_params.mp_ops = NULL;
 
 		rxq_idx = get_netdev_rx_queue_index(rxq);
 
@@ -142,7 +145,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
 	}
 
 	rxq = __netif_get_rx_queue(dev, rxq_idx);
-	if (rxq->mp_params.mp_priv) {
+	if (rxq->mp_params.mp_ops) {
 		NL_SET_ERR_MSG(extack, "designated queue already memory provider bound");
 		return -EEXIST;
 	}
@@ -160,6 +163,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
 		return err;
 
 	rxq->mp_params.mp_priv = binding;
+	rxq->mp_params.mp_ops = &dmabuf_devmem_ops;
 
 	err = netdev_rx_queue_restart(dev, rxq_idx);
 	if (err)
@@ -169,6 +173,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
 
 err_xa_erase:
 	rxq->mp_params.mp_priv = NULL;
+	rxq->mp_params.mp_ops = NULL;
 	xa_erase(&binding->bound_rxqs, xa_idx);
 
 	return err;
@@ -388,3 +393,10 @@ bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem)
 	/* We don't want the page pool put_page()ing our net_iovs. */
 	return false;
 }
+
+static const struct memory_provider_ops dmabuf_devmem_ops = {
+	.init			= mp_dmabuf_devmem_init,
+	.destroy		= mp_dmabuf_devmem_destroy,
+	.alloc_netmems		= mp_dmabuf_devmem_alloc_netmems,
+	.release_netmem		= mp_dmabuf_devmem_release_page,
+};
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index f89cf93f6eb4..36f61a1e4ffe 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -284,10 +284,11 @@ static int page_pool_init(struct page_pool *pool,
 		rxq = __netif_get_rx_queue(pool->slow.netdev,
 					   pool->slow.queue_idx);
 		pool->mp_priv = rxq->mp_params.mp_priv;
+		pool->mp_ops = rxq->mp_params.mp_ops;
 	}
 
-	if (pool->mp_priv) {
-		err = mp_dmabuf_devmem_init(pool);
+	if (pool->mp_ops) {
+		err = pool->mp_ops->init(pool);
 		if (err) {
 			pr_warn("%s() mem-provider init failed %d\n", __func__,
 				err);
@@ -584,8 +585,8 @@ netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp)
 		return netmem;
 
 	/* Slow-path: cache empty, do real allocation */
-	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
-		netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp);
+	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops)
+		netmem = pool->mp_ops->alloc_netmems(pool, gfp);
 	else
 		netmem = __page_pool_alloc_pages_slow(pool, gfp);
 	return netmem;
@@ -676,8 +677,8 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
 	bool put;
 
 	put = true;
-	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
-		put = mp_dmabuf_devmem_release_page(pool, netmem);
+	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops)
+		put = pool->mp_ops->release_netmem(pool, netmem);
 	else
 		__page_pool_release_page_dma(pool, netmem);
 
@@ -1010,8 +1011,8 @@ static void __page_pool_destroy(struct page_pool *pool)
 	page_pool_unlist(pool);
 	page_pool_uninit(pool);
 
-	if (pool->mp_priv) {
-		mp_dmabuf_devmem_destroy(pool);
+	if (pool->mp_ops) {
+		pool->mp_ops->destroy(pool);
 		static_branch_dec(&page_pool_mem_providers);
 	}
 
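
For a provider other than devmem, the hooks would be wired up the same way dmabuf_devmem_ops is above: fill in a memory_provider_ops table and point rxq->mp_params.mp_ops at it before the queue restart. A rough skeleton under that assumption (the my_mp_* names are hypothetical; only struct memory_provider_ops and the callback signatures come from this patch):

/* Hypothetical provider skeleton; only struct memory_provider_ops and the
 * callback signatures are taken from this patch. */
static netmem_ref my_mp_alloc_netmems(struct page_pool *pool, gfp_t gfp)
{
	/* hand out one of the provider's preregistered buffers;
	 * 0 means the allocation failed */
	return 0;
}

static bool my_mp_release_netmem(struct page_pool *pool, netmem_ref netmem)
{
	/* reclaim the buffer; returning false tells the pool not to
	 * put_page() it on the provider's behalf */
	return false;
}

static int my_mp_init(struct page_pool *pool)
{
	/* validate pool->mp_priv and set up per-pool state */
	return 0;
}

static void my_mp_destroy(struct page_pool *pool)
{
	/* tear down per-pool state */
}

static const struct memory_provider_ops my_mp_ops = {
	.init		= my_mp_init,
	.destroy	= my_mp_destroy,
	.alloc_netmems	= my_mp_alloc_netmems,
	.release_netmem	= my_mp_release_netmem,
};
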
From patchwork Wed Dec 4 17:21:43 2024
X-Patchwork-Submitter: David Wei
X-Patchwork-Id: 13894115
From: David Wei
To: io-uring@vger.kernel.org, netdev@vger.kernel.org
Cc: Jens Axboe, Pavel Begunkov, Jakub Kicinski, Paolo Abeni, "David S. Miller", Eric Dumazet, Jesper Dangaard Brouer, David Ahern, Mina Almasry, Stanislav Fomichev, Joe Damato, Pedro Tammela
Subject: [PATCH net-next v8 04/17] net: prepare for non devmem TCP memory providers
Date: Wed, 4 Dec 2024 09:21:43 -0800
Message-ID: <20241204172204.4180482-5-dw@davidwei.uk>
In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk>
References: <20241204172204.4180482-1-dw@davidwei.uk>

From: Pavel Begunkov

There are a good number of places in generic paths assuming that the
only page pool memory provider is devmem TCP. As we want to reuse the
net_iov and provider infrastructure, we need to patch them up and
explicitly check the provider type when we branch into devmem TCP code.

Signed-off-by: Pavel Begunkov
Signed-off-by: David Wei
Reviewed-by: Mina Almasry
---
 net/core/devmem.c         | 10 ++++++++--
 net/core/devmem.h         |  8 ++++++++
 net/core/page_pool_user.c | 15 +++++++++------
 net/ipv4/tcp.c            |  6 ++++++
 4 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/net/core/devmem.c b/net/core/devmem.c
index 01738029e35c..78983a98e5dc 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -28,6 +28,12 @@ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1);
 
 static const struct memory_provider_ops dmabuf_devmem_ops;
 
+bool net_is_devmem_page_pool_ops(const struct memory_provider_ops *ops)
+{
+	return ops == &dmabuf_devmem_ops;
+}
+EXPORT_SYMBOL_GPL(net_is_devmem_page_pool_ops);
+
 static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
 					       struct gen_pool_chunk *chunk,
 					       void *not_used)
@@ -316,10 +322,10 @@ void dev_dmabuf_uninstall(struct net_device *dev)
 	unsigned int i;
 
 	for (i = 0; i < dev->real_num_rx_queues; i++) {
-		binding = dev->_rx[i].mp_params.mp_priv;
-		if (!binding)
+		if (dev->_rx[i].mp_params.mp_ops != &dmabuf_devmem_ops)
 			continue;
 
+		binding = dev->_rx[i].mp_params.mp_priv;
 		xa_for_each(&binding->bound_rxqs, xa_idx, rxq)
 			if (rxq == &dev->_rx[i]) {
 				xa_erase(&binding->bound_rxqs, xa_idx);
diff --git a/net/core/devmem.h b/net/core/devmem.h
index a2b9913e9a17..a3fdd66bb05b 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -116,6 +116,8 @@ struct net_iov *
 net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding);
 void net_devmem_free_dmabuf(struct net_iov *ppiov);
 
+bool net_is_devmem_page_pool_ops(const struct memory_provider_ops *ops);
+
 #else
 struct net_devmem_dmabuf_binding;
 
@@ -168,6 +170,12 @@ static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
 {
 	return 0;
 }
+
+static inline bool
+net_is_devmem_page_pool_ops(const struct memory_provider_ops *ops)
+{
+	return false;
+}
 #endif
 
 #endif /* _NET_DEVMEM_H */
diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c
index 48335766c1bf..604862a73535 100644
--- a/net/core/page_pool_user.c
+++ b/net/core/page_pool_user.c
@@ -214,7 +214,7 @@ static int
 page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
 		  const struct genl_info *info)
 {
-	struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
+	struct net_devmem_dmabuf_binding *binding;
 	size_t inflight, refsz;
 	void *hdr;
 
@@ -244,8 +244,11 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
 			      pool->user.detach_time))
 		goto err_cancel;
 
-	if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id))
-		goto err_cancel;
+	if (net_is_devmem_page_pool_ops(pool->mp_ops)) {
+		binding = pool->mp_priv;
+		if (nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id))
+			goto err_cancel;
+	}
 
 	genlmsg_end(rsp, hdr);
 
@@ -353,16 +356,16 @@ void page_pool_unlist(struct page_pool *pool)
 int page_pool_check_memory_provider(struct net_device *dev,
 				    struct netdev_rx_queue *rxq)
 {
-	struct net_devmem_dmabuf_binding *binding = rxq->mp_params.mp_priv;
+	void *mp_priv = rxq->mp_params.mp_priv;
 	struct page_pool *pool;
 	struct hlist_node *n;
 
-	if (!binding)
+	if (!mp_priv)
 		return 0;
 
 	mutex_lock(&page_pools_lock);
 	hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) {
-		if (pool->mp_priv != binding)
+		if (pool->mp_priv != mp_priv)
 			continue;
 
 		if (pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b872de9a8271..f22005c70fd3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -277,6 +277,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -2476,6 +2477,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 			}
 
 			niov = skb_frag_net_iov(frag);
+			if (net_is_devmem_page_pool_ops(niov->pp->mp_ops)) {
+				err = -ENODEV;
+				goto out;
+			}
+
 			end = start + skb_frag_size(frag);
 			copy = end - offset;
 
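
The pattern is the same in every hunk above: code that used to treat mp_priv as a devmem binding must check the ops table first. Condensed into one hypothetical helper (example_* is illustrative; net_is_devmem_page_pool_ops() is the only API taken from this patch):

/* Illustrative only: interpret mp_priv as a devmem binding strictly after
 * confirming the pool is driven by the devmem provider. */
static struct net_devmem_dmabuf_binding *
example_devmem_binding(struct page_pool *pool)
{
	if (!net_is_devmem_page_pool_ops(pool->mp_ops))
		return NULL;	/* some other memory provider owns this pool */

	return pool->mp_priv;
}
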
From patchwork Wed Dec 4 17:21:44 2024
X-Patchwork-Submitter: David Wei
X-Patchwork-Id: 13894116
From: David Wei
To: io-uring@vger.kernel.org, netdev@vger.kernel.org
Cc: Jens Axboe, Pavel Begunkov, Jakub Kicinski, Paolo Abeni, "David S. Miller", Eric Dumazet, Jesper Dangaard Brouer, David Ahern, Mina Almasry, Stanislav Fomichev, Joe Damato, Pedro Tammela
Subject: [PATCH net-next v8 05/17] net: page_pool: add ->scrub mem provider callback
Date: Wed, 4 Dec 2024 09:21:44 -0800
Message-ID: <20241204172204.4180482-6-dw@davidwei.uk>
In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk>
References: <20241204172204.4180482-1-dw@davidwei.uk>

From: Pavel Begunkov

Some page pool memory providers like io_uring need to catch the point
when the page pool is asked to be destroyed. ->destroy is not enough
because it relies on the page pool to wait for its buffers first, but
for that to happen a provider might need to react, e.g. to collect all
buffers that are currently given to the user space.

Add a new provider ->scrub callback serving this purpose; it is called
off the pp's generic (cold) scrubbing path, i.e. page_pool_scrub().

Signed-off-by: Pavel Begunkov
Signed-off-by: David Wei
Reviewed-by: Mina Almasry
---
 include/net/page_pool/types.h | 1 +
 net/core/page_pool.c          | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 8a35fe474adb..fd0376ad0d26 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -157,6 +157,7 @@ struct memory_provider_ops {
 	bool (*release_netmem)(struct page_pool *pool, netmem_ref netmem);
 	int (*init)(struct page_pool *pool);
 	void (*destroy)(struct page_pool *pool);
+	void (*scrub)(struct page_pool *pool);
 };
 
 struct pp_memory_provider_params {
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 36f61a1e4ffe..13f1a4a63760 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -1038,6 +1038,9 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
 
 static void page_pool_scrub(struct page_pool *pool)
 {
+	if (pool->mp_ops && pool->mp_ops->scrub)
+		pool->mp_ops->scrub(pool);
+
 	page_pool_empty_alloc_cache_once(pool);
 	pool->destroy_cnt++;
 
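
Extending the hypothetical skeleton sketched after the hooks patch, an io_uring-like provider would use the new callback to pull back buffers that are still lent out to user space so the pool's shutdown wait can complete (my_mp_* remains illustrative; only the ->scrub slot and its call site are from this patch):

/* Hypothetical: return every outstanding buffer before the page pool
 * starts waiting for inflight pages on destruction. */
static void my_mp_scrub(struct page_pool *pool)
{
	/* walk provider-private state in pool->mp_priv and release all
	 * netmems currently handed out to user space */
}

static const struct memory_provider_ops my_mp_ops = {
	.init		= my_mp_init,
	.destroy	= my_mp_destroy,
	.alloc_netmems	= my_mp_alloc_netmems,
	.release_netmem	= my_mp_release_netmem,
	.scrub		= my_mp_scrub,	/* optional; page_pool_scrub() checks for NULL */
};
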
From patchwork Wed Dec 4 17:21:45 2024
X-Patchwork-Submitter: David Wei
X-Patchwork-Id: 13894117
From: David Wei
To: io-uring@vger.kernel.org, netdev@vger.kernel.org
Cc: Jens Axboe, Pavel Begunkov, Jakub Kicinski, Paolo Abeni, "David S. Miller", Eric Dumazet, Jesper Dangaard Brouer, David Ahern, Mina Almasry, Stanislav Fomichev, Joe Damato, Pedro Tammela
Subject: [PATCH net-next v8 06/17] net: page pool: add helper creating area from pages
Date: Wed, 4 Dec 2024 09:21:45 -0800
Message-ID: <20241204172204.4180482-7-dw@davidwei.uk>
In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk>
References: <20241204172204.4180482-1-dw@davidwei.uk>

From: Pavel Begunkov

Add a helper that takes an array of pages and initialises the passed-in
memory provider's area with them, where each net_iov takes one page.
It's also responsible for setting up dma mappings.

We keep it in page_pool.c so as not to leak netmem details to outside
providers like io_uring, which don't have access to netmem_priv.h and
other private helpers.

Signed-off-by: Pavel Begunkov
Signed-off-by: David Wei
---
 include/net/page_pool/memory_provider.h | 10 ++++
 net/core/page_pool.c                    | 63 ++++++++++++++++++++++++-
 2 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100644 include/net/page_pool/memory_provider.h

diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h
new file mode 100644
index 000000000000..83d7eec0058d
--- /dev/null
+++ b/include/net/page_pool/memory_provider.h
@@ -0,0 +1,10 @@
+#ifndef _NET_PAGE_POOL_MEMORY_PROVIDER_H
+#define _NET_PAGE_POOL_MEMORY_PROVIDER_H
+
+int page_pool_mp_init_paged_area(struct page_pool *pool,
+				 struct net_iov_area *area,
+				 struct page **pages);
+void page_pool_mp_release_area(struct page_pool *pool,
+			       struct net_iov_area *area);
+
+#endif
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 13f1a4a63760..d17e536ba8b8 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -13,6 +13,7 @@
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -459,7 +460,8 @@ page_pool_dma_sync_for_device(const struct page_pool *pool,
 		__page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
 }
 
-static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
+static bool page_pool_dma_map_page(struct page_pool *pool, netmem_ref netmem,
+				   struct page *page)
 {
 	dma_addr_t dma;
 
@@ -468,7 +470,7 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
 	 * into page private data (i.e 32bit cpu with 64bit DMA caps)
 	 * This mapping is kept for lifetime of page, until leaving pool.
 	 */
-	dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
+	dma = dma_map_page_attrs(pool->p.dev, page, 0,
 				 (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
 				 DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
 
@@ -490,6 +492,11 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
 	return false;
 }
 
+static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
+{
+	return page_pool_dma_map_page(pool, netmem, netmem_to_page(netmem));
+}
+
 static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 						 gfp_t gfp)
 {
@@ -1154,3 +1161,55 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
 	}
 }
 EXPORT_SYMBOL(page_pool_update_nid);
+
+static void page_pool_release_page_dma(struct page_pool *pool,
+				       netmem_ref netmem)
+{
+	__page_pool_release_page_dma(pool, netmem);
+}
+
+int page_pool_mp_init_paged_area(struct page_pool *pool,
+				 struct net_iov_area *area,
+				 struct page **pages)
+{
+	struct net_iov *niov;
+	netmem_ref netmem;
+	int i, ret = 0;
+
+	if (!pool->dma_map)
+		return -EOPNOTSUPP;
+
+	for (i = 0; i < area->num_niovs; i++) {
+		niov = &area->niovs[i];
+		netmem = net_iov_to_netmem(niov);
+
+		page_pool_set_pp_info(pool, netmem);
+		if (!page_pool_dma_map_page(pool, netmem, pages[i])) {
+			ret = -EINVAL;
+			goto err_unmap_dma;
+		}
+	}
+	return 0;
+
+err_unmap_dma:
+	while (i--) {
+		netmem = net_iov_to_netmem(&area->niovs[i]);
+		page_pool_release_page_dma(pool, netmem);
+	}
+	return ret;
+}
+
+void page_pool_mp_release_area(struct page_pool *pool,
+			       struct net_iov_area *area)
+{
+	int i;
+
+	if (!pool->dma_map)
+		return;
+
+	for (i = 0; i < area->num_niovs; i++) {
+		struct net_iov *niov = &area->niovs[i];
+
+		page_pool_release_page_dma(pool, net_iov_to_netmem(niov));
+	}
+}
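
A provider would typically call the pair from its ->init and ->destroy callbacks once it has pinned the user pages backing the area. A hypothetical sketch, revisiting the earlier skeleton (my_mp_area is illustrative; only the two helpers and struct net_iov_area come from this series):

/* Hypothetical provider state: one net_iov area backed by pinned pages. */
struct my_mp_area {
	struct net_iov_area area;	/* niovs/num_niovs set up by the provider */
	struct page **pages;		/* one pinned page per net_iov */
};

static int my_mp_init(struct page_pool *pool)
{
	struct my_mp_area *p = pool->mp_priv;

	/* DMA-maps every page and ties it to the matching net_iov */
	return page_pool_mp_init_paged_area(pool, &p->area, p->pages);
}

static void my_mp_destroy(struct page_pool *pool)
{
	struct my_mp_area *p = pool->mp_priv;

	page_pool_mp_release_area(pool, &p->area);
}
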
From patchwork Wed Dec 4 17:21:46 2024
X-Patchwork-Submitter: David Wei
X-Patchwork-Id: 13894118
yz9NtRog0PYUmoXutrFKynbrkBE5YjFXnK3VkV31L6k3Pgx/EC2hclwHp18HRflcsC0Vm7tDc+l THprZeFQSnvKIVWy8FQ/X6T4mH3BnSchKI= X-Google-Smtp-Source: AGHT+IGOTgkLRnA4czWTMkZeEXKvCq1PTMgt5kmHHIjqfFD7w7UmXVYqNmZZa5sXoC8Pmg6yWW7SZw== X-Received: by 2002:a05:6a00:1487:b0:725:3bd4:9b56 with SMTP id d2e1a72fcca58-7257fa44e93mr10562740b3a.5.1733332970951; Wed, 04 Dec 2024 09:22:50 -0800 (PST) Received: from localhost ([2a03:2880:ff:74::]) by smtp.gmail.com with ESMTPSA id 41be03b00d2f7-7fc9c2e1b2fsm11784589a12.20.2024.12.04.09.22.50 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 04 Dec 2024 09:22:50 -0800 (PST) From: David Wei To: io-uring@vger.kernel.org, netdev@vger.kernel.org Cc: Jens Axboe , Pavel Begunkov , Jakub Kicinski , Paolo Abeni , "David S. Miller" , Eric Dumazet , Jesper Dangaard Brouer , David Ahern , Mina Almasry , Stanislav Fomichev , Joe Damato , Pedro Tammela Subject: [PATCH net-next v8 07/17] net: page_pool: introduce page_pool_mp_return_in_cache Date: Wed, 4 Dec 2024 09:21:46 -0800 Message-ID: <20241204172204.4180482-8-dw@davidwei.uk> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk> References: <20241204172204.4180482-1-dw@davidwei.uk> Precedence: bulk X-Mailing-List: io-uring@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Pavel Begunkov Add a helper that allows a page pool memory provider to efficiently return a netmem off the allocation callback. Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Reviewed-by: Mina Almasry --- include/net/page_pool/memory_provider.h | 4 ++++ net/core/page_pool.c | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index 83d7eec0058d..352b3a35d31c 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -1,3 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + #ifndef _NET_PAGE_POOL_MEMORY_PROVIDER_H #define _NET_PAGE_POOL_MEMORY_PROVIDER_H @@ -7,4 +9,6 @@ int page_pool_mp_init_paged_area(struct page_pool *pool, void page_pool_mp_release_area(struct page_pool *pool, struct net_iov_area *area); +void page_pool_mp_return_in_cache(struct page_pool *pool, netmem_ref netmem); + #endif diff --git a/net/core/page_pool.c b/net/core/page_pool.c index d17e536ba8b8..24f29bdd70ab 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -1213,3 +1213,22 @@ void page_pool_mp_release_area(struct page_pool *pool, page_pool_release_page_dma(pool, net_iov_to_netmem(niov)); } } + +/* + * page_pool_mp_return_in_cache() - return a netmem to the allocation cache. + * @pool: pool from which pages were allocated + * @netmem: netmem to return + * + * Return already allocated and accounted netmem to the page pool's allocation + * cache. The function doesn't provide synchronisation and must only be called + * from the napi context. 
+ */ +void page_pool_mp_return_in_cache(struct page_pool *pool, netmem_ref netmem) +{ + if (WARN_ON_ONCE(pool->alloc.count >= PP_ALLOC_CACHE_REFILL)) + return; + + page_pool_dma_sync_for_device(pool, netmem, -1); + page_pool_fragment_netmem(netmem, 1); + pool->alloc.cache[pool->alloc.count++] = netmem; +} From patchwork Wed Dec 4 17:21:47 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: David Wei X-Patchwork-Id: 13894119 Received: from mail-pg1-f179.google.com (mail-pg1-f179.google.com [209.85.215.179]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id CF3FE214A7C for ; Wed, 4 Dec 2024 17:22:52 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.215.179 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332974; cv=none; b=YYaZx7KQQEVmwjJPIV/8Ljaz7chwwk3VMEe9GAuoQbpvx78S5NChn96JxglRjf/AK+MVJyjnx7nWrYgH2jr+QdqJtOTOoMZJQ4rXic7780hYZ6pFXvr+MylTmOazbm5d9sPKvDqlrK9+HLr/9zwHjhpVwKPdWbj7WW3E4/TuCMs= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332974; c=relaxed/simple; bh=WHdljndrLX/WO0D/Hih/8dp2+dGDExOdcjt97asBdaw=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=NIGKLs1hKk+zSmHTkLt1tF2/pfw8cYpqiEoi97s73Mt+h7+g4yhKrAoMN7kFhsKxbRakI1L1ZkUAb8iAVzJf1gppZOVcZdEZlpqab8n3jCoiqTYvV5ko6c1uK+7k5zeKptNuyAACO6L2DDS46XQRAiKHudgYRYTj/7Hw07dXsAM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk; spf=none smtp.mailfrom=davidwei.uk; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b=Dm3D/f/s; arc=none smtp.client-ip=209.85.215.179 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b="Dm3D/f/s" Received: by mail-pg1-f179.google.com with SMTP id 41be03b00d2f7-7ea9739647bso38816a12.0 for ; Wed, 04 Dec 2024 09:22:52 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=davidwei-uk.20230601.gappssmtp.com; s=20230601; t=1733332972; x=1733937772; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=clZWVy4EcXukFFQnbW8FtGJr5BpqkMoC1pIMi8Z2Vw8=; b=Dm3D/f/sFIFBJaWN+ZfBZuZYphWCX/5E7LBuZb0Uos9sVZ+VRYi+AuSdmzH96tCZl4 fly1gEN3n5whXXi017SvVwfko7drpfGhZg5Vt7YrujWQLhWj4Ren/BRVSI5YOGqO1TxA mdEDLBRICHlRuuIQjKd1N+a/G880ycAv4ahR+H/LaCfhIC115m1x1Dxo/WEIPOUoPCBM mBETBYm4GD7bd9ci21JBEu8NRyTdFvOPW9i96mgsUHt417+wiD8kOpp+/xEzDANikoyK itboLWblqkxfAdAVIT2K88mNVTKtHdQct2fUAStjUJHSZCaU1jLEOQ3UKjTtUrksZjAn qJ/A== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1733332972; x=1733937772; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=clZWVy4EcXukFFQnbW8FtGJr5BpqkMoC1pIMi8Z2Vw8=; b=NDqkTZDwH9dakpa1rxhmzHiQj38QWg+F85zVIC01zucVmZ1WcUM1uJdhzBuLVfoe56 Of6++uIW4k5OS0wlBCpbLKE740im0CQedOd3zGKZr7R9HBnN8pUkZSw5ON8eJ+A9F+/u 
From: David Wei To: io-uring@vger.kernel.org, netdev@vger.kernel.org Cc: Jens Axboe , Pavel Begunkov , Jakub Kicinski , Paolo Abeni , "David S. Miller" , Eric Dumazet , Jesper Dangaard Brouer , David Ahern , Mina Almasry , Stanislav Fomichev , Joe Damato , Pedro Tammela Subject: [PATCH net-next v8 08/17] net: add helper executing custom callback from napi Date: Wed, 4 Dec 2024 09:21:47 -0800 Message-ID: <20241204172204.4180482-9-dw@davidwei.uk> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk> References: <20241204172204.4180482-1-dw@davidwei.uk> Precedence: bulk X-Mailing-List: io-uring@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Pavel Begunkov

It's useful to have napi-private bits and pieces, like the page pool's fast allocation cache, so that the hot allocation path doesn't have to do any additional synchronisation. In the case of the io_uring memory provider introduced in the following patches, we keep the consumer end of the io_uring refill queue private to napi as it's a hot path.

However, from time to time we need to synchronise with the napi, for example to add more user memory or allocate fallback buffers. Add a helper function napi_execute that allows running a custom callback from under napi context, so that it can access and modify napi-protected parts of io_uring. It works similarly to busy polling and stops napi from running in the meantime, so it's supposed to be a slow control path.
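For illustration, a minimal sketch of the intended slow-path usage; only the napi_execute() prototype added by this patch is taken from the series, while the provider, callback and helper names below are hypothetical:

	/* Runs with the napi scheduled and busy polling held off, so it may
	 * safely touch napi-private state such as the page pool alloc cache.
	 */
	static void my_provider_refill_cb(void *data)
	{
		struct my_provider *prov = data;	/* hypothetical provider state */

		my_provider_add_fallback_buffers(prov);	/* hypothetical slow-path work */
	}

	static void my_provider_sync_with_napi(struct my_provider *prov,
					       unsigned int napi_id)
	{
		/* holds off concurrent napi processing for the duration of the callback */
		napi_execute(napi_id, my_provider_refill_cb, prov);
	}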
Signed-off-by: Pavel Begunkov Signed-off-by: David Wei --- include/net/busy_poll.h | 6 +++ net/core/dev.c | 81 ++++++++++++++++++++++++++++++++--------- 2 files changed, 70 insertions(+), 17 deletions(-) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index c858270141bc..d2ae1b4bf20c 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -47,6 +47,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); +void napi_execute(unsigned napi_id, void (*cb)(void *), void *cb_arg); void napi_busy_loop_rcu(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), @@ -66,6 +67,11 @@ static inline bool sk_can_busy_loop(struct sock *sk) return false; } +static inline void napi_execute(unsigned napi_id, + void (*cb)(void *), void *cb_arg) +{ +} + #endif /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long busy_loop_current_time(void) diff --git a/net/core/dev.c b/net/core/dev.c index 13d00fc10f55..590ded8cc544 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6351,6 +6351,30 @@ enum { NAPI_F_END_ON_RESCHED = 2, }; +static inline bool napi_state_start_busy_polling(struct napi_struct *napi, + unsigned flags) +{ + unsigned long val = READ_ONCE(napi->state); + + /* If multiple threads are competing for this napi, + * we avoid dirtying napi->state as much as we can. + */ + if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | + NAPIF_STATE_IN_BUSY_POLL)) + goto fail; + + if (cmpxchg(&napi->state, val, + val | NAPIF_STATE_IN_BUSY_POLL | + NAPIF_STATE_SCHED) != val) + goto fail; + + return true; +fail: + if (flags & NAPI_F_PREFER_BUSY_POLL) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); + return false; +} + static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, unsigned flags, u16 budget) { @@ -6426,24 +6450,8 @@ static void __napi_busy_loop(unsigned int napi_id, local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (!napi_poll) { - unsigned long val = READ_ONCE(napi->state); - - /* If multiple threads are competing for this napi, - * we avoid dirtying napi->state as much as we can. 
- */ - if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | - NAPIF_STATE_IN_BUSY_POLL)) { - if (flags & NAPI_F_PREFER_BUSY_POLL) - set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); + if (!napi_state_start_busy_polling(napi, flags)) goto count; - } - if (cmpxchg(&napi->state, val, - val | NAPIF_STATE_IN_BUSY_POLL | - NAPIF_STATE_SCHED) != val) { - if (flags & NAPI_F_PREFER_BUSY_POLL) - set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); - goto count; - } have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } @@ -6507,6 +6515,45 @@ void napi_busy_loop(unsigned int napi_id, } EXPORT_SYMBOL(napi_busy_loop); +void napi_execute(unsigned napi_id, + void (*cb)(void *), void *cb_arg) +{ + unsigned flags = NAPI_F_PREFER_BUSY_POLL; + void *have_poll_lock = NULL; + struct napi_struct *napi; + + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (!napi) { + rcu_read_unlock(); + return; + } + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + + for (;;) { + local_bh_disable(); + + if (napi_state_start_busy_polling(napi, flags)) { + have_poll_lock = netpoll_poll_lock(napi); + cb(cb_arg); + local_bh_enable(); + busy_poll_stop(napi, have_poll_lock, flags, 1); + break; + } + + local_bh_enable(); + if (unlikely(need_resched())) + break; + cpu_relax(); + } + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + rcu_read_unlock(); +} + void napi_suspend_irqs(unsigned int napi_id) { struct napi_struct *napi; From patchwork Wed Dec 4 17:21:48 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: David Wei X-Patchwork-Id: 13894120 Received: from mail-pl1-f169.google.com (mail-pl1-f169.google.com [209.85.214.169]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 50726214A89 for ; Wed, 4 Dec 2024 17:22:54 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.214.169 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332975; cv=none; b=NU6OtPzViCytcSv5nvSznWMUOJEDgIJUQbItBFBT54L2VTF+pAsilHo86vViGNaoNXEUSfQDkrmiXFIL/6mGYgTRxHiM6sNx8EzsZlpEy9ZejbcPrjNoCjjagQq+3sCa8kPY2ckjobk9YdCiFJY3KlkeMz1ziCqjjqqKHwWoP78= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332975; c=relaxed/simple; bh=NZwkTdorjTEyEl51wSx2wPFP4zd5K3gzsRQQ8p9rNn4=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=K2MRMs//61jRBRoc/L4uaGt77TZZOuZasjN0xZ1/DgVKLYQPydaBgqlZ8NFNhy4bgefQrxvq2gTQm3tLg9hPNihok9aWmTA9mJvdWAU0AhoNYOrThpTTNeJ02xA+D6EF8xRGFoKRnIpDQHMKhgL+75VckW7HOYsKMQtnPw/mGoA= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk; spf=none smtp.mailfrom=davidwei.uk; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b=0Ycon0WI; arc=none smtp.client-ip=209.85.214.169 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b="0Ycon0WI" Received: by mail-pl1-f169.google.com with SMTP id d9443c01a7336-2155157c31fso404785ad.1 for ; Wed, 04 Dec 2024 09:22:54 -0800 (PST) DKIM-Signature: v=1; 
a=rsa-sha256; c=relaxed/relaxed; d=davidwei-uk.20230601.gappssmtp.com; s=20230601; t=1733332973; x=1733937773; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=oln9qOJpmPmUR8SzkW6a4gArOpC7AXVPSCm5DGuxgNI=; b=0Ycon0WIK1IUb/KaqEJBtgDNFsLGOlWqPQVnMbKygfB67Ag6GYHf7B78QgAm2r9uHS o4uZdjQo/7yv74eAwFK1HnICU+j53TY3DNZB8AtDwI+5GrxFgcTAlHDGvD3zzQ8B4WLk aZ8le3peo1nSxuMZu6aSOgbKCKoOdObPMZEqXLq9iNJxv2y6TpQQ5DcDZxvmsWXUJiar BXWVaBmS75E38w5MqJ7nIPTrJEUYPWaBerWncBAqe04dobXU8RixbW5nkNreYNMk+3N4 YB2lkefKsVy4fdtWT77TVgHEfIzG9dzTA/K5POoXxyh56xBlyO0VfQzqmzB/hpqKoT56 L9WA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1733332973; x=1733937773; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=oln9qOJpmPmUR8SzkW6a4gArOpC7AXVPSCm5DGuxgNI=; b=kVoUxx/DHHVradddMo7/HLQQd2XCWgvrq1RTAN41kdefmyLeOyda45FKWn+2N0+oT3 WNF885V1vrWkdXiwNoFWqebKtrITdSDM6cCzU0lboe4UuXWGMOiA0d3M4PkvEq4QSoZ2 LhWCJ6CHiRVeXpHDHAKHYKY8zKkHIFGqUFmpDB5ife+eOVAuJ0sPv/Td+NAACMsW+Py7 VztwVkFq2UskpFDr0vp2esaJd1C+4QIHl6isqtTG3oDYNmhABh9fgyTjwpZGWkm/vMFP gFW1cVYnw+rgRYp/NQINLqoULRDuWF8V2elaI6OcjwWdBSwvNOOUkLYsKSKMtkAE8LDb eq8w== X-Gm-Message-State: AOJu0YxaeLgckvc4QY7m0UcqrTQXed2BwVhx9uPA7//+cARF3fpSPq+Y O/MA4u6MpGLq68cGkT8qa93/SpLOAqrZCu6PkFh1tTLVeszY7EghugmMeaKbpqLTnWn4xy3jkA7 l X-Gm-Gg: ASbGnctEnkaRaPtXhro+L/5vNJM5PUG8ZHRdLL7yshHQ1+PmM9nZjIZJ+Pr/EsajUMm mbBj4cucnG1/EIVD+magwh7WpGhTHGhXh76HO+WpAeJUzoZB2q3H8eZQyL8qfxB7GFRqmNU1Rm/ XFmBgpzAOWOfVnYNOMRBm5/rsVIqrw/89GN+qtfpvYsxl6+pY5skNBRw2JZ0xN+YIwvlzJ8+mQW 0AfJ/FNJ1ol34NpTnMF+jIZD7RRZsCxmQ== X-Google-Smtp-Source: AGHT+IFXBjfraEq/YO/NA0GRDjQe4IJrB9YPmNz2tSXZR2tjKNKlf40KDjJOt7z6ERBevmSF5gMOJg== X-Received: by 2002:a17:902:d485:b0:20c:6bff:fcb1 with SMTP id d9443c01a7336-215f3c5a646mr2436205ad.1.1733332973562; Wed, 04 Dec 2024 09:22:53 -0800 (PST) Received: from localhost ([2a03:2880:ff:e::]) by smtp.gmail.com with ESMTPSA id d9443c01a7336-2155157c2cfsm85734625ad.115.2024.12.04.09.22.52 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 04 Dec 2024 09:22:53 -0800 (PST) From: David Wei To: io-uring@vger.kernel.org, netdev@vger.kernel.org Cc: Jens Axboe , Pavel Begunkov , Jakub Kicinski , Paolo Abeni , "David S. Miller" , Eric Dumazet , Jesper Dangaard Brouer , David Ahern , Mina Almasry , Stanislav Fomichev , Joe Damato , Pedro Tammela Subject: [PATCH net-next v8 09/17] io_uring/zcrx: add interface queue and refill queue Date: Wed, 4 Dec 2024 09:21:48 -0800 Message-ID: <20241204172204.4180482-10-dw@davidwei.uk> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk> References: <20241204172204.4180482-1-dw@davidwei.uk> Precedence: bulk X-Mailing-List: io-uring@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: David Wei Add a new object called an interface queue (ifq) that represents a net rx queue that has been configured for zero copy. Each ifq is registered using a new registration opcode IORING_REGISTER_ZCRX_IFQ. The refill queue is allocated by the kernel and mapped by userspace using a new offset IORING_OFF_RQ_RING, in a similar fashion to the main SQ/CQ. It is used by userspace to return buffers that it is done with, which will then be re-used by the netdev again. 
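To make the flow above concrete, here is a rough userspace sketch of registering an ifq and its refill queue. It is not part of the patch: the syscall plumbing and the surrounding memory setup are assumptions, struct io_uring_zcrx_area_reg only appears later in the series, the uapi structs are taken from <linux/io_uring.h> with this series applied, and the ring is expected to be created with IORING_SETUP_DEFER_TASKRUN and IORING_SETUP_CQE32.

	#include <stdint.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/io_uring.h>

	/* Hypothetical helper: register one hw rx queue of ifindex for zero copy
	 * receive on an already created io_uring instance (ring_fd).
	 */
	static int register_zcrx(int ring_fd, uint32_t ifindex, uint32_t rx_queue,
				 void *area_mem, size_t area_len,
				 struct io_uring_region_desc *rd)
	{
		struct io_uring_zcrx_area_reg area = {
			.addr = (uint64_t)(uintptr_t)area_mem,	/* page aligned user memory */
			.len  = area_len,			/* multiple of PAGE_SIZE */
		};
		struct io_uring_zcrx_ifq_reg reg = {
			.if_idx		= ifindex,	/* netdev to receive from */
			.if_rxq		= rx_queue,	/* hw rx queue configured for zc */
			.rq_entries	= 4096,		/* requested refill queue size */
			.area_ptr	= (uint64_t)(uintptr_t)&area,
			.region_ptr	= (uint64_t)(uintptr_t)rd,	/* memory backing the refill ring */
		};

		if (syscall(__NR_io_uring_register, ring_fd,
			    IORING_REGISTER_ZCRX_IFQ, &reg, 1))
			return -1;

		/* On success the kernel fills reg.offsets.{head,tail,rqes}, telling
		 * userspace where the refill ring head, tail and rqe array live
		 * inside the mapped ring region, similar to the main SQ/CQ layout.
		 */
		return 0;
	}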
The main CQ ring is used to notify userspace of received data by using the upper 16 bytes of a big CQE as a new struct io_uring_zcrx_cqe. Each entry contains the offset + len to the data. For now, each io_uring instance only has a single ifq. Signed-off-by: David Wei --- Kconfig | 2 + include/linux/io_uring_types.h | 4 + include/uapi/linux/io_uring.h | 43 +++++++++- io_uring/KConfig | 10 +++ io_uring/Makefile | 1 + io_uring/io_uring.c | 7 ++ io_uring/register.c | 7 ++ io_uring/zcrx.c | 151 +++++++++++++++++++++++++++++++++ io_uring/zcrx.h | 38 +++++++++ 9 files changed, 262 insertions(+), 1 deletion(-) create mode 100644 io_uring/KConfig create mode 100644 io_uring/zcrx.c create mode 100644 io_uring/zcrx.h diff --git a/Kconfig b/Kconfig index 745bc773f567..529ea7694ba9 100644 --- a/Kconfig +++ b/Kconfig @@ -30,3 +30,5 @@ source "lib/Kconfig" source "lib/Kconfig.debug" source "Documentation/Kconfig" + +source "io_uring/KConfig" diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 593c10a02144..fecd53544a93 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -40,6 +40,8 @@ enum io_uring_cmd_flags { IO_URING_F_TASK_DEAD = (1 << 13), }; +struct io_zcrx_ifq; + struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -377,6 +379,8 @@ struct io_ring_ctx { struct wait_queue_head poll_wq; struct io_restriction restrictions; + struct io_zcrx_ifq *ifq; + u32 pers_next; struct xarray personalities; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4418d0192959..552377a1e496 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -622,7 +622,8 @@ enum io_uring_register_op { /* send MSG_RING without having a ring */ IORING_REGISTER_SEND_MSG_RING = 31, - /* 32 reserved for zc rx */ + /* register a netdev hw rx queue for zerocopy */ + IORING_REGISTER_ZCRX_IFQ = 32, /* resize CQ ring */ IORING_REGISTER_RESIZE_RINGS = 33, @@ -953,6 +954,46 @@ enum io_uring_socket_op { SOCKET_URING_OP_SETSOCKOPT, }; +/* Zero copy receive refill queue entry */ +struct io_uring_zcrx_rqe { + __u64 off; + __u32 len; + __u32 __pad; +}; + +struct io_uring_zcrx_cqe { + __u64 off; + __u64 __pad; +}; + +/* The bit from which area id is encoded into offsets */ +#define IORING_ZCRX_AREA_SHIFT 48 +#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1)) + +struct io_uring_zcrx_offsets { + __u32 head; + __u32 tail; + __u32 rqes; + __u32 __resv2; + __u64 __resv[2]; +}; + +/* + * Argument for IORING_REGISTER_ZCRX_IFQ + */ +struct io_uring_zcrx_ifq_reg { + __u32 if_idx; + __u32 if_rxq; + __u32 rq_entries; + __u32 flags; + + __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ + __u64 region_ptr; /* struct io_uring_region_desc * */ + + struct io_uring_zcrx_offsets offsets; + __u64 __resv[4]; +}; + #ifdef __cplusplus } #endif diff --git a/io_uring/KConfig b/io_uring/KConfig new file mode 100644 index 000000000000..9e2a4beba1ef --- /dev/null +++ b/io_uring/KConfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# io_uring configuration +# + +config IO_URING_ZCRX + def_bool y + depends on PAGE_POOL + depends on INET + depends on NET_RX_BUSY_POLL diff --git a/io_uring/Makefile b/io_uring/Makefile index 53167bef37d7..a95b0b8229c9 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ epoll.o statx.o timeout.o fdinfo.o \ cancel.o waitid.o register.o \ truncate.o memmap.o +obj-$(CONFIG_IO_URING_ZCRX) 
+= zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 801293399883..a69d6afe62f6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -97,6 +97,7 @@ #include "uring_cmd.h" #include "msg_ring.h" #include "memmap.h" +#include "zcrx.h" #include "timeout.h" #include "poll.h" @@ -2700,6 +2701,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); io_sqe_buffers_unregister(ctx); io_sqe_files_unregister(ctx); + io_unregister_zcrx_ifqs(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); io_alloc_cache_free(&ctx->apoll_cache, kfree); @@ -2865,6 +2867,11 @@ static __cold void io_ring_exit_work(struct work_struct *work) io_cqring_overflow_kill(ctx); mutex_unlock(&ctx->uring_lock); } + if (ctx->ifq) { + mutex_lock(&ctx->uring_lock); + io_shutdown_zcrx_ifqs(ctx); + mutex_unlock(&ctx->uring_lock); + } if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); diff --git a/io_uring/register.c b/io_uring/register.c index 1a60f4916649..8c68465b4f4c 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -30,6 +30,7 @@ #include "eventfd.h" #include "msg_ring.h" #include "memmap.h" +#include "zcrx.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -803,6 +804,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_clone_buffers(ctx, arg); break; + case IORING_REGISTER_ZCRX_IFQ: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_zcrx_ifq(ctx, arg); + break; case IORING_REGISTER_RESIZE_RINGS: ret = -EINVAL; if (!arg || nr_args != 1) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c new file mode 100644 index 000000000000..3e5644718f54 --- /dev/null +++ b/io_uring/zcrx.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include + +#include "io_uring.h" +#include "kbuf.h" +#include "memmap.h" +#include "zcrx.h" + +#define IO_RQ_MAX_ENTRIES 32768 + +static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, + struct io_uring_zcrx_ifq_reg *reg, + struct io_uring_region_desc *rd) +{ + size_t off, size; + void *ptr; + int ret; + + off = sizeof(struct io_uring); + size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; + if (size > rd->size) + return -EINVAL; + + ret = io_create_region(ifq->ctx, &ifq->region, rd); + if (ret < 0) + return ret; + + ptr = io_region_get_ptr(&ifq->region); + ifq->rq_ring = (struct io_uring *)ptr; + ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); + return 0; +} + +static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) +{ + io_free_region(ifq->ctx, &ifq->region); + ifq->rq_ring = NULL; + ifq->rqes = NULL; +} + +static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq; + + ifq = kzalloc(sizeof(*ifq), GFP_KERNEL); + if (!ifq) + return NULL; + + ifq->if_rxq = -1; + ifq->ctx = ctx; + return ifq; +} + +static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) +{ + io_free_rbuf_ring(ifq); + kfree(ifq); +} + +int io_register_zcrx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg) +{ + struct io_uring_zcrx_ifq_reg reg; + struct io_uring_region_desc rd; + struct io_zcrx_ifq *ifq; + size_t ring_sz, rqes_sz; + int ret; + + /* + * 1. Interface queue allocation. + * 2. It can observe data destined for sockets of other tasks. 
+ */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + /* mandatory io_uring features for zc rx */ + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && + ctx->flags & IORING_SETUP_CQE32)) + return -EINVAL; + if (ctx->ifq) + return -EBUSY; + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) + return -EFAULT; + if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) + return -EINVAL; + if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) + return -EINVAL; + if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { + if (!(ctx->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + reg.rq_entries = IO_RQ_MAX_ENTRIES; + } + reg.rq_entries = roundup_pow_of_two(reg.rq_entries); + + if (!reg.area_ptr) + return -EFAULT; + + ifq = io_zcrx_ifq_alloc(ctx); + if (!ifq) + return -ENOMEM; + + ret = io_allocate_rbuf_ring(ifq, ®, &rd); + if (ret) + goto err; + + ifq->rq_entries = reg.rq_entries; + ifq->if_rxq = reg.if_rxq; + + ring_sz = sizeof(struct io_uring); + rqes_sz = sizeof(struct io_uring_zcrx_rqe) * ifq->rq_entries; + reg.offsets.rqes = ring_sz; + reg.offsets.head = offsetof(struct io_uring, head); + reg.offsets.tail = offsetof(struct io_uring, tail); + + if (copy_to_user(arg, ®, sizeof(reg)) || + copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd))) { + ret = -EFAULT; + goto err; + } + + ctx->ifq = ifq; + return 0; +err: + io_zcrx_ifq_free(ifq); + return ret; +} + +void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq = ctx->ifq; + + lockdep_assert_held(&ctx->uring_lock); + + if (!ifq) + return; + + ctx->ifq = NULL; + io_zcrx_ifq_free(ifq); +} + +void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) +{ + lockdep_assert_held(&ctx->uring_lock); +} diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h new file mode 100644 index 000000000000..178c515fea04 --- /dev/null +++ b/io_uring/zcrx.h @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_ZC_RX_H +#define IOU_ZC_RX_H + +#include + +struct io_zcrx_ifq { + struct io_ring_ctx *ctx; + struct net_device *dev; + struct io_uring *rq_ring; + struct io_uring_zcrx_rqe *rqes; + u32 rq_entries; + + u32 if_rxq; + + struct io_mapped_region region; +}; + +#if defined(CONFIG_IO_URING_ZCRX) +int io_register_zcrx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg); +void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); +void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx); +#else +static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg) +{ + return -EOPNOTSUPP; +} +static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) +{ +} +static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) +{ +} +#endif + +#endif From patchwork Wed Dec 4 17:21:49 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: David Wei X-Patchwork-Id: 13894121 Received: from mail-pf1-f175.google.com (mail-pf1-f175.google.com [209.85.210.175]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6F364214A75 for ; Wed, 4 Dec 2024 17:22:55 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.210.175 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332977; cv=none; 
id d2e1a72fcca58-72590f9f93dsm1487364b3a.87.2024.12.04.09.22.54 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 04 Dec 2024 09:22:54 -0800 (PST) From: David Wei To: io-uring@vger.kernel.org, netdev@vger.kernel.org Cc: Jens Axboe , Pavel Begunkov , Jakub Kicinski , Paolo Abeni , "David S. Miller" , Eric Dumazet , Jesper Dangaard Brouer , David Ahern , Mina Almasry , Stanislav Fomichev , Joe Damato , Pedro Tammela Subject: [PATCH net-next v8 10/17] io_uring/zcrx: add io_zcrx_area Date: Wed, 4 Dec 2024 09:21:49 -0800 Message-ID: <20241204172204.4180482-11-dw@davidwei.uk> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk> References: <20241204172204.4180482-1-dw@davidwei.uk> Precedence: bulk X-Mailing-List: io-uring@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: David Wei Add io_zcrx_area that represents a region of userspace memory that is used for zero copy. During ifq registration, userspace passes in the uaddr and len of userspace memory, which is then pinned by the kernel. Each net_iov is mapped to one of these pages. The freelist is a spinlock protected list that keeps track of all the net_iovs/pages that aren't used. For now, there is only one area per ifq and area registration happens implicitly as part of ifq registration. There is no API for adding/removing areas yet. The struct for area registration is there for future extensibility once we support multiple areas and TCP devmem. Reviewed-by: Jens Axboe Signed-off-by: Pavel Begunkov Signed-off-by: David Wei --- include/uapi/linux/io_uring.h | 9 ++++ io_uring/rsrc.c | 2 +- io_uring/rsrc.h | 1 + io_uring/zcrx.c | 93 ++++++++++++++++++++++++++++++++++- io_uring/zcrx.h | 16 ++++++ 5 files changed, 118 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 552377a1e496..7d72de92378d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -978,6 +978,15 @@ struct io_uring_zcrx_offsets { __u64 __resv[2]; }; +struct io_uring_zcrx_area_reg { + __u64 addr; + __u64 len; + __u64 rq_area_token; + __u32 flags; + __u32 __resv1; + __u64 __resv2[2]; +}; + /* * Argument for IORING_REGISTER_ZCRX_IFQ */ diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index adaae8630932..0d05e9944b0e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -77,7 +77,7 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -static int io_buffer_validate(struct iovec *iov) +int io_buffer_validate(struct iovec *iov) { unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 7a4668deaa1a..bd23387c1549 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -65,6 +65,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type); int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); +int io_buffer_validate(struct iovec *iov); static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data, int index) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 3e5644718f54..8f838add94a4 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -10,6 +10,7 @@ #include "kbuf.h" #include "memmap.h" #include "zcrx.h" +#include "rsrc.h" #define IO_RQ_MAX_ENTRIES 32768 @@ -43,6 +44,83 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) ifq->rqes = NULL; } +static void io_zcrx_free_area(struct io_zcrx_area *area) +{ + if 
(area->freelist) + kvfree(area->freelist); + if (area->nia.niovs) + kvfree(area->nia.niovs); + if (area->pages) { + unpin_user_pages(area->pages, area->nia.num_niovs); + kvfree(area->pages); + } + kfree(area); +} + +static int io_zcrx_create_area(struct io_ring_ctx *ctx, + struct io_zcrx_ifq *ifq, + struct io_zcrx_area **res, + struct io_uring_zcrx_area_reg *area_reg) +{ + struct io_zcrx_area *area; + int i, ret, nr_pages; + struct iovec iov; + + if (area_reg->flags || area_reg->rq_area_token) + return -EINVAL; + if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) + return -EINVAL; + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) + return -EINVAL; + + iov.iov_base = u64_to_user_ptr(area_reg->addr); + iov.iov_len = area_reg->len; + ret = io_buffer_validate(&iov); + if (ret) + return ret; + + ret = -ENOMEM; + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (!area) + goto err; + + area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, + &nr_pages); + if (IS_ERR(area->pages)) { + ret = PTR_ERR(area->pages); + area->pages = NULL; + goto err; + } + area->nia.num_niovs = nr_pages; + + area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]), + GFP_KERNEL | __GFP_ZERO); + if (!area->nia.niovs) + goto err; + + area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]), + GFP_KERNEL | __GFP_ZERO); + if (!area->freelist) + goto err; + + for (i = 0; i < nr_pages; i++) { + area->freelist[i] = i; + } + + area->free_count = nr_pages; + area->ifq = ifq; + /* we're only supporting one area per ifq for now */ + area->area_id = 0; + area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; + spin_lock_init(&area->freelist_lock); + *res = area; + return 0; +err: + if (area) + io_zcrx_free_area(area); + return ret; +} + static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) { struct io_zcrx_ifq *ifq; @@ -58,6 +136,9 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) { + if (ifq->area) + io_zcrx_free_area(ifq->area); + io_free_rbuf_ring(ifq); kfree(ifq); } @@ -65,6 +146,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) { + struct io_uring_zcrx_area_reg area; struct io_uring_zcrx_ifq_reg reg; struct io_uring_region_desc rd; struct io_zcrx_ifq *ifq; @@ -99,7 +181,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, } reg.rq_entries = roundup_pow_of_two(reg.rq_entries); - if (!reg.area_ptr) + if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area))) return -EFAULT; ifq = io_zcrx_ifq_alloc(ctx); @@ -110,6 +192,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (ret) goto err; + ret = io_zcrx_create_area(ctx, ifq, &ifq->area, &area); + if (ret) + goto err; + ifq->rq_entries = reg.rq_entries; ifq->if_rxq = reg.if_rxq; @@ -124,7 +210,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ret = -EFAULT; goto err; } - + if (copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) { + ret = -EFAULT; + goto err; + } ctx->ifq = ifq; return 0; err: diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 178c515fea04..07742c0cfcf3 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -3,10 +3,26 @@ #define IOU_ZC_RX_H #include +#include + +struct io_zcrx_area { + struct net_iov_area nia; + struct io_zcrx_ifq *ifq; + + u16 area_id; + struct page **pages; + + /* freelist */ + spinlock_t freelist_lock 
____cacheline_aligned_in_smp; + u32 free_count; + u32 *freelist; +}; struct io_zcrx_ifq { struct io_ring_ctx *ctx; struct net_device *dev; + struct io_zcrx_area *area; + struct io_uring *rq_ring; struct io_uring_zcrx_rqe *rqes; u32 rq_entries; From patchwork Wed Dec 4 17:21:50 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: David Wei X-Patchwork-Id: 13894122 Received: from mail-pf1-f170.google.com (mail-pf1-f170.google.com [209.85.210.170]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id AB0E9215F4C for ; Wed, 4 Dec 2024 17:22:56 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.210.170 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332978; cv=none; b=g9+l6CZvqhnNnutsI4YMdk5QjnyDfawdp2nZUEgeQrE3OLYtr6PUt3Kxi4DHtdM4ltdJe30GHpPcGy8vpYUwkpPleRoDwkOwkvu+UUG66ScizWdx5ojlzqIuiewsIawW/HaWCgfqzLHgH241g0/OMBq/ZcVlh5Vw1OWTYkxoZBs= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332978; c=relaxed/simple; bh=7AS+9tBSgI7VlYar3ZmBNBANXy1BPT0oGmeQWHLrQmI=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=TVzwXmDmh4Tjh4FSQ3JuIUz+qLFyhvgo+yhwL6lJrBbHbRqofMf3BH2dlZWwYnxzwKZvge0ffx5l5wR0lHneQrHJS5UEn+SkmcQpi54JurKs7qEtE76xQEtO3GuvXkpmvSFGCTnqwQBRf+YHH4rjDtWFuuWqDjQKVe3uPuLSZSI= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk; spf=none smtp.mailfrom=davidwei.uk; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b=kQtfq0L9; arc=none smtp.client-ip=209.85.210.170 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b="kQtfq0L9" Received: by mail-pf1-f170.google.com with SMTP id d2e1a72fcca58-725935d001cso47459b3a.2 for ; Wed, 04 Dec 2024 09:22:56 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=davidwei-uk.20230601.gappssmtp.com; s=20230601; t=1733332976; x=1733937776; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=vFNGCB1W1eYtsSjdWKUaYilBUOhH2kRp5MrbYu76ozo=; b=kQtfq0L9rd6+Fgo0Jn2rmu60NX3VdlKxhB1TCwJ2sBnBoiOv9ue8Pb8oBh4gu/gJJo LfXz/SFPE9UghFwmrtjTL7IFP15yJQI+FmyH/nrPGT5hpnG7wVcA+DIWc40MZAvwAWnJ IabKYpLxA0gO96FHYAAOYsJ+QL782lVlezucAH4NHqJeAxk6x8DpLwEQ3gcxLDX8U7My zbANi0PMXAD6DUmB6NY6cyP1rHj33RWQA0QGYD3AlX1ejissAgtVZKqz+KtgVkhJmF0P BeeQgVmmgnFPmIKVnh93+22rbW8Sq2Zx36rhmNh2dfCzRI6NTSSWN6omGSjxVKMYUX47 7+HQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1733332976; x=1733937776; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=vFNGCB1W1eYtsSjdWKUaYilBUOhH2kRp5MrbYu76ozo=; b=Lep77yHbNsNhB0HvOcfcUK+cnygVv7CUFMgdiWydsvZFPKc9WIyCm2CXpC6huWY8uD pAoo/2C50ETUUr9I8++7gQXUFZUge/43qHa6JfZsasDmABEKtYFHP4IVCO8fKKIWr0Xe 
From: David Wei To: io-uring@vger.kernel.org, netdev@vger.kernel.org Cc: Jens Axboe , Pavel Begunkov , Jakub Kicinski , Paolo Abeni , "David S. Miller" , Eric Dumazet , Jesper Dangaard Brouer , David Ahern , Mina Almasry , Stanislav Fomichev , Joe Damato , Pedro Tammela Subject: [PATCH net-next v8 11/17] io_uring/zcrx: implement zerocopy receive pp memory provider Date: Wed, 4 Dec 2024 09:21:50 -0800 Message-ID: <20241204172204.4180482-12-dw@davidwei.uk> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk> References: <20241204172204.4180482-1-dw@davidwei.uk> Precedence: bulk X-Mailing-List: io-uring@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Pavel Begunkov

Implement a page pool memory provider for io_uring to receive in a zero copy fashion. For that, the provider allocates user pages wrapped into struct net_iovs, which are stored in a previously registered struct net_iov_area.

Unlike with traditional receives, for which pages from a page pool can be deallocated right after the user receives data, e.g. via recv(2), we extend the lifetime by recycling buffers only after the user space acknowledges that it's done processing the data via the refill queue. Before handing buffers to the user, we mark them by bumping the refcount by a bias value IO_ZC_RX_UREF, which will be checked when the buffer is returned. When the corresponding io_uring instance and/or page pool are destroyed, we'll force back all buffers that are currently in user space in ->io_pp_zc_scrub by clearing the bias.

Refcounting and lifetime: initially, all buffers are considered unallocated and stored in ->freelist, at which point they are not yet directly exposed to the core page pool code and not accounted to the page pool's pages_state_hold_cnt. The ->alloc_netmems callback will allocate them by placing them into the page pool's cache, setting the refcount to 1 as usual and adjusting pages_state_hold_cnt. Then, either the buffer is dropped and returned to the ->freelist via io_pp_zc_release_netmem, in which case the page pool will match hold_cnt for us with ->pages_state_release_cnt, or, more likely, the buffer will go through the network/protocol stacks and end up in the corresponding socket's receive queue. From there the user can get it via a new io_uring request implemented in following patches.
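Condensed, the biasing described above amounts to the following. This is an illustrative fragment only: niov and pp are assumed to be in scope, IO_ZC_RX_UREF is defined as 0x10000 in zcrx.h later in this patch, and the real logic lives in io_zcrx_get_buf_uref(), io_zcrx_put_niov_uref() and io_zcrx_niov_put() in this series.

	/* handing a buffer to userspace: add the user bias on top of any
	 * page pool references
	 */
	atomic_long_add(IO_ZC_RX_UREF, &niov->pp_ref_count);

	/* refill path: honour the returned entry only if a user bias is present
	 * and dropping it leaves no references at all, in which case the net_iov
	 * goes straight back into the page pool's allocation cache
	 */
	if (atomic_long_read(&niov->pp_ref_count) >= IO_ZC_RX_UREF &&
	    atomic_long_sub_and_test(IO_ZC_RX_UREF, &niov->pp_ref_count))
		page_pool_mp_return_in_cache(pp, net_iov_to_netmem(niov));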
As mentioned above, before giving a buffer to the user we bump the refcount by IO_ZC_RX_UREF. Once the user is done with the buffer processing, it must return it back via the refill queue, from where our ->alloc_netmems implementation can grab it, check references, put IO_ZC_RX_UREF, and recycle the buffer if there are no more users left. As we place such buffers right back into the page pools fast cache and they didn't go through the normal pp release path, they are still considered "allocated" and no pp hold_cnt is required. For the same reason we dma sync buffers for the device in io_zc_add_pp_cache(). Reviewed-by: Jens Axboe Signed-off-by: Pavel Begunkov Signed-off-by: David Wei --- io_uring/zcrx.c | 215 ++++++++++++++++++++++++++++++++++++++++++++++++ io_uring/zcrx.h | 5 ++ 2 files changed, 220 insertions(+) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 8f838add94a4..7919f5e52c73 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -2,7 +2,12 @@ #include #include #include +#include +#include #include +#include +#include +#include #include @@ -14,6 +19,16 @@ #define IO_RQ_MAX_ENTRIES 32768 +__maybe_unused +static const struct memory_provider_ops io_uring_pp_zc_ops; + +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) +{ + struct net_iov_area *owner = net_iov_owner(niov); + + return container_of(owner, struct io_zcrx_area, nia); +} + static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_ifq_reg *reg, struct io_uring_region_desc *rd) @@ -104,6 +119,9 @@ static int io_zcrx_create_area(struct io_ring_ctx *ctx, goto err; for (i = 0; i < nr_pages; i++) { + struct net_iov *niov = &area->nia.niovs[i]; + + niov->owner = &area->nia; area->freelist[i] = i; } @@ -238,3 +256,200 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) { lockdep_assert_held(&ctx->uring_lock); } + +static bool io_zcrx_niov_put(struct net_iov *niov, int nr) +{ + return atomic_long_sub_and_test(nr, &niov->pp_ref_count); +} + +static bool io_zcrx_put_niov_uref(struct net_iov *niov) +{ + if (atomic_long_read(&niov->pp_ref_count) < IO_ZC_RX_UREF) + return false; + + return io_zcrx_niov_put(niov, IO_ZC_RX_UREF); +} + +static inline void io_zc_add_pp_cache(struct page_pool *pp, + struct net_iov *niov) +{ +} + +static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq) +{ + u32 entries; + + entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head; + return min(entries, ifq->rq_entries); +} + +static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq, + unsigned mask) +{ + unsigned int idx = ifq->cached_rq_head++ & mask; + + return &ifq->rqes[idx]; +} + +static void io_zcrx_ring_refill(struct page_pool *pp, + struct io_zcrx_ifq *ifq) +{ + unsigned int entries = io_zcrx_rqring_entries(ifq); + unsigned int mask = ifq->rq_entries - 1; + + entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count); + if (unlikely(!entries)) + return; + + do { + struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask); + struct io_zcrx_area *area; + struct net_iov *niov; + unsigned niov_idx, area_idx; + + area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT; + niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) / PAGE_SIZE; + + if (unlikely(rqe->__pad || area_idx)) + continue; + area = ifq->area; + + if (unlikely(niov_idx >= area->nia.num_niovs)) + continue; + niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs); + + niov = &area->nia.niovs[niov_idx]; + if (!io_zcrx_put_niov_uref(niov)) + continue; + page_pool_mp_return_in_cache(pp, 
net_iov_to_netmem(niov)); + } while (--entries); + + smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head); +} + +static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq) +{ + struct io_zcrx_area *area = ifq->area; + + spin_lock_bh(&area->freelist_lock); + while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) { + struct net_iov *niov; + u32 pgid; + + pgid = area->freelist[--area->free_count]; + niov = &area->nia.niovs[pgid]; + + page_pool_mp_return_in_cache(pp, net_iov_to_netmem(niov)); + + pp->pages_state_hold_cnt++; + trace_page_pool_state_hold(pp, net_iov_to_netmem(niov), + pp->pages_state_hold_cnt); + } + spin_unlock_bh(&area->freelist_lock); +} + +static void io_zcrx_recycle_niov(struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + spin_lock_bh(&area->freelist_lock); + area->freelist[area->free_count++] = net_iov_idx(niov); + spin_unlock_bh(&area->freelist_lock); +} + +static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) +{ + struct io_zcrx_ifq *ifq = pp->mp_priv; + + /* pp should already be ensuring that */ + if (unlikely(pp->alloc.count)) + goto out_return; + + io_zcrx_ring_refill(pp, ifq); + if (likely(pp->alloc.count)) + goto out_return; + + io_zcrx_refill_slow(pp, ifq); + if (!pp->alloc.count) + return 0; +out_return: + return pp->alloc.cache[--pp->alloc.count]; +} + +static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem) +{ + struct net_iov *niov; + + if (WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + return false; + + niov = netmem_to_net_iov(netmem); + + if (io_zcrx_niov_put(niov, 1)) + io_zcrx_recycle_niov(niov); + return false; +} + +static void io_pp_zc_scrub(struct page_pool *pp) +{ + struct io_zcrx_ifq *ifq = pp->mp_priv; + struct io_zcrx_area *area = ifq->area; + int i; + + /* Reclaim back all buffers given to the user space. 
*/ + for (i = 0; i < area->nia.num_niovs; i++) { + struct net_iov *niov = &area->nia.niovs[i]; + int count; + + if (!io_zcrx_put_niov_uref(niov)) + continue; + io_zcrx_recycle_niov(niov); + + count = atomic_inc_return_relaxed(&pp->pages_state_release_cnt); + trace_page_pool_state_release(pp, net_iov_to_netmem(niov), count); + } +} + +static int io_pp_zc_init(struct page_pool *pp) +{ + struct io_zcrx_ifq *ifq = pp->mp_priv; + struct io_zcrx_area *area = ifq->area; + int ret; + + if (!ifq) + return -EINVAL; + if (pp->p.order != 0) + return -EINVAL; + if (!pp->p.napi) + return -EINVAL; + + ret = page_pool_mp_init_paged_area(pp, &area->nia, area->pages); + if (ret) + return ret; + + percpu_ref_get(&ifq->ctx->refs); + ifq->pp = pp; + return 0; +} + +static void io_pp_zc_destroy(struct page_pool *pp) +{ + struct io_zcrx_ifq *ifq = pp->mp_priv; + struct io_zcrx_area *area = ifq->area; + + page_pool_mp_release_area(pp, &ifq->area->nia); + + ifq->pp = NULL; + + if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs)) + return; + percpu_ref_put(&ifq->ctx->refs); +} + +static const struct memory_provider_ops io_uring_pp_zc_ops = { + .alloc_netmems = io_pp_zc_alloc_netmems, + .release_netmem = io_pp_zc_release_netmem, + .init = io_pp_zc_init, + .destroy = io_pp_zc_destroy, + .scrub = io_pp_zc_scrub, +}; diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 07742c0cfcf3..8515cde78a2c 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -5,6 +5,9 @@ #include #include +#define IO_ZC_RX_UREF 0x10000 +#define IO_ZC_RX_KREF_MASK (IO_ZC_RX_UREF - 1) + struct io_zcrx_area { struct net_iov_area nia; struct io_zcrx_ifq *ifq; @@ -22,10 +25,12 @@ struct io_zcrx_ifq { struct io_ring_ctx *ctx; struct net_device *dev; struct io_zcrx_area *area; + struct page_pool *pp; struct io_uring *rq_ring; struct io_uring_zcrx_rqe *rqes; u32 rq_entries; + u32 cached_rq_head; u32 if_rxq; From patchwork Wed Dec 4 17:21:51 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: David Wei X-Patchwork-Id: 13894123 Received: from mail-pl1-f169.google.com (mail-pl1-f169.google.com [209.85.214.169]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 32EA5215F61 for ; Wed, 4 Dec 2024 17:22:58 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.214.169 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332979; cv=none; b=q818k2ww+0UrZXm0ucEgni93mggqPQbTJBU0Nyv38Yeq1rZ9oT5oPT5hm+e/bshcHzjx31NWJce19Tr4+b/HFkBdK/T8ASYx+8JKgEsp80QXO5q5+BgpfEtcF2uQbFiibdneheaMYqs9lHtzXHXIH0n1YfsUUOZ8q8c8ggkJ9j8= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332979; c=relaxed/simple; bh=x/uZA805S3iVIhd3bfJ1Xb1AvU2SLvAinmwQ3ZLvkDI=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=QUnmtePi6f8Tv/5cFMNmtsu06gRmS20cfu0064HrxFvcNNlX0l3HCQjff/ZIlSKRdK3GyaUJnU98BD7iq3XwLCcshrqscedWIjMUdw7BkUC91cvpn1TcWd1K5O6ZizP9WvQaxVcMCOaL1jHENcQFEAYLzTfk72MS0P/oz0CdfoU= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk; spf=none smtp.mailfrom=davidwei.uk; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b=X3nTMss8; arc=none smtp.client-ip=209.85.214.169 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) 
header.from=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b="X3nTMss8" Received: by mail-pl1-f169.google.com with SMTP id d9443c01a7336-215666ea06aso10143335ad.0 for ; Wed, 04 Dec 2024 09:22:57 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=davidwei-uk.20230601.gappssmtp.com; s=20230601; t=1733332977; x=1733937777; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=36AKyg8XNehMVPLo0eGDF2m2iBOmxflwQFk/rKqTLQo=; b=X3nTMss8K7wEAzl4hwwP8zpOoaQoqBVXvMOzYBwAtGK1B38G81r5L0QLjfW3TqDvd7 FgYK/BmR4zBgpsLHuAK/GYqhPZgFpnAVV1d6CjaP4IZ3fNRFlvzMJ5VaGIo6rTF514uF UAAzCU70jNfGSvANwVy7BtPgNlaX5rKz7+MyiAMfg2TGaX2ueRpc1uXlmIHK8AHLbzEX u92hYHU/KSMp5MIgEN2wZyF6OhmU+ImvxL6a3G9R0V6cLK4J+IFEcD4ufP94DJWBKO7+ dVtodhT7C9d/kt9mkXZ/fZtg+K9zPC20D6gY6j2ij0sFLb0D7qmihV+a+A7otSifZWZB 0PWw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1733332977; x=1733937777; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=36AKyg8XNehMVPLo0eGDF2m2iBOmxflwQFk/rKqTLQo=; b=ITTwWt+szKIXpz6GuZWoj2DdfDUSNWfv5fGQujrAiMWI0J9Xlk4wAPk440MHJ2oZb5 KKmyxsRTzyp6kF9OAkQkqzb89a/wyKPr5gx9WY57gNyxcoF0ToMUKkI0s0iIZw9WFXXf lxdS/8SoJA+oNrQjYSUBx4UfvoylYSMDEYmiESuO0a9gJRYhd7302ldXioiONlssWyh/ yx0epwd1We/dZJY8jGJbjPmm6YY4gMbGemR6B1T9tKRM+Xxw66tjH+WvDGK//MtFHMDM z5ra4EmqMsdxCfWLPm6W7Ju/yGFsCfIaFkJjR/z+olej/hljJSFBn3HRD7aCrICzWoCf y9XQ== X-Gm-Message-State: AOJu0Yyg44S8J1xVOLMy3WZdp9MnBZUn1rRFfwXC9ygD62s56Dmg8RgI 186H6JFdQzwFIxe0JXjJA3yeCjUEhkzbUStfgL0UPFB5B0z53KMYWT3MauomnAMDOBkPkmLGU/R 6 X-Gm-Gg: ASbGncsuERsF78LwJYpEqwioUcpN6R08Ks0Hk8C+NJMdukVV+iRIPTLTErNsTGT65yM SMGXWThwBPX3SWyv0Ma3iJqPM7CXw7vRke8xPViq9MexN3WqPlgZpGBTS8ig5LQcMqTxRA8DnYA VAyBMI689gtgSE6TQ6b1urIuo2oiX3+TeAEBdlt+0uSsvHtMRZ5hfJGpaX74dGKdM1yA1aLNAVM 66OhBrl3ONve+6kRIe+pbgYiuK01X2ecg== X-Google-Smtp-Source: AGHT+IGzx/c2WVMwensNoW5kMz0LaqsR6R0VjtGxt2XKoBe6e2/suMvQRNbm1Iju6RoPSXD6zETw7w== X-Received: by 2002:a17:902:e743:b0:215:5d8c:7e3f with SMTP id d9443c01a7336-215f3cfb7f8mr1957905ad.23.1733332977439; Wed, 04 Dec 2024 09:22:57 -0800 (PST) Received: from localhost ([2a03:2880:ff:4::]) by smtp.gmail.com with ESMTPSA id d9443c01a7336-215609fadb6sm78440225ad.243.2024.12.04.09.22.56 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 04 Dec 2024 09:22:56 -0800 (PST) From: David Wei To: io-uring@vger.kernel.org, netdev@vger.kernel.org Cc: Jens Axboe , Pavel Begunkov , Jakub Kicinski , Paolo Abeni , "David S. Miller" , Eric Dumazet , Jesper Dangaard Brouer , David Ahern , Mina Almasry , Stanislav Fomichev , Joe Damato , Pedro Tammela Subject: [PATCH net-next v8 12/17] io_uring/zcrx: add io_recvzc request Date: Wed, 4 Dec 2024 09:21:51 -0800 Message-ID: <20241204172204.4180482-13-dw@davidwei.uk> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk> References: <20241204172204.4180482-1-dw@davidwei.uk> Precedence: bulk X-Mailing-List: io-uring@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Add io_uring opcode OP_RECV_ZC for doing zero copy reads out of a socket. 
Only the connection should land on the specific rx queue set up for zero copy, and the socket must be handled by the io_uring instance that the rx queue was registered for zero copy with. That's because net_iovs / buffers from our queue cannot be read by outside applications, and zero copy is not possible if traffic for the zero copy connection goes to another queue. This coordination is outside the scope of this patch series. Also, any traffic directed to the zero copy enabled queue is immediately visible to the application, which is why CAP_NET_ADMIN is required at the registration step. Of course, no data is actually read out of the socket; it has already been copied by the netdev into userspace memory via DMA. OP_RECV_ZC reads skbs out of the socket and checks that their frags are indeed net_iovs that belong to io_uring. A cqe is queued for each one of these frags. Recall that each cqe is a big cqe, with the top half being an io_uring_zcrx_cqe. The cqe res field contains the len or error. The lower IORING_ZCRX_AREA_SHIFT bits of the struct io_uring_zcrx_cqe::off field contain the offset relative to the start of the zero copy area. The upper part of the off field is trivially zero, and will be used to carry the area id. For now, there is no limit on how much work each OP_RECV_ZC request does. It will attempt to drain a socket of all available data. This request always operates in multishot mode. Reviewed-by: Jens Axboe Signed-off-by: David Wei --- include/uapi/linux/io_uring.h | 2 + io_uring/io_uring.h | 10 ++ io_uring/net.c | 72 +++++++++++++ io_uring/opdef.c | 16 +++ io_uring/zcrx.c | 186 +++++++++++++++++++++++++++++++++- io_uring/zcrx.h | 13 +++ 6 files changed, 298 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 7d72de92378d..6a8ee24a79c6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -87,6 +87,7 @@ struct io_uring_sqe { union { __s32 splice_fd_in; __u32 file_index; + __u32 zcrx_ifq_idx; __u32 optlen; struct { __u16 addr_len; @@ -262,6 +263,7 @@ enum io_uring_op { IORING_OP_FTRUNCATE, IORING_OP_BIND, IORING_OP_LISTEN, + IORING_OP_RECV_ZC, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 4070d4c8ef97..0f54d73b80c5 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -182,6 +182,16 @@ static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret return io_get_cqe_overflow(ctx, ret, false); } +static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, + struct io_uring_cqe **cqe_ret) +{ + io_lockdep_assert_cq_locked(ctx); + + ctx->cq_extra++; + ctx->submit_state.cq_flush = true; + return io_get_cqe(ctx, cqe_ret); +} + static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req) { diff --git a/io_uring/net.c b/io_uring/net.c index df1f7dc6f1c8..f1431317182e 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -16,6 +16,7 @@ #include "net.h" #include "notif.h" #include "rsrc.h" +#include "zcrx.h" #if defined(CONFIG_NET) struct io_shutdown { @@ -88,6 +89,13 @@ struct io_sr_msg { */ #define MULTISHOT_MAX_RETRY 32 +struct io_recvzc { + struct file *file; + unsigned msg_flags; + u16 flags; + struct io_zcrx_ifq *ifq; +}; + int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); @@ -1208,6 +1216,70 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) return
ret; } +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); + unsigned ifq_idx; + + if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr || + sqe->len || sqe->addr3)) + return -EINVAL; + + ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx); + if (ifq_idx != 0) + return -EINVAL; + zc->ifq = req->ctx->ifq; + if (!zc->ifq) + return -EINVAL; + + zc->flags = READ_ONCE(sqe->ioprio); + zc->msg_flags = READ_ONCE(sqe->msg_flags); + if (zc->msg_flags) + return -EINVAL; + if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)) + return -EINVAL; + /* multishot required */ + if (!(zc->flags & IORING_RECV_MULTISHOT)) + return -EINVAL; + /* All data completions are posted as aux CQEs. */ + req->flags |= REQ_F_APOLL_MULTISHOT; + + return 0; +} + +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); + struct socket *sock; + int ret; + + if (!(req->flags & REQ_F_POLLED) && + (zc->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT, + issue_flags); + if (unlikely(ret <= 0) && ret != -EAGAIN) { + if (ret == -ERESTARTSYS) + ret = -EINTR; + + req_set_fail(req); + io_req_set_res(req, ret, 0); + + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_STOP_MULTISHOT; + return IOU_OK; + } + + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_ISSUE_SKIP_COMPLETE; + return -EAGAIN; +} + void io_send_zc_cleanup(struct io_kiocb *req) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 3de75eca1c92..6ae00c0af9a8 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -36,6 +36,7 @@ #include "waitid.h" #include "futex.h" #include "truncate.h" +#include "zcrx.h" static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) { @@ -513,6 +514,18 @@ const struct io_issue_def io_issue_defs[] = { .async_size = sizeof(struct io_async_msghdr), #else .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_RECV_ZC] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .ioprio = 1, +#if defined(CONFIG_NET) + .prep = io_recvzc_prep, + .issue = io_recvzc, +#else + .prep = io_eopnotsupp_prep, #endif }, }; @@ -744,6 +757,9 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_LISTEN] = { .name = "LISTEN", }, + [IORING_OP_RECV_ZC] = { + .name = "RECV_ZC", + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 7919f5e52c73..004730d16e8f 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include @@ -19,7 +21,12 @@ #define IO_RQ_MAX_ENTRIES 32768 -__maybe_unused +struct io_zcrx_args { + struct io_kiocb *req; + struct io_zcrx_ifq *ifq; + struct socket *sock; +}; + static const struct memory_provider_ops io_uring_pp_zc_ops; static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) @@ -257,6 +264,11 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) lockdep_assert_held(&ctx->uring_lock); } +static void io_zcrx_get_buf_uref(struct net_iov *niov) +{ + atomic_long_add(IO_ZC_RX_UREF, &niov->pp_ref_count); +} + static bool io_zcrx_niov_put(struct net_iov *niov, int nr) { return atomic_long_sub_and_test(nr, &niov->pp_ref_count); @@ -453,3 +465,175 @@ static const struct memory_provider_ops 
io_uring_pp_zc_ops = { .destroy = io_pp_zc_destroy, .scrub = io_pp_zc_scrub, }; + +static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, + struct io_zcrx_ifq *ifq, int off, int len) +{ + struct io_uring_zcrx_cqe *rcqe; + struct io_zcrx_area *area; + struct io_uring_cqe *cqe; + u64 offset; + + if (!io_defer_get_uncommited_cqe(req->ctx, &cqe)) + return false; + + cqe->user_data = req->cqe.user_data; + cqe->res = len; + cqe->flags = IORING_CQE_F_MORE; + + area = io_zcrx_iov_to_area(niov); + offset = off + (net_iov_idx(niov) << PAGE_SHIFT); + rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); + rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT); + rcqe->__pad = 0; + return true; +} + +static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + const skb_frag_t *frag, int off, int len) +{ + struct net_iov *niov; + + off += skb_frag_off(frag); + + if (unlikely(!skb_frag_is_net_iov(frag))) + return -EOPNOTSUPP; + + niov = netmem_to_net_iov(frag->netmem); + if (niov->pp->mp_ops != &io_uring_pp_zc_ops || + niov->pp->mp_priv != ifq) + return -EFAULT; + + if (!io_zcrx_queue_cqe(req, niov, ifq, off, len)) + return -ENOSPC; + io_zcrx_get_buf_uref(niov); + return len; +} + +static int +io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct io_zcrx_args *args = desc->arg.data; + struct io_zcrx_ifq *ifq = args->ifq; + struct io_kiocb *req = args->req; + struct sk_buff *frag_iter; + unsigned start, start_off; + int i, copy, end, off; + int ret = 0; + + start = skb_headlen(skb); + start_off = offset; + + if (offset < start) + return -EOPNOTSUPP; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + const skb_frag_t *frag; + + if (WARN_ON(start > offset + len)) + return -EFAULT; + + frag = &skb_shinfo(skb)->frags[i]; + end = start + skb_frag_size(frag); + + if (offset < end) { + copy = end - offset; + if (copy > len) + copy = len; + + off = offset - start; + ret = io_zcrx_recv_frag(req, ifq, frag, off, copy); + if (ret < 0) + goto out; + + offset += ret; + len -= ret; + if (len == 0 || ret != copy) + goto out; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + if (WARN_ON(start > offset + len)) + return -EFAULT; + + end = start + frag_iter->len; + if (offset < end) { + copy = end - offset; + if (copy > len) + copy = len; + + off = offset - start; + ret = io_zcrx_recv_skb(desc, frag_iter, off, copy); + if (ret < 0) + goto out; + + offset += ret; + len -= ret; + if (len == 0 || ret != copy) + goto out; + } + start = end; + } + +out: + if (offset == start_off) + return ret; + return offset - start_off; +} + +static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct sock *sk, int flags, + unsigned int issue_flags) +{ + struct io_zcrx_args args = { + .req = req, + .ifq = ifq, + .sock = sk->sk_socket, + }; + read_descriptor_t rd_desc = { + .count = 1, + .arg.data = &args, + }; + int ret; + + lock_sock(sk); + ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb); + if (ret <= 0) { + if (ret < 0 || sock_flag(sk, SOCK_DONE)) + goto out; + if (sk->sk_err) + ret = sock_error(sk); + else if (sk->sk_shutdown & RCV_SHUTDOWN) + goto out; + else if (sk->sk_state == TCP_CLOSE) + ret = -ENOTCONN; + else + ret = -EAGAIN; + } else if (sock_flag(sk, SOCK_DONE)) { + /* Make it to retry until it finally gets 0. 
*/ + if (issue_flags & IO_URING_F_MULTISHOT) + ret = IOU_REQUEUE; + else + ret = -EAGAIN; + } +out: + release_sock(sk); + return ret; +} + +int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned int issue_flags) +{ + struct sock *sk = sock->sk; + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot->recvmsg != tcp_recvmsg) + return -EPROTONOSUPPORT; + + sock_rps_record_flow(sk); + return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags); +} diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 8515cde78a2c..ffc3e333b4af 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -3,6 +3,7 @@ #define IOU_ZC_RX_H #include +#include #include #define IO_ZC_RX_UREF 0x10000 @@ -42,6 +43,9 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg); void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx); +int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned int issue_flags); #else static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) @@ -54,6 +58,15 @@ static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) { } +static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned int issue_flags) +{ + return -EOPNOTSUPP; +} #endif +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags); +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); + #endif From patchwork Wed Dec 4 17:21:52 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: David Wei X-Patchwork-Id: 13894124 Received: from mail-pf1-f176.google.com (mail-pf1-f176.google.com [209.85.210.176]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 77ADA215F74 for ; Wed, 4 Dec 2024 17:22:59 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.210.176 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332981; cv=none; b=sSpfAZ+ELrVBLaNhAUdM2XlPBUkhJCEVY3TBi/NyfBM4jIq2eLLB+a/31KQrl+tj8ZXkZxJ6Ew60R26xc4SC7ML42ofcT2wUH7Yp9h6/DlQlAGrCCFivv8VQ/dtYBmrsDt5ei3kXpzx+PFpgPVLfB4xrShuM/QYFb+o+Jvo7ZIo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332981; c=relaxed/simple; bh=yHkBS0Ae7tWKphGIsp/BscufJpA2CS64Q5wOxYY5YaY=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=iw+Qw2jmYrKWlrgRJRGMam1pC893IG2CmC79k/x3AW4ncuQHKGlvKdc1fQeRdRpo9mNNor1CThKgN/pABv/5fhJs7/4wLOTMokl1na0Hn+/fHZh4E3O/3GG0X7xhV6s/C7esDEG10kiKfjjLWi95wY4MFbBv4qLrtK91UluwRLU= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk; spf=none smtp.mailfrom=davidwei.uk; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b=pGD6cees; arc=none smtp.client-ip=209.85.210.176 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) 
header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b="pGD6cees" Received: by mail-pf1-f176.google.com with SMTP id d2e1a72fcca58-7251331e756so57879b3a.3 for ; Wed, 04 Dec 2024 09:22:59 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=davidwei-uk.20230601.gappssmtp.com; s=20230601; t=1733332979; x=1733937779; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=2VznSQpWwqyyoWSCkKyMSe9FismvnJKrDHHL+sPjQaA=; b=pGD6ceesua5fqcoDyv6tYtlyfBSCa74Ez8d8BPc+DWDUbdTkR7oqw7Z06A4lDTpRqW cMB/SAy8Nx97kwed1h2TYDAtCewFDjz+A1pPUuAH/lfo3dMVhvhw4TJuS/nBbKyioINg j6uFVqFDg8cd/iRfoNmC5TKG6lJpXPTZZapOh7GNxQDeIcvYCURtyMoD8NkjY0LCaBHK 1+woBjiyw5H/r+SR/4j56/clo2iyPjidb3D2IR02hNRyvTkg6pS6EW9ZmuKKJAoBuGji xYXrH1mvI5fn9kELhX7Rmbixan/slt0NYgxPte5MhaVPDFxkOa9rVzFgbI37jaw7hqVP uYAA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1733332979; x=1733937779; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=2VznSQpWwqyyoWSCkKyMSe9FismvnJKrDHHL+sPjQaA=; b=rozGZKqpGAve3UsDhBKG426PDI+w0g+IqetAQrXil3Qy1J6P1PZg9dbYbIGm3On71c Yv7xUahqc/SOHkpqAaN48xYELMmNd/mWeuRzRzvAHrlvr9Hs0bJJwwZYetGFqwNm68NJ iBf+6DsbJaP2nMv8SNjNMv6+Db3RCIVXhoahpAzj1KfFigeVaJIQC6zxIUPGmvK/m8RM 0IHrcFnfx+cyINBd5UcLkJzrZTcm1Gjcw/OdcCFJh/Deinn778/wJ5W1xSMgHTHhITzO A341C4CeDXFiY1a088mKHcXF6RVj8S/3gAEhV3ZWfRNUJYOkT0NWuZJjVNugtl3FbPVE 3D+w== X-Gm-Message-State: AOJu0YwTInomVSbNP/oNN0VO6ZGru80WvjTiw49aa76SQ5TzWlVSgtax B/e8ek2D0MvDmsleZG5NrPp9a+Pw3GKh2CLAWSWZ6dMVjIxmfy2zY17JQfk2O4epo1bzH6FmNlT l X-Gm-Gg: ASbGncuhbMB1utEr+6BsnGVodnR1RFsqoiMaTunLcmNcC+vj74os0IsioEkceL5Z1Eg CneizePHKXkrhpYMQ8Si8qXcfE0EZlcmTj116ksbh4t+UcYahbakkWNBiH1fsuy3+hT5prEIUzX ZycKl/n0F2DP/bvO0LWnBsfrcoiMxWRAI+8wZRHp2VtaUFECXp+cgoL0J8nfuxxFIdauMcbIe1Y 64qq9cb173Pv/I+kAirhkF68jWDq55m6Q== X-Google-Smtp-Source: AGHT+IEYZB2vCFuL7ZbQFmggnrjBxbbYL45SNh5yqfMSBWNINZX4pOWFcZW8ol1Bys8QeuJPGdAFuA== X-Received: by 2002:a05:6a00:3c96:b0:725:9ac3:f35 with SMTP id d2e1a72fcca58-7259ac31291mr1468948b3a.4.1733332978687; Wed, 04 Dec 2024 09:22:58 -0800 (PST) Received: from localhost ([2a03:2880:ff:e::]) by smtp.gmail.com with ESMTPSA id d2e1a72fcca58-72541762c0csm12633993b3a.33.2024.12.04.09.22.58 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 04 Dec 2024 09:22:58 -0800 (PST) From: David Wei To: io-uring@vger.kernel.org, netdev@vger.kernel.org Cc: Jens Axboe , Pavel Begunkov , Jakub Kicinski , Paolo Abeni , "David S. Miller" , Eric Dumazet , Jesper Dangaard Brouer , David Ahern , Mina Almasry , Stanislav Fomichev , Joe Damato , Pedro Tammela Subject: [PATCH net-next v8 13/17] io_uring/zcrx: set pp memory provider for an rx queue Date: Wed, 4 Dec 2024 09:21:52 -0800 Message-ID: <20241204172204.4180482-14-dw@davidwei.uk> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk> References: <20241204172204.4180482-1-dw@davidwei.uk> Precedence: bulk X-Mailing-List: io-uring@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: David Wei Set the page pool memory provider for the rx queue configured for zero copy to io_uring. Then the rx queue is reset using netdev_rx_queue_restart() and netdev core + page pool will take care of filling the rx queue from the io_uring zero copy memory provider. 
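At its core the takeover is just pointing the queue's memory provider parameters at io_uring and restarting the queue; a trimmed sketch of that sequence, with locking and error rollback elided (see io_open_zc_rxq() in the diff below for the full version):

	/* called under rtnl_lock() with a reference held on ifq->dev */
	rxq = __netif_get_rx_queue(ifq->dev, ifq_idx);
	rxq->mp_params.mp_ops = &io_uring_pp_zc_ops;
	rxq->mp_params.mp_priv = ifq;
	/* rebuilds the queue's page pool, now backed by the io_uring provider */
	ret = netdev_rx_queue_restart(ifq->dev, ifq_idx);
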
For now, there is only one ifq so its destruction happens implicitly during io_uring cleanup. Reviewed-by: Jens Axboe Signed-off-by: David Wei --- io_uring/zcrx.c | 92 +++++++++++++++++++++++++++++++++++++++++++++---- io_uring/zcrx.h | 2 ++ 2 files changed, 87 insertions(+), 7 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 004730d16e8f..0cba433c764a 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -36,6 +37,65 @@ static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *nio return container_of(owner, struct io_zcrx_area, nia); } +static int io_open_zc_rxq(struct io_zcrx_ifq *ifq, unsigned ifq_idx) +{ + struct netdev_rx_queue *rxq; + struct net_device *dev = ifq->dev; + int ret; + + ASSERT_RTNL(); + + if (ifq_idx >= dev->num_rx_queues) + return -EINVAL; + ifq_idx = array_index_nospec(ifq_idx, dev->num_rx_queues); + + rxq = __netif_get_rx_queue(ifq->dev, ifq_idx); + if (rxq->mp_params.mp_priv) + return -EEXIST; + + ifq->if_rxq = ifq_idx; + rxq->mp_params.mp_ops = &io_uring_pp_zc_ops; + rxq->mp_params.mp_priv = ifq; + ret = netdev_rx_queue_restart(ifq->dev, ifq->if_rxq); + if (ret) + goto fail; + return 0; +fail: + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + ifq->if_rxq = -1; + return ret; +} + +static void io_close_zc_rxq(struct io_zcrx_ifq *ifq) +{ + struct netdev_rx_queue *rxq; + int err; + + if (ifq->if_rxq == -1) + return; + + rtnl_lock(); + if (WARN_ON_ONCE(ifq->if_rxq >= ifq->dev->num_rx_queues)) { + rtnl_unlock(); + return; + } + + rxq = __netif_get_rx_queue(ifq->dev, ifq->if_rxq); + + WARN_ON_ONCE(rxq->mp_params.mp_priv != ifq); + + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + + err = netdev_rx_queue_restart(ifq->dev, ifq->if_rxq); + if (err) + pr_devel("io_uring: can't restart a queue on zcrx close\n"); + + rtnl_unlock(); + ifq->if_rxq = -1; +} + static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_ifq_reg *reg, struct io_uring_region_desc *rd) @@ -161,9 +221,12 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) { + io_close_zc_rxq(ifq); + if (ifq->area) io_zcrx_free_area(ifq->area); - + if (ifq->dev) + netdev_put(ifq->dev, &ifq->netdev_tracker); io_free_rbuf_ring(ifq); kfree(ifq); } @@ -222,7 +285,18 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, goto err; ifq->rq_entries = reg.rq_entries; - ifq->if_rxq = reg.if_rxq; + + ret = -ENODEV; + rtnl_lock(); + ifq->dev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx, + &ifq->netdev_tracker, GFP_KERNEL); + if (!ifq->dev) + goto err_rtnl_unlock; + + ret = io_open_zc_rxq(ifq, reg.if_rxq); + if (ret) + goto err_rtnl_unlock; + rtnl_unlock(); ring_sz = sizeof(struct io_uring); rqes_sz = sizeof(struct io_uring_zcrx_rqe) * ifq->rq_entries; @@ -231,16 +305,17 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, reg.offsets.tail = offsetof(struct io_uring, tail); if (copy_to_user(arg, ®, sizeof(reg)) || - copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd))) { - ret = -EFAULT; - goto err; - } - if (copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) { + copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) || + copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) { + io_close_zc_rxq(ifq); ret = -EFAULT; goto err; } ctx->ifq = ifq; return 0; + +err_rtnl_unlock: + rtnl_unlock(); err: io_zcrx_ifq_free(ifq); return ret; @@ -262,6 +337,9 @@ 
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) { lockdep_assert_held(&ctx->uring_lock); + + if (ctx->ifq) + io_close_zc_rxq(ctx->ifq); } static void io_zcrx_get_buf_uref(struct net_iov *niov) diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index ffc3e333b4af..01a167e08c4b 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -5,6 +5,7 @@ #include #include #include +#include #define IO_ZC_RX_UREF 0x10000 #define IO_ZC_RX_KREF_MASK (IO_ZC_RX_UREF - 1) @@ -36,6 +37,7 @@ struct io_zcrx_ifq { u32 if_rxq; struct io_mapped_region region; + netdevice_tracker netdev_tracker; }; #if defined(CONFIG_IO_URING_ZCRX) From patchwork Wed Dec 4 17:21:53 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: David Wei X-Patchwork-Id: 13894126 Received: from mail-pl1-f178.google.com (mail-pl1-f178.google.com [209.85.214.178]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9919E216386 for ; Wed, 4 Dec 2024 17:23:00 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.214.178 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332983; cv=none; b=FiQSOPEZf76TqkAwuhvy8dG6mdltpCh6iD7npMjsAdrctp504Ezzw25gVRFOZk6mEKWZrtNU80oos4TNCeXSveJu2uQ5K4fwl51tP5X7NMytBZf9WsKl9wsFMgKHTKAp1CG6dsxTN9ylunV8aox3GDa35a+g8BcsVYylpaOAR+U= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332983; c=relaxed/simple; bh=rqdrXpLj1vIPDgTAK8qHnKFLuAGob9uTJRFFgJSyn4k=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=TXb65/qFxfv5rJahWgTFeWItfx+1QILwan1/D40A6NLmoOz/BSwYweExmVd/UlQiYBTCsh1XpvC6YFKW4mfjlfj0/j9uxow52qxOz/uvJ42qGCDgVUyz9fRpN6lLJqjcY/LfTZ/lMjOygpXPuhScxmAIHD7S7JwCbVV/z2LpgDM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk; spf=none smtp.mailfrom=davidwei.uk; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b=0dCaU245; arc=none smtp.client-ip=209.85.214.178 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b="0dCaU245" Received: by mail-pl1-f178.google.com with SMTP id d9443c01a7336-215b9a754fbso99495ad.1 for ; Wed, 04 Dec 2024 09:23:00 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=davidwei-uk.20230601.gappssmtp.com; s=20230601; t=1733332980; x=1733937780; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=BHjD95dIspr2lkyny7hIYIXcPHOLxQo/POPnhRJUUBQ=; b=0dCaU245sodU0B2+bwRSrY0GYK1cWCOgxxDeelrUQAqTFHzeoot0g0utQdaWZ0oO8Z 9xExC2KJb4BynD939eyHojT6PdLb+9/O2tnTfngN6eXQvQghPLv73JmxO5/Vnn7jKvdg a2DoToYucCI3WbozveGe4oH9s784+XDMuOfoD2RT7FwfNJYAS5Fx8u7NshrV/aj9JjpP KLAwip7AXRmZd1F4yOdKhkUPaTelloNwS7aw0Vsb87R1/5UYUfHfcvKxALtjcIIqy5dG aT2SvwZnVCxuwhGdaslZAktNvZMzEQnezLntoQ8QjFceW2nnJCXFIxv70ihTcBVOmpCU 9ukg== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1733332980; 
x=1733937780; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=BHjD95dIspr2lkyny7hIYIXcPHOLxQo/POPnhRJUUBQ=; b=tywvXol1c2FtkVi/rhllTqU/awRJ1pOCmc/Dr8wxz/+bPyeAcfmMky3ArfYibtdpfS UXluFp1YuhqdqYKPhGov30KLFak4KFtza+/6iG8KFHp3m1zN0R9E/iEsTleKBYvwIPkX L/ubwJpPwyFZ2nUWlGY0r5zFQpw/dE+ZKcShswRHXfReMqVg4IGZo4pt8kQGvzMR4Poq 2t2852k0TovX4BLGiLbo+PHVUgd2Y7E/W+Aqz0RvTsOC7mPc/JimaaHp78nPPQEjjHYG 3Z3w+TOOjUsbD++0Ipm8NX9BY39DKmIbpAJ7BxE2r/yx+OHmbI3+31uNE9p7QdMuS1hm IaDA== X-Gm-Message-State: AOJu0YyFYZgDumpw5T9a1Ta5Sm4R4nHZHOE+B60ano5GaU57LmuWJABL W88Ves1J6BiA7IMcd38fis3n4NF709fBykoUO4JCyfx9Z8qdoYardT5oeRyZy0h9L73cZCQrqTR l X-Gm-Gg: ASbGncszG34RMZt0voZEhdmmlbp9AUxYLQzZGza2V2o/BpCA1iv2FUHIz1QcOCIzoJ3 ca0AvcyEh8+n3qdWt96vIsmMtAUdcCmPDpmosA4cdVfOxl2Kx2P9LOBWmvQR7wJtTgskxjsFOyI hSar0Sp222qkEy31cKhMmNjykZGlsbYEtS9wFH5JsnqDgr0e4f4ppydUOja2nQ5uC5ptJm65xMl oKQ0HozEZ0iUBZyIfPyIFKmD93DdYsnh7o= X-Google-Smtp-Source: AGHT+IFnS4P+EH2EQ6LLJk/1czqV13bbkASeblrjfxdxcHB07RmxAQGEPfiCnXUTpbfx5SrA4yO9+Q== X-Received: by 2002:a17:902:c94d:b0:215:5f18:48ef with SMTP id d9443c01a7336-215bd201e95mr78718725ad.34.1733332979972; Wed, 04 Dec 2024 09:22:59 -0800 (PST) Received: from localhost ([2a03:2880:ff:15::]) by smtp.gmail.com with ESMTPSA id d9443c01a7336-21521905f2bsm115331945ad.93.2024.12.04.09.22.59 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 04 Dec 2024 09:22:59 -0800 (PST) From: David Wei To: io-uring@vger.kernel.org, netdev@vger.kernel.org Cc: Jens Axboe , Pavel Begunkov , Jakub Kicinski , Paolo Abeni , "David S. Miller" , Eric Dumazet , Jesper Dangaard Brouer , David Ahern , Mina Almasry , Stanislav Fomichev , Joe Damato , Pedro Tammela Subject: [PATCH net-next v8 14/17] io_uring/zcrx: add copy fallback Date: Wed, 4 Dec 2024 09:21:53 -0800 Message-ID: <20241204172204.4180482-15-dw@davidwei.uk> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk> References: <20241204172204.4180482-1-dw@davidwei.uk> Precedence: bulk X-Mailing-List: io-uring@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Pavel Begunkov There are scenarios in which the zerocopy path might get a normal in-kernel buffer, it could be a mis-steered packet or simply the linear part of an skb. Another use case is to allow the driver to allocate kernel pages when it's out of zc buffers, which makes it more resilient to spikes in load and allow the user to choose the balance between the amount of memory provided and performance. At the moment we fail such requests. Instead, grab a buffer from the page pool, copy data there, and return back to user in the usual way. Because the refill ring is private to the napi our page pool is running from, it's done by stopping the napi via napi_execute() helper. It grabs only one buffer, which is inefficient, and improving it is left for follow up patches. 
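For a single chunk, the fallback described above boils down to the following condensed sketch of io_zcrx_copy_chunk() from the diff that follows (error handling trimmed):

	/* buffer comes from our page pool; allocation runs in napi context via napi_execute() */
	niov = io_zc_get_buf_task_safe(ifq);
	vaddr = kmap_local_page(io_zcrx_iov_page(niov));
	memcpy(vaddr, data + offset, copy_size);	/* kernel buffer -> user-visible area */
	kunmap_local(vaddr);
	io_zcrx_queue_cqe(req, niov, ifq, 0, copy_size);	/* completion looks like a zero copy one */
	io_zcrx_get_buf_uref(niov);	/* hand a reference over to userspace */
	napi_pp_put_page(net_iov_to_netmem(niov));	/* drop the allocation reference */
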
Signed-off-by: Pavel Begunkov Signed-off-by: David Wei --- io_uring/zcrx.c | 133 +++++++++++++++++++++++++++++++++++++++++++++--- io_uring/zcrx.h | 1 + 2 files changed, 127 insertions(+), 7 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 0cba433c764a..8e4b9bfaed99 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include #include @@ -28,6 +30,11 @@ struct io_zcrx_args { struct socket *sock; }; +struct io_zc_refill_data { + struct io_zcrx_ifq *ifq; + struct net_iov *niov; +}; + static const struct memory_provider_ops io_uring_pp_zc_ops; static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) @@ -37,6 +44,13 @@ static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *nio return container_of(owner, struct io_zcrx_area, nia); } +static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + return area->pages[net_iov_idx(niov)]; +} + static int io_open_zc_rxq(struct io_zcrx_ifq *ifq, unsigned ifq_idx) { struct netdev_rx_queue *rxq; @@ -59,6 +73,13 @@ static int io_open_zc_rxq(struct io_zcrx_ifq *ifq, unsigned ifq_idx) ret = netdev_rx_queue_restart(ifq->dev, ifq->if_rxq); if (ret) goto fail; + + if (WARN_ON_ONCE(!ifq->pp)) { + ret = -EFAULT; + goto fail; + } + /* grab napi_id while still under rtnl */ + ifq->napi_id = ifq->pp->p.napi->napi_id; return 0; fail: rxq->mp_params.mp_ops = NULL; @@ -530,6 +551,7 @@ static void io_pp_zc_destroy(struct page_pool *pp) page_pool_mp_release_area(pp, &ifq->area->nia); ifq->pp = NULL; + ifq->napi_id = 0; if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs)) return; @@ -544,6 +566,34 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = { .scrub = io_pp_zc_scrub, }; +static void io_napi_refill(void *data) +{ + struct io_zc_refill_data *rd = data; + struct io_zcrx_ifq *ifq = rd->ifq; + netmem_ref netmem; + + if (WARN_ON_ONCE(!ifq->pp)) + return; + + netmem = page_pool_alloc_netmem(ifq->pp, GFP_ATOMIC | __GFP_NOWARN); + if (!netmem) + return; + if (WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + return; + + rd->niov = netmem_to_net_iov(netmem); +} + +static struct net_iov *io_zc_get_buf_task_safe(struct io_zcrx_ifq *ifq) +{ + struct io_zc_refill_data rd = { + .ifq = ifq, + }; + + napi_execute(ifq->napi_id, io_napi_refill, &rd); + return rd.niov; +} + static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, struct io_zcrx_ifq *ifq, int off, int len) { @@ -567,6 +617,45 @@ static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, return true; } +static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + void *data, unsigned int offset, size_t len) +{ + size_t copy_size, copied = 0; + int ret = 0, off = 0; + struct page *page; + u8 *vaddr; + + do { + struct net_iov *niov; + + niov = io_zc_get_buf_task_safe(ifq); + if (!niov) { + ret = -ENOMEM; + break; + } + + page = io_zcrx_iov_page(niov); + vaddr = kmap_local_page(page); + copy_size = min_t(size_t, PAGE_SIZE, len); + memcpy(vaddr, data + offset, copy_size); + kunmap_local(vaddr); + + if (!io_zcrx_queue_cqe(req, niov, ifq, off, copy_size)) { + napi_pp_put_page(net_iov_to_netmem(niov)); + return -ENOSPC; + } + + io_zcrx_get_buf_uref(niov); + napi_pp_put_page(net_iov_to_netmem(niov)); + + offset += copy_size; + len -= copy_size; + copied += copy_size; + } while (offset < len); + + return copied ? 
copied : ret; +} + static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, const skb_frag_t *frag, int off, int len) { @@ -574,8 +663,24 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, off += skb_frag_off(frag); - if (unlikely(!skb_frag_is_net_iov(frag))) - return -EOPNOTSUPP; + if (unlikely(!skb_frag_is_net_iov(frag))) { + struct page *page = skb_frag_page(frag); + u32 p_off, p_len, t, copied = 0; + u8 *vaddr; + int ret = 0; + + skb_frag_foreach_page(frag, off, len, + page, p_off, p_len, t) { + vaddr = kmap_local_page(page); + ret = io_zcrx_copy_chunk(req, ifq, vaddr, p_off, p_len); + kunmap_local(vaddr); + + if (ret < 0) + return copied ? copied : ret; + copied += ret; + } + return copied; + } niov = netmem_to_net_iov(frag->netmem); if (niov->pp->mp_ops != &io_uring_pp_zc_ops || @@ -596,15 +701,29 @@ io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, struct io_zcrx_ifq *ifq = args->ifq; struct io_kiocb *req = args->req; struct sk_buff *frag_iter; - unsigned start, start_off; + unsigned start, start_off = offset; int i, copy, end, off; int ret = 0; - start = skb_headlen(skb); - start_off = offset; + if (unlikely(offset < skb_headlen(skb))) { + ssize_t copied; + size_t to_copy; - if (offset < start) - return -EOPNOTSUPP; + to_copy = min_t(size_t, skb_headlen(skb) - offset, len); + copied = io_zcrx_copy_chunk(req, ifq, skb->data, offset, to_copy); + if (copied < 0) { + ret = copied; + goto out; + } + offset += copied; + len -= copied; + if (!len) + goto out; + if (offset != skb_headlen(skb)) + goto out; + } + + start = skb_headlen(skb); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { const skb_frag_t *frag; diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 01a167e08c4b..ff4aaecc560c 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -38,6 +38,7 @@ struct io_zcrx_ifq { struct io_mapped_region region; netdevice_tracker netdev_tracker; + unsigned napi_id; }; #if defined(CONFIG_IO_URING_ZCRX) From patchwork Wed Dec 4 17:21:54 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: David Wei X-Patchwork-Id: 13894125 Received: from mail-pl1-f177.google.com (mail-pl1-f177.google.com [209.85.214.177]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id AECB1216395 for ; Wed, 4 Dec 2024 17:23:01 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.214.177 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332983; cv=none; b=iCwsAZwgReJK81UUKS0elqfHmCjOgOiClHZQjMEOubGvgGscrET+7IKW8ssFaF7jopHI8H/YrEk/qYwZbTUXZ4gT2K6/oizYi6SnoncmvVGYBC6+OkHCXdSH6Ffz38lZaY1uFTpESXDn6vAcCbzV71JIP39genpd5hQCd2PvHDA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332983; c=relaxed/simple; bh=fIRUjS4vvOeMBBHkJIKjbB93rfw/xVgs/ceXybEIiQQ=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=X3oCLdpM3tz9CxxUf9oSU2sXrYahloOrhVL7UTM/60k8wM4mJyfQzeQPfqJzs2snDFHBg8p45ldxFGMsJmytbDyYt6rCRrqNrfXd71GauP7fSXhM5VteCSoyZhid0LD2FJWPbOVB2E+ROvZLkUa2Ke+VZ+hC75+7qHkVzQlsyr8= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk; spf=none smtp.mailfrom=davidwei.uk; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b=VPIYiPnj; arc=none 
smtp.client-ip=209.85.214.177 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b="VPIYiPnj" Received: by mail-pl1-f177.google.com with SMTP id d9443c01a7336-215b4681c94so203335ad.0 for ; Wed, 04 Dec 2024 09:23:01 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=davidwei-uk.20230601.gappssmtp.com; s=20230601; t=1733332981; x=1733937781; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=4lhNvgv8G0hs8gBqFhFT3X6E9Ed5HipxBFCScbaRG/E=; b=VPIYiPnj8d38QUwreM78GLNMAffERT7xOY6Ah5mjK+iqYMgaojuZNRPOm++FphmRMZ 5U9IVz7KRDyK+AfYv5lBOF4LdwqPeWUC997Qjs+prlo01PzdW+PpdL+BL07hgpz0frzF eeEY5ZRDPduMNfy96zuzjZA5e+llsdAknO/CoYUtWV5zRQya1951jvvpQFVGgZ24B5RM Szv4GtqyiICejlc9iEDXAzeMRT5uQNqoZjfrzA9RILuoA6CZwliMl5m9/9uUA2YUTVCQ uf/sg2lNs9n6QFoKbxn5AflTtEOCuyiQbsplx86LZs5NCS0OtPV6o7MSc/WKV8JNaY2L wPfA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1733332981; x=1733937781; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=4lhNvgv8G0hs8gBqFhFT3X6E9Ed5HipxBFCScbaRG/E=; b=ByuXUT41btZrghUnV/5e6BkdSzn3zD1D/ACsr208ghvb/BW4X8FHNsq7Zs9feXXIvS jT7JT875exx04aF2Td5UjqPJBN1cscQ2078kgQOLklAqoAXIjxC5atKc6gjWRinfNDW9 kgcwI8W/kCS742APJXc4M8ZLwk+X71DnMrj9+EqyrkjK2HiGupF7nX7SteOappBCT3Uh oIAmUhOjwtm4goTcsLs2jKfKiUcJtjjpoJLbR5NGfnhgo+fr82A70kmVtJSG23JxQNE6 vvD3F55IBxkMeqTaXshFeQMEqHrTjn2PNqPR08FBUYW/UwzLm95mxNvv//nt0UK0nP2J 3fbQ== X-Gm-Message-State: AOJu0YzgzTMAyY3OlsuvKOLvgBiqKmQpX0Z9AABbR6H38iE/hU7Br0i7 eliXcP8CN0ApgCDXtVCYk8BeLbEnuUtVTiBSGJ48MAXA2unxc9/ysBkVDvLoSXDXJ4EXNNeHjJ0 + X-Gm-Gg: ASbGncuaU/66YDRngOmW8rYUYrE8mDYZo+SrtQ/m/Yt/9ZsJWgFBx+Q9fwASwm/tSJ3 +dJ0sdFo1ZVK+Q6dDJ/Oc2P4n+6HdgLQ8k0U2IOhhkuVh873tNse6bKLlpzX53O6vUEi1c8y8vS /ZP5glD4jngDp/vPoUGbnke2SHUly83JxwQBfsJyGulKb5zAUWs4O7ODsobPjX297iP6sHWxq0u 0+tOiY8tvbalf6hD/mtMv9roAGw9GetTw== X-Google-Smtp-Source: AGHT+IEyGgkHrloo1DbCEU1iG+0u4FRI3uHNEIIXRDGGus9M7foJV2KsxPqwGImtluQhQ5s+Dxdz0w== X-Received: by 2002:a17:902:cf04:b0:215:6489:cfb8 with SMTP id d9443c01a7336-215bcfc475cmr79920235ad.10.1733332981197; Wed, 04 Dec 2024 09:23:01 -0800 (PST) Received: from localhost ([2a03:2880:ff:e::]) by smtp.gmail.com with ESMTPSA id d9443c01a7336-2154e5d6965sm87999925ad.71.2024.12.04.09.23.00 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 04 Dec 2024 09:23:00 -0800 (PST) From: David Wei To: io-uring@vger.kernel.org, netdev@vger.kernel.org Cc: Jens Axboe , Pavel Begunkov , Jakub Kicinski , Paolo Abeni , "David S. 
Miller" , Eric Dumazet , Jesper Dangaard Brouer , David Ahern , Mina Almasry , Stanislav Fomichev , Joe Damato , Pedro Tammela Subject: [PATCH net-next v8 15/17] io_uring/zcrx: throttle receive requests Date: Wed, 4 Dec 2024 09:21:54 -0800 Message-ID: <20241204172204.4180482-16-dw@davidwei.uk> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk> References: <20241204172204.4180482-1-dw@davidwei.uk> Precedence: bulk X-Mailing-List: io-uring@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Pavel Begunkov io_zc_rx_tcp_recvmsg() continues until it fails or there is nothing to receive. If the other side sends fast enough, we might get stuck in io_zc_rx_tcp_recvmsg() producing more and more CQEs but not letting the user to handle them leading to unbound latencies. Break out of it based on an arbitrarily chosen limit, the upper layer will either return to userspace or requeue the request. Reviewed-by: Jens Axboe Signed-off-by: Pavel Begunkov Signed-off-by: David Wei --- io_uring/net.c | 2 ++ io_uring/zcrx.c | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/io_uring/net.c b/io_uring/net.c index f1431317182e..c8d718d7cbe6 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1266,6 +1266,8 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret <= 0) && ret != -EAGAIN) { if (ret == -ERESTARTSYS) ret = -EINTR; + if (ret == IOU_REQUEUE) + return IOU_REQUEUE; req_set_fail(req); io_req_set_res(req, ret, 0); diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 8e4b9bfaed99..130583fbe7ca 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -24,10 +24,13 @@ #define IO_RQ_MAX_ENTRIES 32768 +#define IO_SKBS_PER_CALL_LIMIT 20 + struct io_zcrx_args { struct io_kiocb *req; struct io_zcrx_ifq *ifq; struct socket *sock; + unsigned nr_skbs; }; struct io_zc_refill_data { @@ -705,6 +708,9 @@ io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, int i, copy, end, off; int ret = 0; + if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT)) + return -EAGAIN; + if (unlikely(offset < skb_headlen(skb))) { ssize_t copied; size_t to_copy; @@ -809,6 +815,9 @@ static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq, ret = -ENOTCONN; else ret = -EAGAIN; + } else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) && + (issue_flags & IO_URING_F_MULTISHOT)) { + ret = IOU_REQUEUE; } else if (sock_flag(sk, SOCK_DONE)) { /* Make it to retry until it finally gets 0. 
*/ if (issue_flags & IO_URING_F_MULTISHOT) From patchwork Wed Dec 4 17:21:55 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: David Wei X-Patchwork-Id: 13894127 Received: from mail-pf1-f175.google.com (mail-pf1-f175.google.com [209.85.210.175]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 173C12163A5 for ; Wed, 4 Dec 2024 17:23:02 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.210.175 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332984; cv=none; b=qI9gMmbu4Kkc3lAHptuOBIYzos/FPrUgMklAM6fYKB9rrbj8X83BX7mLFHjojSjo2QnKD5r+YbgaQ13FOBGznQ3o0jYJVbA3Ta+xOQfl8/Sx/tamTAhgRwTC27twQmJlWHUgxKlhva1eXmwA8AC/kimWiPn1Jsc6PbrSxFFZRPA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332984; c=relaxed/simple; bh=G1zac/FrdM0gIe8YwspufTx9nda5a+tBvmguzkjovy8=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=SJoH66NDQPThFcIJE2m+g0EHJ9b9itxruJXdV9st9BiwJZpFbGPUSl1+WS1RCA1b3tzIkGNHHTagEs8Sy4aqb4M121yqGEV5HXpURcBEk1uTXtaMc050y7iX8yj5H6GHJxjAw2dPDd3K3dYSmi94wKRYnE3Cv/UxD1LK4eLCHLc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk; spf=none smtp.mailfrom=davidwei.uk; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b=G9fqGKmS; arc=none smtp.client-ip=209.85.210.175 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b="G9fqGKmS" Received: by mail-pf1-f175.google.com with SMTP id d2e1a72fcca58-723f37dd76cso74420b3a.0 for ; Wed, 04 Dec 2024 09:23:02 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=davidwei-uk.20230601.gappssmtp.com; s=20230601; t=1733332982; x=1733937782; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=DJekRoIcm/1Er4pwWyRsR39n7PkzzcOxVAQMEmkZaiA=; b=G9fqGKmSEdUww/e50vCqt1vPGgC0qh+uWMHEzoPsw7bA39RlQq/UYqW9zwRgsASOUX 152RDH6haZ7ojGo6UW/4sLlN9R0kw8eOW8u+PC5ETDirPShoITu4YU2OQ2/ldzxHFpPN Li7FvREYAPIst+u25PUU17fM5DJ7t6CX8ss7x8NKCmCP63JwjGugiD4KQnnMmgPeYQg1 qtBOiLFlaVUxm6LP4ZOg0prFTi/vg7dA4c0vfr/nGzNKeo+HEX379x/dYLK8UwACN2+2 QRxloW08Guk4pzojRH8K1pzMEUjt6hU46uoQh8mYocMLgUHxRGKPoJLin6F0mrKfqT/d YyCQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1733332982; x=1733937782; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=DJekRoIcm/1Er4pwWyRsR39n7PkzzcOxVAQMEmkZaiA=; b=ZuL4wByIH+cA+Zumc50JIDrCg5Z+7ssUQcd1xX+kdyoul0gW/fhw6Kf3Yj/EMPbtfK stZrr+VsLEA2Kc4mbtFNxgGD+3eXWhrT1GcAAPReNwUGa/7OEtjoQwvxJGA08itAOVa2 zORb4EjXtX+HOgvdlZFgn3KcEAWiWqkQkGqM53DbG/MYq4l9ZeiMqyKoTXM16xgEaqcS HSnLM8YPBpRrQ+zXgcIO/NopBuSfZH8JB0NGKucw/GTPsQn1mbrzCcbTvNukUoaTW8Uk 6gRiQW3bv/7XSm88JMgwU4vfOlM7TeC7yPSQoPPc5Q9Moyh9YmC6grq1YFuUuTGpYeR8 9uhg== X-Gm-Message-State: 
AOJu0YzSZWqBbis5eDUeXtPsEUxw2r1WuZwYvxF5InucGw+D7v658w4P O8fbUjCp9AH4+njeElwW9+ekgxiVZhVgcWpt7GsG6M+uYXifKQnWg2bV7NpxaZFZdCZrKsstbaf U X-Gm-Gg: ASbGnctI03tVBuvX6H2TycZxPnCvzm8N5v4O0433m6Y0JYrk5QdTqppg+9YdZAGB29W ytwbqo/a7JKAd0tfnotTmooLlCtSMo4ldL7JUXgWLpVc4L3RY9iZ7MDxxbD9zwRwdTFvCkZKSTj S1vxtLrmrnmy5iLryB6H1PF0V7bOOr5NYlOkX493GfOeMTUjjrkxckVYyA9/fev0SpDZ7Gje85S 6yhFrh0HEgR/37ZW+tztsNSwMJgoY3RuEo= X-Google-Smtp-Source: AGHT+IH5V9UYcGCa4plFPp9ChGYwBiebQjLlpyqrYLZFl+FkJr9WuGVyaoVcY828FQ1WnBu5x7N2Lg== X-Received: by 2002:a05:6a00:21c4:b0:724:e75b:22d1 with SMTP id d2e1a72fcca58-7257fcc62c5mr10668943b3a.16.1733332982501; Wed, 04 Dec 2024 09:23:02 -0800 (PST) Received: from localhost ([2a03:2880:ff:1d::]) by smtp.gmail.com with ESMTPSA id d2e1a72fcca58-7254176fe28sm12617505b3a.72.2024.12.04.09.23.01 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 04 Dec 2024 09:23:02 -0800 (PST) From: David Wei To: io-uring@vger.kernel.org, netdev@vger.kernel.org Cc: Jens Axboe , Pavel Begunkov , Jakub Kicinski , Paolo Abeni , "David S. Miller" , Eric Dumazet , Jesper Dangaard Brouer , David Ahern , Mina Almasry , Stanislav Fomichev , Joe Damato , Pedro Tammela Subject: [PATCH net-next v8 16/17] net: add documentation for io_uring zcrx Date: Wed, 4 Dec 2024 09:21:55 -0800 Message-ID: <20241204172204.4180482-17-dw@davidwei.uk> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk> References: <20241204172204.4180482-1-dw@davidwei.uk> Precedence: bulk X-Mailing-List: io-uring@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Add documentation for io_uring zero copy Rx that explains requirements and the user API. Signed-off-by: David Wei --- Documentation/networking/iou-zcrx.rst | 201 ++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 Documentation/networking/iou-zcrx.rst diff --git a/Documentation/networking/iou-zcrx.rst b/Documentation/networking/iou-zcrx.rst new file mode 100644 index 000000000000..0a3af8c08c7e --- /dev/null +++ b/Documentation/networking/iou-zcrx.rst @@ -0,0 +1,201 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +io_uring zero copy Rx +===================== + +Introduction +============ + +io_uring zero copy Rx (ZC Rx) is a feature that removes kernel-to-user copy on +the network receive path, allowing packet data to be received directly into +userspace memory. This feature is different to TCP_ZEROCOPY_RECEIVE in that +there are no strict alignment requirements and no need to mmap()/munmap(). +Compared to kernel bypass solutions such as e.g. DPDK, the packet headers are +processed by the kernel TCP stack as normal. + +NIC HW Requirements +=================== + +Several NIC HW features are required for io_uring ZC Rx to work. For now the +kernel API does not configure the NIC and it must be done by the user. + +Header/data split +----------------- + +Required to split packets at the L4 boundary into a header and a payload. +Headers are received into kernel memory as normal and processed by the TCP +stack as normal. Payloads are received into userspace memory directly. + +Flow steering +------------- + +Specific HW Rx queues are configured for this feature, but modern NICs +typically distribute flows across all HW Rx queues. Flow steering is required +to ensure that only desired flows are directed towards HW queues that are +configured for io_uring ZC Rx. 
+ +RSS +--- + +In addition to flow steering above, RSS is required to steer all other non-zero +copy flows away from queues that are configured for io_uring ZC Rx. + +Usage +===== + +Setup NIC +--------- + +Must be done out of band for now. + +Ensure there are enough queues:: + + ethtool -L eth0 combined 32 + +Enable header/data split:: + + ethtool -G eth0 tcp-data-split on + +Carve out half of the HW Rx queues for zero copy using RSS:: + + ethtool -X eth0 equal 16 + +Set up flow steering:: + + ethtool -N eth0 flow-type tcp6 ... action 16 + +Setup io_uring +-------------- + +This section describes the low level io_uring kernel API. Please refer to +liburing documentation for how to use the higher level API. + +Create an io_uring instance with the following required setup flags:: + + IORING_SETUP_SINGLE_ISSUER + IORING_SETUP_DEFER_TASKRUN + IORING_SETUP_CQE32 + +Create memory area +------------------ + +Allocate userspace memory area for receiving zero copy data:: + + void *area_ptr = mmap(NULL, area_size, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + 0, 0); + +Create refill ring +------------------ + +Allocate memory for a shared ringbuf used for returning consumed buffers:: + + void *ring_ptr = mmap(NULL, ring_size, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + 0, 0); + +This refill ring consists of some space for the header, followed by an array of +``struct io_uring_zcrx_rqe``:: + + size_t rq_entries = 4096; + size_t ring_size = rq_entries * sizeof(struct io_uring_zcrx_rqe) + PAGE_SIZE; + /* align to page size */ + ring_size = (ring_size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1); + +Register ZC Rx +-------------- + +Fill in registration structs:: + + struct io_uring_zcrx_area_reg area_reg = { + .addr = (__u64)(unsigned long)area_ptr, + .len = area_size, + .flags = 0, + }; + + struct io_uring_region_desc region_reg = { + .user_addr = (__u64)(unsigned long)ring_ptr, + .size = ring_size, + .flags = IORING_MEM_REGION_TYPE_USER, + }; + + struct io_uring_zcrx_ifq_reg reg = { + .if_idx = if_nametoindex("eth0"), + /* this is the HW queue with desired flow steered into it */ + .if_rxq = 16, + .rq_entries = rq_entries, + .area_ptr = (__u64)(unsigned long)&area_reg, + .region_ptr = (__u64)(unsigned long)®ion_reg, + }; + +Register with kernel:: + + io_uring_register_ifq(ring, ®); + +Map refill ring +--------------- + +The kernel fills in fields for the refill ring in the registration ``struct +io_uring_zcrx_ifq_reg``. 
Map it into userspace:: + + struct io_uring_zcrx_rq refill_ring; + + refill_ring.khead = (unsigned *)((char *)ring_ptr + reg.offsets.head); + refill_ring.ktail = (unsigned *)((char *)ring_ptr + reg.offsets.tail); + refill_ring.rqes = + (struct io_uring_zcrx_rqe *)((char *)ring_ptr + reg.offsets.rqes); + refill_ring.rq_tail = 0; + refill_ring.ring_ptr = ring_ptr; + +Receiving data +-------------- + +Prepare a zero copy recv request:: + + struct io_uring_sqe *sqe; + + sqe = io_uring_get_sqe(ring); + io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, fd, NULL, 0, 0); + sqe->ioprio |= IORING_RECV_MULTISHOT; + +Now, submit and wait:: + + io_uring_submit_and_wait(ring, 1); + +Finally, process completions:: + + struct io_uring_cqe *cqe; + unsigned int count = 0; + unsigned int head; + + io_uring_for_each_cqe(ring, head, cqe) { + struct io_uring_zcrx_cqe *rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); + + unsigned char *data = area_ptr + (rcqe->off & IORING_ZCRX_AREA_MASK); + /* do something with the data */ + + count++; + } + io_uring_cq_advance(ring, count); + +Recycling buffers +----------------- + +Return buffers back to the kernel to be used again:: + + struct io_uring_zcrx_rqe *rqe; + unsigned mask = refill_ring.ring_entries - 1; + rqe = &refill_ring.rqes[refill_ring.rq_tail & mask]; + + area_offset = rcqe->off & IORING_ZCRX_AREA_MASK; + rqe->off = area_offset | area_reg.rq_area_token; + rqe->len = cqe->res; + IO_URING_WRITE_ONCE(*refill_ring.ktail, ++refill_ring.rq_tail); + +Testing +======= + +See ``tools/testing/selftests/drivers/net/hw/iou-zcrx.c`` From patchwork Wed Dec 4 17:21:56 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: David Wei X-Patchwork-Id: 13894128 Received: from mail-pf1-f182.google.com (mail-pf1-f182.google.com [209.85.210.182]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8DEC4216397 for ; Wed, 4 Dec 2024 17:23:04 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.210.182 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332987; cv=none; b=nHUQwzqtk8Okk6FkBWr+prxfKqKiX3gKLALtvgo/pzwkZY3201VCfqXfMc3VK0ZQupdcTwDfERuxtp58nMl4Cw8/6VUe4LzDnS61m4hZmr0VAMq1Jmkzow1pzrGQr1EkEOtGqJRNBcaOsekXmpVZFbbGYaFF03g/N0/wTLB+T1Y= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1733332987; c=relaxed/simple; bh=XDL+9pwkfIBwDQiQFl4GDzKDRTUcI6R3pAnYvNNnDfo=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=gdoqoJ12OxAHXrqLyCqKkb8XaWN2begUbAp3VP4PKn7YxaOOLovfSn3DtZbMGYnhtOTUxe0CjTTm1RfTEIwyrcAkszf2ujx7T9HvrmkjAcjixuXaazk4quibmfzGIwgEumfH7bZ9pxO9oeRzMGlH6Zti3s/Mn7HVhJT7LReRVVM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk; spf=none smtp.mailfrom=davidwei.uk; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b=vvZ1qQ23; arc=none smtp.client-ip=209.85.210.182 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=davidwei.uk Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=davidwei-uk.20230601.gappssmtp.com header.i=@davidwei-uk.20230601.gappssmtp.com header.b="vvZ1qQ23" Received: by mail-pf1-f182.google.com with SMTP id
d2e1a72fcca58-724e7d5d5b2so53706b3a.2 for ; Wed, 04 Dec 2024 09:23:04 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=davidwei-uk.20230601.gappssmtp.com; s=20230601; t=1733332984; x=1733937784; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=E2A6qrannMDfARwxSzCj01X8wYOTt2ecawK3ihWMIsE=; b=vvZ1qQ23OOZW1lUqb+66XwxJKqWDUbabqoXuFHOhWdNn+ghgfYmrSkV2iFbM9/gU4M MvthnXSlzxjeqweNfWOgVMqtNiSsGj7V6T9WzIcZ8I7vRpmbq3pMhAyZFtAZF4sx07Bn Lo9lsZAzuWESv0XpZciGQcW1q+CxRhlaDMp/J/FOzKA3eOYxqRnps3MzMusvDtue0wkB TCuNSAM797eoLyrMfyIEzuK2oGoySbB3qgS16lRBzTfhoM/IKsEj1+SN2hWa6BFEmOwF 0n8hOdXZlcT7yTaOr9H0tSQCM45pnc5WCpPpCxIuScD7PNVmytZcpurdCyBniQf4EvJe 4dmA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1733332984; x=1733937784; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=E2A6qrannMDfARwxSzCj01X8wYOTt2ecawK3ihWMIsE=; b=N/Jgtgr3VynFCErBw2CHVuP1zd8zhOuIXUAiMaJr3OaU1OrnJtZ7pOVDsyIy6tiW58 3lKXUYBcQUQu9kGUDja0A/JUqCffICcWg/osyhGd5To32+ulq4SbGPMS5SLFXIVWA/KJ /fJLIFf7lg5zhcmMZbRjCse5vMJQMOS1Z8l8bh0//h5++q+eRciozf9X1+plRu6EAfEf zYodOht8RfLze7lydHmwRLhVyMfewYwl4+qyj8j9aAEL5P8ASoM9+xbJy/O5kHPaChP/ UcwHhAtZk7sacdabxqdKZusEi1E6qwFWx5MiL6U2jjylvHXBrcyLtjnjqnsH2JTzBc5N YJXQ== X-Gm-Message-State: AOJu0YweRrMcAfqTCrkfGPoH+slgzu24hy0rvMQf7sSFmeYsYPGuhQ1I SaDPEFAVkywHCSBx9kCEVPypz+O+TdoriRt3vgmTukK9WCqKc1YKYeG1THZ6w6M1cMdK4+jTxns q X-Gm-Gg: ASbGncv3mffCSTTgE4dk3tt9sZ7P89fq7MTEMo9DC6qlPgj7BPlsR+xNONO51dKWLRC tBuV2OvnI4Cz3Hqg07LApi55NGylG/NCUIKQgSuc7LkGuaz8ldkYby2ZemtCgFkuevPidsa3EMd D5FIN4XmwqkaZ8LxlmDGQ8xLGbOXTLNFwJ5zKXFNWfRlDbwyXps0zpsH/5ryBIHwHKIVc0hXmHh ScLYbGmkFxxnjXrX/Vw7NV7Hw2f2lRoyA== X-Google-Smtp-Source: AGHT+IFV2DubMl0Q5qRtcIn27QfGQ9ADCOaGdH5ch7HShsZvg9syTNAWFwtCaI73j7vPmnHO1MmUPw== X-Received: by 2002:a17:90a:d607:b0:2ee:693e:ed7c with SMTP id 98e67ed59e1d1-2ef0127487emr11186076a91.33.1733332983854; Wed, 04 Dec 2024 09:23:03 -0800 (PST) Received: from localhost ([2a03:2880:ff:5::]) by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-2ef2708b93dsm1686406a91.51.2024.12.04.09.23.03 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 04 Dec 2024 09:23:03 -0800 (PST) From: David Wei To: io-uring@vger.kernel.org, netdev@vger.kernel.org Cc: Jens Axboe , Pavel Begunkov , Jakub Kicinski , Paolo Abeni , "David S. Miller" , Eric Dumazet , Jesper Dangaard Brouer , David Ahern , Mina Almasry , Stanislav Fomichev , Joe Damato , Pedro Tammela Subject: [PATCH net-next v8 17/17] io_uring/zcrx: add selftest Date: Wed, 4 Dec 2024 09:21:56 -0800 Message-ID: <20241204172204.4180482-18-dw@davidwei.uk> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk> References: <20241204172204.4180482-1-dw@davidwei.uk> Precedence: bulk X-Mailing-List: io-uring@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Add a selftest for io_uring zero copy Rx that sets up the feature on a receiver and validates that it receives traffic from a sender. Requires a remote host and a proper net.config for the test. The remote host also requires liburing installed. 
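For a manual run, the option parsing in the test suggests an invocation along the lines of "iou-zcrx -6 -s -i eth0 -q 16 -p 9999" on the receiver and "iou-zcrx -6 -c -h <receiver address> -p 9999 -l 4096" on the sender; the interface name, queue id, port and payload length here are placeholders, and the chosen queue must already be configured for zero copy as described in the documentation patch.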
Signed-off-by: David Wei --- .../selftests/drivers/net/hw/.gitignore | 2 + .../testing/selftests/drivers/net/hw/Makefile | 6 + .../selftests/drivers/net/hw/iou-zcrx.c | 432 ++++++++++++++++++ .../selftests/drivers/net/hw/iou-zcrx.py | 64 +++ 4 files changed, 504 insertions(+) create mode 100644 tools/testing/selftests/drivers/net/hw/iou-zcrx.c create mode 100755 tools/testing/selftests/drivers/net/hw/iou-zcrx.py diff --git a/tools/testing/selftests/drivers/net/hw/.gitignore b/tools/testing/selftests/drivers/net/hw/.gitignore index e9fe6ede681a..6942bf575497 100644 --- a/tools/testing/selftests/drivers/net/hw/.gitignore +++ b/tools/testing/selftests/drivers/net/hw/.gitignore @@ -1 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +iou-zcrx ncdevmem diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 21ba64ce1e34..5431af8e8210 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0+ OR MIT +TEST_GEN_FILES = iou-zcrx + TEST_PROGS = \ csum.py \ devlink_port_split.py \ @@ -10,6 +12,7 @@ TEST_PROGS = \ ethtool_rmon.sh \ hw_stats_l3.sh \ hw_stats_l3_gre.sh \ + iou-zcrx.py \ loopback.sh \ nic_link_layer.py \ nic_performance.py \ @@ -38,3 +41,6 @@ include ../../../lib.mk # YNL build YNL_GENS := ethtool netdev include ../../../net/ynl.mk + +$(OUTPUT)/iou-zcrx: CFLAGS += -I/usr/include/ +$(OUTPUT)/iou-zcrx: LDLIBS += -luring diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c new file mode 100644 index 000000000000..29cd17114632 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c @@ -0,0 +1,432 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define PAGE_SIZE (4096) +#define AREA_SIZE (8192 * PAGE_SIZE) +#define SEND_SIZE (512 * 4096) +#define min(a, b) \ + ({ \ + typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + _a < _b ? _a : _b; \ + }) +#define min_t(t, a, b) \ + ({ \ + t _ta = (a); \ + t _tb = (b); \ + min(_ta, _tb); \ + }) + +#define ALIGN_UP(v, align) (((v) + (align) - 1) & ~((align) - 1)) + +static int cfg_family = PF_UNSPEC; +static int cfg_server = 0; +static int cfg_client = 0; +static int cfg_port = 8000; +static int cfg_payload_len; +static const char *cfg_ifname = NULL; +static int cfg_queue_id = -1; + +static socklen_t cfg_alen; +static struct sockaddr_storage cfg_addr; + +static char payload[SEND_SIZE] __attribute__((aligned(PAGE_SIZE))); +static void *area_ptr = NULL; +static void *ring_ptr = NULL; +static size_t ring_size = 0; +static struct io_uring_zcrx_rq rq_ring; +static unsigned long area_token; +static int connfd = 0; +static bool stop = false; +static size_t received = 0; + +static unsigned long gettimeofday_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); +} + +static inline size_t get_refill_ring_size(unsigned int rq_entries) +{ + size_t size; + + ring_size = rq_entries * sizeof(struct io_uring_zcrx_rqe); + /* add space for the header (head/tail/etc.) 
*/ + ring_size += PAGE_SIZE; + return ALIGN_UP(ring_size, 4096); +} + +static void setup_zcrx(struct io_uring *ring) +{ + unsigned int ifindex; + unsigned int rq_entries = 4096; + int ret; + + ifindex = if_nametoindex(cfg_ifname); + if (!ifindex) + error(1, 0, "bad interface name: %s", cfg_ifname); + + area_ptr = mmap(NULL, + AREA_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + 0, + 0); + if (area_ptr == MAP_FAILED) + error(1, 0, "mmap(): zero copy area"); + + ring_size = get_refill_ring_size(rq_entries); + ring_ptr = mmap(NULL, + ring_size, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + 0, + 0); + + struct io_uring_region_desc region_reg = { + .size = ring_size, + .user_addr = (__u64)(unsigned long)ring_ptr, + .flags = IORING_MEM_REGION_TYPE_USER, + }; + + struct io_uring_zcrx_area_reg area_reg = { + .addr = (__u64)(unsigned long)area_ptr, + .len = AREA_SIZE, + .flags = 0, + }; + + struct io_uring_zcrx_ifq_reg reg = { + .if_idx = ifindex, + .if_rxq = cfg_queue_id, + .rq_entries = rq_entries, + .area_ptr = (__u64)(unsigned long)&area_reg, + .region_ptr = (__u64)(unsigned long)®ion_reg, + }; + + ret = io_uring_register_ifq(ring, ®); + if (ret) + error(1, 0, "io_uring_register_ifq(): %d", ret); + + rq_ring.khead = (unsigned int*)((char*)ring_ptr + reg.offsets.head); + rq_ring.ktail = (unsigned int*)((char*)ring_ptr + reg.offsets.tail); + rq_ring.rqes = (struct io_uring_zcrx_rqe*)((char*)ring_ptr + reg.offsets.rqes); + rq_ring.rq_tail = 0; + rq_ring.ring_entries = reg.rq_entries; + + area_token = area_reg.rq_area_token; +} + +static void add_accept(struct io_uring *ring, int sockfd) +{ + struct io_uring_sqe *sqe; + + sqe = io_uring_get_sqe(ring); + + io_uring_prep_accept(sqe, sockfd, NULL, NULL, 0); + sqe->user_data = 1; +} + +static void add_recvzc(struct io_uring *ring, int sockfd) +{ + struct io_uring_sqe *sqe; + + sqe = io_uring_get_sqe(ring); + + io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, sockfd, NULL, 0, 0); + sqe->ioprio |= IORING_RECV_MULTISHOT; + sqe->user_data = 2; +} + +static void process_accept(struct io_uring *ring, struct io_uring_cqe *cqe) +{ + if (cqe->res < 0) + error(1, 0, "accept()"); + if (connfd) + error(1, 0, "Unexpected second connection"); + + connfd = cqe->res; + add_recvzc(ring, connfd); +} + +static void process_recvzc(struct io_uring *ring, struct io_uring_cqe *cqe) +{ + unsigned rq_mask = rq_ring.ring_entries - 1; + struct io_uring_zcrx_cqe *rcqe; + struct io_uring_zcrx_rqe* rqe; + struct io_uring_sqe *sqe; + uint64_t mask; + char *data; + ssize_t n; + int i; + + if (cqe->res == 0 && cqe->flags == 0) { + stop = true; + return; + } + + if (cqe->res < 0) + error(1, 0, "recvzc(): %d", cqe->res); + + if (!(cqe->flags & IORING_CQE_F_MORE)) + add_recvzc(ring, connfd); + + rcqe = (struct io_uring_zcrx_cqe*)(cqe + 1); + + n = cqe->res; + mask = (1ULL << IORING_ZCRX_AREA_SHIFT) - 1; + data = (char *)area_ptr + (rcqe->off & mask); + + for (i = 0; i < n; i++) { + if (*(data + i) != payload[(received + i)]) + error(1, 0, "payload mismatch"); + } + received += n; + + rqe = &rq_ring.rqes[(rq_ring.rq_tail & rq_mask)]; + rqe->off = (rcqe->off & IORING_ZCRX_AREA_MASK) | area_token; + rqe->len = cqe->res; + IO_URING_WRITE_ONCE(*rq_ring.ktail, ++rq_ring.rq_tail); +} + +static void server_loop(struct io_uring *ring) +{ + struct io_uring_cqe *cqe; + unsigned int count = 0; + unsigned int head; + int i, ret; + + io_uring_submit_and_wait(ring, 1); + + io_uring_for_each_cqe(ring, head, cqe) { + if (cqe->user_data == 1) + process_accept(ring, cqe); + else if 
(cqe->user_data == 2) + process_recvzc(ring, cqe); + else + error(1, 0, "unknown cqe"); + count++; + } + io_uring_cq_advance(ring, count); +} + +static void run_server() +{ + unsigned int flags = 0; + struct io_uring ring; + int fd, enable, ret; + uint64_t tstop; + + fd = socket(cfg_family, SOCK_STREAM, 0); + if (fd == -1) + error(1, 0, "socket()"); + + enable = 1; + ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)); + if (ret < 0) + error(1, 0, "setsockopt(SO_REUSEADDR)"); + + ret = bind(fd, (const struct sockaddr *)&cfg_addr, sizeof(cfg_addr)); + if (ret < 0) + error(1, 0, "bind()"); + + if (listen(fd, 1024) < 0) + error(1, 0, "listen()"); + + flags |= IORING_SETUP_COOP_TASKRUN; + flags |= IORING_SETUP_SINGLE_ISSUER; + flags |= IORING_SETUP_DEFER_TASKRUN; + flags |= IORING_SETUP_SUBMIT_ALL; + flags |= IORING_SETUP_CQE32; + + io_uring_queue_init(512, &ring, flags); + + setup_zcrx(&ring); + + add_accept(&ring, fd); + + tstop = gettimeofday_ms() + 5000; + while (!stop && gettimeofday_ms() < tstop) + server_loop(&ring); + + if (!stop) + error(1, 0, "test failed\n"); +} + +static void run_client() +{ + ssize_t to_send = SEND_SIZE; + ssize_t sent = 0; + ssize_t chunk, res; + int fd; + + fd = socket(cfg_family, SOCK_STREAM, 0); + if (fd == -1) + error(1, 0, "socket()"); + + if (connect(fd, (void *)&cfg_addr, cfg_alen)) + error(1, 0, "connect()"); + + while (to_send) { + void *src = &payload[sent]; + + chunk = min_t(ssize_t, cfg_payload_len, to_send); + res = send(fd, src, chunk, 0); + if (res < 0) + error(1, 0, "send(): %d", sent); + sent += res; + to_send -= res; + } + + close(fd); +} + +static void usage(const char *filepath) +{ + error(1, 0, "Usage: %s (-4|-6) (-s|-c) -h -p " + "-l -i -q", filepath); +} + +static void parse_opts(int argc, char **argv) +{ + const int max_payload_len = sizeof(payload) - + sizeof(struct ipv6hdr) - + sizeof(struct tcphdr) - + 40 /* max tcp options */; + struct sockaddr_in6 *addr6 = (void *) &cfg_addr; + struct sockaddr_in *addr4 = (void *) &cfg_addr; + char *addr = NULL; + int c; + + if (argc <= 1) + usage(argv[0]); + cfg_payload_len = max_payload_len; + + while ((c = getopt(argc, argv, "46sch:p:l:i:q:")) != -1) { + switch (c) { + case '4': + if (cfg_family != PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + cfg_family = PF_INET; + cfg_alen = sizeof(struct sockaddr_in); + break; + case '6': + if (cfg_family != PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + cfg_family = PF_INET6; + cfg_alen = sizeof(struct sockaddr_in6); + break; + case 's': + if (cfg_client) + error(1, 0, "Pass one of -s or -c"); + cfg_server = 1; + break; + case 'c': + if (cfg_server) + error(1, 0, "Pass one of -s or -c"); + cfg_client = 1; + break; + case 'h': + addr = optarg; + break; + case 'p': + cfg_port = strtoul(optarg, NULL, 0); + break; + case 'l': + cfg_payload_len = strtoul(optarg, NULL, 0); + break; + case 'i': + cfg_ifname = optarg; + break; + case 'q': + cfg_queue_id = strtoul(optarg, NULL, 0); + break; + } + } + + if (cfg_server && addr) + error(1, 0, "Receiver cannot have -h specified"); + + switch (cfg_family) { + case PF_INET: + memset(addr4, 0, sizeof(*addr4)); + addr4->sin_family = AF_INET; + addr4->sin_port = htons(cfg_port); + addr4->sin_addr.s_addr = htonl(INADDR_ANY); + + if (addr && + inet_pton(AF_INET, addr, &(addr4->sin_addr)) != 1) + error(1, 0, "ipv4 parse error: %s", addr); + break; + case PF_INET6: + memset(addr6, 0, sizeof(*addr6)); + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(cfg_port); + addr6->sin6_addr = in6addr_any; 
+ + if (addr && + inet_pton(AF_INET6, addr, &(addr6->sin6_addr)) != 1) + error(1, 0, "ipv6 parse error: %s", addr); + break; + default: + error(1, 0, "illegal domain"); + } + + if (cfg_payload_len > max_payload_len) + error(1, 0, "-l: payload exceeds max (%d)", max_payload_len); +} + +int main(int argc, char **argv) +{ + const char *cfg_test = argv[argc - 1]; + int i; + + parse_opts(argc, argv); + + for (i = 0; i < SEND_SIZE; i++) + payload[i] = 'a' + (i % 26); + + if (cfg_server) + run_server(); + else if (cfg_client) + run_client(); + + return 0; +} diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py new file mode 100755 index 000000000000..3998d0ad504f --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from os import path +from lib.py import ksft_run, ksft_exit +from lib.py import NetDrvEpEnv +from lib.py import bkg, cmd, wait_port_listen + + +def _get_rx_ring_entries(cfg): + eth_cmd = "ethtool -g {} | awk '/RX:/ {{count++}} count == 2 {{print $2; exit}}'" + res = cmd(eth_cmd.format(cfg.ifname), host=cfg.remote) + return int(res.stdout) + + +def _get_combined_channels(cfg): + eth_cmd = "ethtool -l {} | awk '/Combined:/ {{count++}} count == 2 {{print $2; exit}}'" + res = cmd(eth_cmd.format(cfg.ifname), host=cfg.remote) + return int(res.stdout) + + +def _set_flow_rule(cfg, chan): + eth_cmd = "ethtool -N {} flow-type tcp6 dst-port 9999 action {} | awk '{{print $NF}}'" + res = cmd(eth_cmd.format(cfg.ifname, chan), host=cfg.remote) + return int(res.stdout) + + +def test_zcrx(cfg) -> None: + cfg.require_v6() + cfg.require_cmd("awk", remote=True) + + combined_chans = _get_combined_channels(cfg) + if combined_chans < 2: + raise KsftSkipEx('at least 2 combined channels required') + rx_ring = _get_rx_ring_entries(cfg) + + rx_cmd = f"{cfg.bin_remote} -6 -s -p 9999 -i {cfg.ifname} -q {combined_chans - 1}" + tx_cmd = f"{cfg.bin_local} -6 -c -h {cfg.remote_v6} -p 9999 -l 12840" + + try: + cmd(f"ethtool -G {cfg.ifname} rx 64", host=cfg.remote) + cmd(f"ethtool -X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) + flow_rule_id = _set_flow_rule(cfg, combined_chans - 1) + + with bkg(rx_cmd, host=cfg.remote, exit_wait=True): + wait_port_listen(9999, proto="tcp", host=cfg.remote) + cmd(tx_cmd) + finally: + cmd(f"ethtool -N {cfg.ifname} delete {flow_rule_id}", host=cfg.remote) + cmd(f"ethtool -X {cfg.ifname} default", host=cfg.remote) + cmd(f"ethtool -G {cfg.ifname} rx {rx_ring}", host=cfg.remote) + + +def main() -> None: + with NetDrvEpEnv(__file__) as cfg: + cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx") + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + + ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main()
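
A note on the refill ring set up by setup_zcrx(): the ring lives in a
user-provided memory region described by io_uring_region_desc and is
registered together with the receive buffer area through
io_uring_register_ifq(). get_refill_ring_size() reserves space for
rq_entries refill entries plus one page for the head/tail header and
page-aligns the result. A minimal sketch of that arithmetic, assuming a
16-byte struct io_uring_zcrx_rqe (the test itself only relies on sizeof()):

  #include <stdio.h>

  int main(void)
  {
  	unsigned int rq_entries = 4096;		/* same value setup_zcrx() uses */
  	size_t rqe_size = 16;			/* assumed sizeof(struct io_uring_zcrx_rqe) */
  	size_t page_size = 4096;
  	size_t sz = rq_entries * rqe_size;	/* 65536 bytes of refill entries */

  	sz += page_size;			/* room for the head/tail header */
  	sz = (sz + page_size - 1) & ~(page_size - 1);	/* ALIGN_UP -> 69632 */
  	printf("refill ring region: %zu bytes\n", sz);
  	return 0;
  }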
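
The server ring is created with IORING_SETUP_CQE32 because every
IORING_OP_RECV_ZC completion carries a trailing struct io_uring_zcrx_cqe
whose off field says where in the registered area the payload landed;
process_recvzc() checks the bytes in place and then hands the buffer back
via the refill ring. A condensed sketch of that recycle step, written as a
standalone helper instead of against the file's globals (token is the
rq_area_token returned at registration):

  /* Return one completed zcrx buffer to the kernel; mirrors the tail of
   * process_recvzc() above. */
  static void recycle_zcrx_buf(struct io_uring_zcrx_rq *rq, __u64 token,
  			       const struct io_uring_zcrx_cqe *rcqe, __u32 len)
  {
  	unsigned int mask = rq->ring_entries - 1;
  	struct io_uring_zcrx_rqe *rqe = &rq->rqes[rq->rq_tail & mask];

  	/* combine the CQE offset with the area token, as the test does */
  	rqe->off = (rcqe->off & IORING_ZCRX_AREA_MASK) | token;
  	rqe->len = len;
  	/* publish the new tail so the kernel can reuse the buffer */
  	IO_URING_WRITE_ONCE(*rq->ktail, ++rq->rq_tail);
  }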
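
On the driver side, iou-zcrx.py isolates a hardware queue for the test
before starting the receiver: it requires at least two combined channels
on the remote NIC, shrinks the RX rings, restricts RSS to all but the last
channel, and installs an n-tuple rule steering the test flow to that last
channel (the same queue the receiver is told to use with -q). Everything
is restored in the finally block. In ethtool terms, with <N> standing for
the combined channel count reported by the NIC:

  ethtool -G <ifname> rx 64
  ethtool -X <ifname> equal <N - 1>
  ethtool -N <ifname> flow-type tcp6 dst-port 9999 action <N - 1>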