@@ -35,11 +35,6 @@
#include "vhost.h"
-static int experimental_zcopytx = 0;
-module_param(experimental_zcopytx, int, 0444);
-MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
- " 1 -Enable; 0 - Disable");
-
/* Max number of bytes transferred before requeueing the job.
* Using this limit prevents one virtqueue from starving others. */
#define VHOST_NET_WEIGHT 0x80000
@@ -50,25 +45,6 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
*/
#define VHOST_NET_PKT_WEIGHT 256
-/* MAX number of TX used buffers for outstanding zerocopy */
-#define VHOST_MAX_PEND 128
-#define VHOST_GOODCOPY_LEN 256
-
-/*
- * For transmit, used buffer len is unused; we override it to track buffer
- * status internally; used for zerocopy tx only.
- */
-/* Lower device DMA failed */
-#define VHOST_DMA_FAILED_LEN ((__force __virtio32)3)
-/* Lower device DMA done */
-#define VHOST_DMA_DONE_LEN ((__force __virtio32)2)
-/* Lower device DMA in progress */
-#define VHOST_DMA_IN_PROGRESS ((__force __virtio32)1)
-/* Buffer unused */
-#define VHOST_DMA_CLEAR_LEN ((__force __virtio32)0)
-
-#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)
-
enum {
VHOST_NET_FEATURES = VHOST_FEATURES |
(1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
@@ -87,17 +63,6 @@ enum {
VHOST_NET_VQ_MAX = 2,
};
-struct vhost_net_ubuf_ref {
- /* refcount follows semantics similar to kref:
- * 0: object is released
- * 1: no outstanding ubufs
- * >1: outstanding ubufs
- */
- atomic_t refcount;
- wait_queue_head_t wait;
- struct vhost_virtqueue *vq;
-};
-
#define VHOST_NET_BATCH 64
struct vhost_net_buf {
void **queue;
@@ -109,20 +74,9 @@ struct vhost_net_virtqueue {
struct vhost_virtqueue vq;
size_t vhost_hlen;
size_t sock_hlen;
- /* vhost zerocopy support fields below: */
- /* last used idx for outstanding DMA zerocopy buffers */
- int upend_idx;
- /* For TX, first used idx for DMA done zerocopy buffers
- * For RX, number of batched heads
- */
int done_idx;
/* Number of XDP frames batched */
int batched_xdp;
- /* an array of userspace buffers info */
- struct ubuf_info_msgzc *ubuf_info;
- /* Reference counting for outstanding ubufs.
- * Protected by vq mutex. Writers must also take device mutex. */
- struct vhost_net_ubuf_ref *ubufs;
struct ptr_ring *rx_ring;
struct vhost_net_buf rxq;
/* Batched XDP buffs */
@@ -133,20 +87,10 @@ struct vhost_net {
struct vhost_dev dev;
struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
- /* Number of TX recently submitted.
- * Protected by tx vq lock. */
- unsigned tx_packets;
- /* Number of times zerocopy TX recently failed.
- * Protected by tx vq lock. */
- unsigned tx_zcopy_err;
- /* Flush in progress. Protected by tx vq lock. */
- bool tx_flush;
/* Private page frag cache */
struct page_frag_cache pf_cache;
};
-static unsigned vhost_net_zcopy_mask __read_mostly;
-
static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq)
{
if (rxq->tail != rxq->head)
@@ -224,90 +168,12 @@ static void vhost_net_buf_init(struct vhost_net_buf *rxq)
rxq->head = rxq->tail = 0;
}
-static void vhost_net_enable_zcopy(int vq)
-{
- vhost_net_zcopy_mask |= 0x1 << vq;
-}
-
-static struct vhost_net_ubuf_ref *
-vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
-{
- struct vhost_net_ubuf_ref *ubufs;
- /* No zero copy backend? Nothing to count. */
- if (!zcopy)
- return NULL;
- ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
- if (!ubufs)
- return ERR_PTR(-ENOMEM);
- atomic_set(&ubufs->refcount, 1);
- init_waitqueue_head(&ubufs->wait);
- ubufs->vq = vq;
- return ubufs;
-}
-
-static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
-{
- int r = atomic_sub_return(1, &ubufs->refcount);
- if (unlikely(!r))
- wake_up(&ubufs->wait);
- return r;
-}
-
-static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
-{
- vhost_net_ubuf_put(ubufs);
- wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
-}
-
-static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
-{
- vhost_net_ubuf_put_and_wait(ubufs);
- kfree(ubufs);
-}
-
-static void vhost_net_clear_ubuf_info(struct vhost_net *n)
-{
- int i;
-
- for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
- kfree(n->vqs[i].ubuf_info);
- n->vqs[i].ubuf_info = NULL;
- }
-}
-
-static int vhost_net_set_ubuf_info(struct vhost_net *n)
-{
- bool zcopy;
- int i;
-
- for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
- zcopy = vhost_net_zcopy_mask & (0x1 << i);
- if (!zcopy)
- continue;
- n->vqs[i].ubuf_info =
- kmalloc_array(UIO_MAXIOV,
- sizeof(*n->vqs[i].ubuf_info),
- GFP_KERNEL);
- if (!n->vqs[i].ubuf_info)
- goto err;
- }
- return 0;
-
-err:
- vhost_net_clear_ubuf_info(n);
- return -ENOMEM;
-}
-
static void vhost_net_vq_reset(struct vhost_net *n)
{
int i;
- vhost_net_clear_ubuf_info(n);
-
for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
n->vqs[i].done_idx = 0;
- n->vqs[i].upend_idx = 0;
- n->vqs[i].ubufs = NULL;
n->vqs[i].vhost_hlen = 0;
n->vqs[i].sock_hlen = 0;
vhost_net_buf_init(&n->vqs[i].rxq);
@@ -315,103 +181,11 @@ static void vhost_net_vq_reset(struct vhost_net *n)
}
-static void vhost_net_tx_packet(struct vhost_net *net)
-{
- ++net->tx_packets;
- if (net->tx_packets < 1024)
- return;
- net->tx_packets = 0;
- net->tx_zcopy_err = 0;
-}
-
-static void vhost_net_tx_err(struct vhost_net *net)
-{
- ++net->tx_zcopy_err;
-}
-
-static bool vhost_net_tx_select_zcopy(struct vhost_net *net)
-{
- /* TX flush waits for outstanding DMAs to be done.
- * Don't start new DMAs.
- */
- return !net->tx_flush &&
- net->tx_packets / 64 >= net->tx_zcopy_err;
-}
-
-static bool vhost_sock_zcopy(struct socket *sock)
-{
- return unlikely(experimental_zcopytx) &&
- sock_flag(sock->sk, SOCK_ZEROCOPY);
-}
-
static bool vhost_sock_xdp(struct socket *sock)
{
return sock_flag(sock->sk, SOCK_XDP);
}
-/* In case of DMA done not in order in lower device driver for some reason.
- * upend_idx is used to track end of used idx, done_idx is used to track head
- * of used idx. Once lower device DMA done contiguously, we will signal KVM
- * guest used idx.
- */
-static void vhost_zerocopy_signal_used(struct vhost_net *net,
- struct vhost_virtqueue *vq)
-{
- struct vhost_net_virtqueue *nvq =
- container_of(vq, struct vhost_net_virtqueue, vq);
- int i, add;
- int j = 0;
-
- for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
- if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
- vhost_net_tx_err(net);
- if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
- vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
- ++j;
- } else
- break;
- }
- while (j) {
- add = min(UIO_MAXIOV - nvq->done_idx, j);
- vhost_add_used_and_signal_n(vq->dev, vq,
- &vq->heads[nvq->done_idx], add);
- nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
- j -= add;
- }
-}
-
-static void vhost_zerocopy_complete(struct sk_buff *skb,
- struct ubuf_info *ubuf_base, bool success)
-{
- struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base);
- struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
- struct vhost_virtqueue *vq = ubufs->vq;
- int cnt;
-
- rcu_read_lock_bh();
-
- /* set len to mark this desc buffers done DMA */
- vq->heads[ubuf->desc].len = success ?
- VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
- cnt = vhost_net_ubuf_put(ubufs);
-
- /*
- * Trigger polling thread if guest stopped submitting new buffers:
- * in this case, the refcount after decrement will eventually reach 1.
- * We also trigger polling periodically after each 16 packets
- * (the value 16 here is more or less arbitrary, it's tuned to trigger
- * less than 10% of times).
- */
- if (cnt <= 1 || !(cnt % 16))
- vhost_poll_queue(&vq->poll);
-
- rcu_read_unlock_bh();
-}
-
-static const struct ubuf_info_ops vhost_ubuf_ops = {
- .complete = vhost_zerocopy_complete,
-};
-
static inline unsigned long busy_clock(void)
{
return local_clock() >> 10;
@@ -585,10 +359,7 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
if (r == tvq->num && tvq->busyloop_timeout) {
/* Flush batched packets first */
- if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq)))
- vhost_tx_batch(net, tnvq,
- vhost_vq_get_backend(tvq),
- msghdr);
+ vhost_tx_batch(net, tnvq, vhost_vq_get_backend(tvq), msghdr);
vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
@@ -599,15 +370,6 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
return r;
}
-static bool vhost_exceeds_maxpend(struct vhost_net *net)
-{
- struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
- struct vhost_virtqueue *vq = &nvq->vq;
-
- return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV >
- min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
-}
-
static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
size_t hdr_size, int out)
{
@@ -828,113 +590,6 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
vhost_tx_batch(net, nvq, sock, &msg);
}
-static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
-{
- struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
- struct vhost_virtqueue *vq = &nvq->vq;
- unsigned out, in;
- int head;
- struct msghdr msg = {
- .msg_name = NULL,
- .msg_namelen = 0,
- .msg_control = NULL,
- .msg_controllen = 0,
- .msg_flags = MSG_DONTWAIT,
- };
- struct tun_msg_ctl ctl;
- size_t len, total_len = 0;
- int err;
- struct vhost_net_ubuf_ref *ubufs;
- struct ubuf_info_msgzc *ubuf;
- bool zcopy_used;
- int sent_pkts = 0;
-
- do {
- bool busyloop_intr;
-
- /* Release DMAs done buffers first */
- vhost_zerocopy_signal_used(net, vq);
-
- busyloop_intr = false;
- head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
- &busyloop_intr);
- /* On error, stop handling until the next kick. */
- if (unlikely(head < 0))
- break;
- /* Nothing new? Wait for eventfd to tell us they refilled. */
- if (head == vq->num) {
- if (unlikely(busyloop_intr)) {
- vhost_poll_queue(&vq->poll);
- } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
- vhost_disable_notify(&net->dev, vq);
- continue;
- }
- break;
- }
-
- zcopy_used = len >= VHOST_GOODCOPY_LEN
- && !vhost_exceeds_maxpend(net)
- && vhost_net_tx_select_zcopy(net);
-
- /* use msg_control to pass vhost zerocopy ubuf info to skb */
- if (zcopy_used) {
- ubuf = nvq->ubuf_info + nvq->upend_idx;
- vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
- vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
- ubuf->ctx = nvq->ubufs;
- ubuf->desc = nvq->upend_idx;
- ubuf->ubuf.ops = &vhost_ubuf_ops;
- ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG;
- refcount_set(&ubuf->ubuf.refcnt, 1);
- msg.msg_control = &ctl;
- ctl.type = TUN_MSG_UBUF;
- ctl.ptr = &ubuf->ubuf;
- msg.msg_controllen = sizeof(ctl);
- ubufs = nvq->ubufs;
- atomic_inc(&ubufs->refcount);
- nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
- } else {
- msg.msg_control = NULL;
- ubufs = NULL;
- }
- total_len += len;
- if (tx_can_batch(vq, total_len) &&
- likely(!vhost_exceeds_maxpend(net))) {
- msg.msg_flags |= MSG_MORE;
- } else {
- msg.msg_flags &= ~MSG_MORE;
- }
-
- err = sock->ops->sendmsg(sock, &msg, len);
- if (unlikely(err < 0)) {
- bool retry = err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS;
-
- if (zcopy_used) {
- if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS)
- vhost_net_ubuf_put(ubufs);
- if (retry)
- nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
- % UIO_MAXIOV;
- else
- vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
- }
- if (retry) {
- vhost_discard_vq_desc(vq, 1);
- vhost_net_enable_vq(net, vq);
- break;
- }
- pr_debug("Fail to send packet: err %d", err);
- } else if (unlikely(err != len))
- pr_debug("Truncated TX packet: "
- " len %d != %zd\n", err, len);
- if (!zcopy_used)
- vhost_add_used_and_signal(&net->dev, vq, head, 0);
- else
- vhost_zerocopy_signal_used(net, vq);
- vhost_net_tx_packet(net);
- } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
-}
-
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
@@ -954,10 +609,7 @@ static void handle_tx(struct vhost_net *net)
vhost_disable_notify(&net->dev, vq);
vhost_net_disable_vq(net, vq);
- if (vhost_sock_zcopy(sock))
- handle_tx_zerocopy(net, sock);
- else
- handle_tx_copy(net, sock);
+ handle_tx_copy(net, sock);
out:
mutex_unlock(&vq->mutex);
@@ -1307,9 +959,6 @@ static int vhost_net_open(struct inode *inode, struct file *f)
n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
- n->vqs[i].ubufs = NULL;
- n->vqs[i].ubuf_info = NULL;
- n->vqs[i].upend_idx = 0;
n->vqs[i].done_idx = 0;
n->vqs[i].batched_xdp = 0;
n->vqs[i].vhost_hlen = 0;
@@ -1360,17 +1009,6 @@ static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
static void vhost_net_flush(struct vhost_net *n)
{
vhost_dev_flush(&n->dev);
- if (n->vqs[VHOST_NET_VQ_TX].ubufs) {
- mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
- n->tx_flush = true;
- mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
- /* Wait for all lower device DMAs done. */
- vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
- mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
- n->tx_flush = false;
- atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1);
- mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
- }
}
static int vhost_net_release(struct inode *inode, struct file *f)
@@ -1476,7 +1114,6 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
struct socket *sock, *oldsock;
struct vhost_virtqueue *vq;
struct vhost_net_virtqueue *nvq;
- struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL;
int r;
mutex_lock(&n->dev.mutex);
@@ -1509,13 +1146,6 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
/* start polling new socket */
oldsock = vhost_vq_get_backend(vq);
if (sock != oldsock) {
- ubufs = vhost_net_ubuf_alloc(vq,
- sock && vhost_sock_zcopy(sock));
- if (IS_ERR(ubufs)) {
- r = PTR_ERR(ubufs);
- goto err_ubufs;
- }
-
vhost_net_disable_vq(n, vq);
vhost_vq_set_backend(vq, sock);
vhost_net_buf_unproduce(nvq);
@@ -1531,24 +1161,10 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
else
nvq->rx_ring = NULL;
}
-
- oldubufs = nvq->ubufs;
- nvq->ubufs = ubufs;
-
- n->tx_packets = 0;
- n->tx_zcopy_err = 0;
- n->tx_flush = false;
}
mutex_unlock(&vq->mutex);
- if (oldubufs) {
- vhost_net_ubuf_put_wait_and_free(oldubufs);
- mutex_lock(&vq->mutex);
- vhost_zerocopy_signal_used(n, vq);
- mutex_unlock(&vq->mutex);
- }
-
if (oldsock) {
vhost_dev_flush(&n->dev);
sockfd_put(oldsock);
@@ -1560,9 +1176,6 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
err_used:
vhost_vq_set_backend(vq, oldsock);
vhost_net_enable_vq(n, vq);
- if (ubufs)
- vhost_net_ubuf_put_wait_and_free(ubufs);
-err_ubufs:
if (sock)
sockfd_put(sock);
err_vq:
@@ -1654,12 +1267,7 @@ static long vhost_net_set_owner(struct vhost_net *n)
r = -EBUSY;
goto out;
}
- r = vhost_net_set_ubuf_info(n);
- if (r)
- goto out;
r = vhost_dev_set_owner(&n->dev);
- if (r)
- vhost_net_clear_ubuf_info(n);
vhost_net_flush(n);
out:
mutex_unlock(&n->dev.mutex);
@@ -1768,8 +1376,6 @@ static struct miscdevice vhost_net_misc = {
static int __init vhost_net_init(void)
{
- if (experimental_zcopytx)
- vhost_net_enable_zcopy(VHOST_NET_VQ_TX);
return misc_register(&vhost_net_misc);
}
module_init(vhost_net_init);
Commit 098eadce3c62 ("vhost_net: disable zerocopy by default") disabled
the module parameter for the handle_tx_zerocopy path back in 2019,
noting that many downstream distributions (e.g., RHEL7 and later) had
already done the same. Both the upstream and downstream disablement
suggest this path is rarely used.

Testing the module parameter shows that while the path allows packet
forwarding, the zerocopy functionality itself is broken. On outbound
traffic (guest TX -> external), zerocopy SKBs are orphaned by either
skb_orphan_frags_rx() (used with the tun driver via tun_net_xmit()) or
skb_orphan_frags() elsewhere in the stack, as vhost_net does not set
SKBFL_DONT_ORPHAN. Orphaning enforces a memcpy and triggers the
completion callback, which increments the failed TX counter,
effectively disabling zerocopy again.

Even after addressing these issues to prevent SKB orphaning and error
counter increments, performance remains poor. By default, only 64
messages can be zerocopied, which is immediately exhausted by workloads
like iperf, resulting in most messages being memcpy'd anyhow.
Additionally, memcpy'd messages do not benefit from the XDP batching
optimizations present in the handle_tx_copy path.

Given these limitations and the lack of any tangible benefits, remove
zerocopy entirely to simplify the code base.

Signed-off-by: Jon Kohler <jon@nutanix.com>
---
 drivers/vhost/net.c | 398 +-------------------------------------------
 1 file changed, 2 insertions(+), 396 deletions(-)
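
Note for reviewers (not part of this patch): a rough sketch of the
orphaning checks referenced above, paraphrased from
include/linux/skbuff.h; exact details vary by kernel version.
skb_orphan_frags() exempts a zerocopy skb from copying only when
SKBFL_DONT_ORPHAN is set (which vhost_net never sets), and the rx-path
variant copies unconditionally for any zerocopy skb:

	static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
	{
		/* Nothing to do for non-zerocopy skbs */
		if (likely(!skb_zcopy(skb)))
			return 0;
		/* Only SKBFL_DONT_ORPHAN exempts a zerocopy skb from copying */
		if (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN)
			return 0;
		/* Copy user frags to kernel pages and complete the ubuf with
		 * success == false, which is what bumps vhost's failed-TX counter.
		 */
		return skb_copy_ubufs(skb, gfp_mask);
	}

	/* Frags must be orphaned if the skb might loop back to the rx path */
	static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask)
	{
		if (likely(!skb_zcopy(skb)))
			return 0;
		return skb_copy_ubufs(skb, gfp_mask);
	}

The 64-message ceiling mentioned above comes from the (now removed)
vhost_exceeds_maxpend() check, min_t(unsigned int, VHOST_MAX_PEND,
vq->num >> 2), which works out to 64 for a typical 256-entry ring.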