Message ID | 20240614063933.108811-10-xuanzhuo@linux.alibaba.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | virtio-net: support AF_XDP zero copy | expand |
On Fri, Jun 14, 2024 at 2:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote: > > This patch implement the logic of bind/unbind xsk pool to sq and rq. > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > --- > drivers/net/virtio_net.c | 201 ++++++++++++++++++++++++++++++++++++++- > 1 file changed, 200 insertions(+), 1 deletion(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index 88ab9ea1646f..35fd8bca7fcf 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -26,6 +26,7 @@ > #include <net/netdev_rx_queue.h> > #include <net/netdev_queues.h> > #include <uapi/linux/virtio_ring.h> > +#include <net/xdp_sock_drv.h> > > static int napi_weight = NAPI_POLL_WEIGHT; > module_param(napi_weight, int, 0444); > @@ -57,6 +58,8 @@ DECLARE_EWMA(pkt_len, 0, 64) > > #define VIRTNET_DRIVER_VERSION "1.0.0" > > +static struct virtio_net_hdr_mrg_rxbuf xsk_hdr; Does this mean AF_XDP only supports virtio_net_hdr_mrg_rxbuf but not others? > + > static const unsigned long guest_offloads[] = { > VIRTIO_NET_F_GUEST_TSO4, > VIRTIO_NET_F_GUEST_TSO6, > @@ -321,6 +324,12 @@ struct send_queue { > bool premapped; > > struct virtnet_sq_dma_info dmainfo; > + > + struct { > + struct xsk_buff_pool *pool; > + > + dma_addr_t hdr_dma_address; > + } xsk; > }; > > /* Internal representation of a receive virtqueue */ > @@ -372,6 +381,13 @@ struct receive_queue { > > /* Record the last dma info to free after new pages is allocated. */ > struct virtnet_rq_dma *last_dma; > + > + struct { > + struct xsk_buff_pool *pool; > + > + /* xdp rxq used by xsk */ > + struct xdp_rxq_info xdp_rxq; > + } xsk; > }; > > /* This structure can contain rss message with maximum settings for indirection table and keysize > @@ -695,7 +711,7 @@ static void virtnet_sq_free_dma_meta(struct send_queue *sq) > /* This function must be called immediately after creating the vq, or after vq > * reset, and before adding any buffers to it. 
> */ > -static __maybe_unused int virtnet_sq_set_premapped(struct send_queue *sq, bool premapped) > +static int virtnet_sq_set_premapped(struct send_queue *sq, bool premapped) > { > if (premapped) { > int r; > @@ -5177,6 +5193,187 @@ static int virtnet_restore_guest_offloads(struct virtnet_info *vi) > return virtnet_set_guest_offloads(vi, offloads); > } > > +static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queue *rq, > + struct xsk_buff_pool *pool) > +{ > + int err, qindex; > + > + qindex = rq - vi->rq; > + > + if (pool) { > + err = xdp_rxq_info_reg(&rq->xsk.xdp_rxq, vi->dev, qindex, rq->napi.napi_id); > + if (err < 0) > + return err; > + > + err = xdp_rxq_info_reg_mem_model(&rq->xsk.xdp_rxq, > + MEM_TYPE_XSK_BUFF_POOL, NULL); > + if (err < 0) { > + xdp_rxq_info_unreg(&rq->xsk.xdp_rxq); > + return err; > + } > + > + xsk_pool_set_rxq_info(pool, &rq->xsk.xdp_rxq); > + } > + > + virtnet_rx_pause(vi, rq); > + > + err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf); > + if (err) { > + netdev_err(vi->dev, "reset rx fail: rx queue index: %d err: %d\n", qindex, err); > + > + pool = NULL; > + } > + > + if (!pool) > + xdp_rxq_info_unreg(&rq->xsk.xdp_rxq); Let's use err label instead of duplicating xdp_rxq_info_unreg() here? 
> + > + rq->xsk.pool = pool; > + > + virtnet_rx_resume(vi, rq); > + > + return err; > +} > + > +static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi, > + struct send_queue *sq, > + struct xsk_buff_pool *pool) > +{ > + int err, qindex; > + > + qindex = sq - vi->sq; > + > + virtnet_tx_pause(vi, sq); > + > + err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf); > + if (err) > + netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err); > + else > + err = virtnet_sq_set_premapped(sq, !!pool); > + > + if (err) > + pool = NULL; > + > + sq->xsk.pool = pool; > + > + virtnet_tx_resume(vi, sq); > + > + return err; > +} > + > +static int virtnet_xsk_pool_enable(struct net_device *dev, > + struct xsk_buff_pool *pool, > + u16 qid) > +{ > + struct virtnet_info *vi = netdev_priv(dev); > + struct receive_queue *rq; > + struct send_queue *sq; > + struct device *dma_dev; > + dma_addr_t hdr_dma; > + int err; > + > + /* In big_packets mode, xdp cannot work, so there is no need to > + * initialize xsk of rq. > + * > + * Support for small mode firstly. This comment is kind of confusing, I think mergeable mode is also supported. If it's true, we can simply remove it. > + */ > + if (vi->big_packets) > + return -ENOENT; > + > + if (qid >= vi->curr_queue_pairs) > + return -EINVAL; > + > + sq = &vi->sq[qid]; > + rq = &vi->rq[qid]; > + > + /* xsk tx zerocopy depend on the tx napi. > + * > + * All xsk packets are actually consumed and sent out from the xsk tx > + * queue under the tx napi mechanism. > + */ > + if (!sq->napi.weight) > + return -EPERM; > + > + /* For the xsk, the tx and rx should have the same device. But > + * vq->dma_dev allows every vq has the respective dma dev. So I check > + * the dma dev of vq and sq is the same dev. > + */ > + if (virtqueue_dma_dev(rq->vq) != virtqueue_dma_dev(sq->vq)) > + return -EPERM; I don't understand how a different DMA device matters here. It looks like the code is using per virtqueue DMA below. 
> + > + dma_dev = virtqueue_dma_dev(rq->vq); > + if (!dma_dev) > + return -EPERM; > + > + hdr_dma = dma_map_single(dma_dev, &xsk_hdr, vi->hdr_len, DMA_TO_DEVICE); > + if (dma_mapping_error(dma_dev, hdr_dma)) > + return -ENOMEM; > + > + err = xsk_pool_dma_map(pool, dma_dev, 0); > + if (err) > + goto err_xsk_map; > + > + err = virtnet_rq_bind_xsk_pool(vi, rq, pool); > + if (err) > + goto err_rq; > + > + err = virtnet_sq_bind_xsk_pool(vi, sq, pool); > + if (err) > + goto err_sq; > + > + /* Now, we do not support tx offset, so all the tx virtnet hdr is zero. > + * So all the tx packets can share a single hdr. > + */ > + sq->xsk.hdr_dma_address = hdr_dma; > + > + return 0; > + > +err_sq: > + virtnet_rq_bind_xsk_pool(vi, rq, NULL); > +err_rq: > + xsk_pool_dma_unmap(pool, 0); > +err_xsk_map: > + dma_unmap_single(dma_dev, hdr_dma, vi->hdr_len, DMA_TO_DEVICE); > + return err; > +} > + > +static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid) > +{ > + struct virtnet_info *vi = netdev_priv(dev); > + struct xsk_buff_pool *pool; > + struct device *dma_dev; > + struct receive_queue *rq; > + struct send_queue *sq; > + int err1, err2; > + > + if (qid >= vi->curr_queue_pairs) > + return -EINVAL; > + > + sq = &vi->sq[qid]; > + rq = &vi->rq[qid]; > + > + pool = sq->xsk.pool; > + > + err1 = virtnet_sq_bind_xsk_pool(vi, sq, NULL); > + err2 = virtnet_rq_bind_xsk_pool(vi, rq, NULL); > + > + xsk_pool_dma_unmap(pool, 0); > + > + dma_dev = virtqueue_dma_dev(rq->vq); > + > + dma_unmap_single(dma_dev, sq->xsk.hdr_dma_address, vi->hdr_len, DMA_TO_DEVICE); > + > + return err1 | err2; > +} > + > +static int virtnet_xsk_pool_setup(struct net_device *dev, struct netdev_bpf *xdp) > +{ > + if (xdp->xsk.pool) > + return virtnet_xsk_pool_enable(dev, xdp->xsk.pool, > + xdp->xsk.queue_id); > + else > + return virtnet_xsk_pool_disable(dev, xdp->xsk.queue_id); > +} > + > static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, > struct netlink_ext_ack *extack) > { > @@ 
-5302,6 +5499,8 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) > switch (xdp->command) { > case XDP_SETUP_PROG: > return virtnet_xdp_set(dev, xdp->prog, xdp->extack); > + case XDP_SETUP_XSK_POOL: > + return virtnet_xsk_pool_setup(dev, xdp); > default: > return -EINVAL; > } > -- > 2.32.0.3.g01195cf9f > Thanks
On Mon, 17 Jun 2024 14:19:10 +0800, Jason Wang <jasowang@redhat.com> wrote: > On Fri, Jun 14, 2024 at 2:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote: > > > > This patch implement the logic of bind/unbind xsk pool to sq and rq. > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > --- > > drivers/net/virtio_net.c | 201 ++++++++++++++++++++++++++++++++++++++- > > 1 file changed, 200 insertions(+), 1 deletion(-) > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > index 88ab9ea1646f..35fd8bca7fcf 100644 > > --- a/drivers/net/virtio_net.c > > +++ b/drivers/net/virtio_net.c > > @@ -26,6 +26,7 @@ > > #include <net/netdev_rx_queue.h> > > #include <net/netdev_queues.h> > > #include <uapi/linux/virtio_ring.h> > > +#include <net/xdp_sock_drv.h> > > > > static int napi_weight = NAPI_POLL_WEIGHT; > > module_param(napi_weight, int, 0444); > > @@ -57,6 +58,8 @@ DECLARE_EWMA(pkt_len, 0, 64) > > > > #define VIRTNET_DRIVER_VERSION "1.0.0" > > > > +static struct virtio_net_hdr_mrg_rxbuf xsk_hdr; > > Does this mean AF_XDP only supports virtio_net_hdr_mrg_rxbuf but not others? Sorry, this is the old code. Should be virtio_net_common_hdr. Here we should use the max size of the virtio-net header. > > > + > > static const unsigned long guest_offloads[] = { > > VIRTIO_NET_F_GUEST_TSO4, > > VIRTIO_NET_F_GUEST_TSO6, > > @@ -321,6 +324,12 @@ struct send_queue { > > bool premapped; > > > > struct virtnet_sq_dma_info dmainfo; > > + > > + struct { > > + struct xsk_buff_pool *pool; > > + > > + dma_addr_t hdr_dma_address; > > + } xsk; > > }; > > > > /* Internal representation of a receive virtqueue */ > > @@ -372,6 +381,13 @@ struct receive_queue { > > > > /* Record the last dma info to free after new pages is allocated. 
*/ > > struct virtnet_rq_dma *last_dma; > > + > > + struct { > > + struct xsk_buff_pool *pool; > > + > > + /* xdp rxq used by xsk */ > > + struct xdp_rxq_info xdp_rxq; > > + } xsk; > > }; > > > > /* This structure can contain rss message with maximum settings for indirection table and keysize > > @@ -695,7 +711,7 @@ static void virtnet_sq_free_dma_meta(struct send_queue *sq) > > /* This function must be called immediately after creating the vq, or after vq > > * reset, and before adding any buffers to it. > > */ > > -static __maybe_unused int virtnet_sq_set_premapped(struct send_queue *sq, bool premapped) > > +static int virtnet_sq_set_premapped(struct send_queue *sq, bool premapped) > > { > > if (premapped) { > > int r; > > @@ -5177,6 +5193,187 @@ static int virtnet_restore_guest_offloads(struct virtnet_info *vi) > > return virtnet_set_guest_offloads(vi, offloads); > > } > > > > +static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queue *rq, > > + struct xsk_buff_pool *pool) > > +{ > > + int err, qindex; > > + > > + qindex = rq - vi->rq; > > + > > + if (pool) { > > + err = xdp_rxq_info_reg(&rq->xsk.xdp_rxq, vi->dev, qindex, rq->napi.napi_id); > > + if (err < 0) > > + return err; > > + > > + err = xdp_rxq_info_reg_mem_model(&rq->xsk.xdp_rxq, > > + MEM_TYPE_XSK_BUFF_POOL, NULL); > > + if (err < 0) { > > + xdp_rxq_info_unreg(&rq->xsk.xdp_rxq); > > + return err; > > + } > > + > > + xsk_pool_set_rxq_info(pool, &rq->xsk.xdp_rxq); > > + } > > + > > + virtnet_rx_pause(vi, rq); > > + > > + err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf); > > + if (err) { > > + netdev_err(vi->dev, "reset rx fail: rx queue index: %d err: %d\n", qindex, err); > > + > > + pool = NULL; > > + } > > + > > + if (!pool) > > + xdp_rxq_info_unreg(&rq->xsk.xdp_rxq); > > Let's use err label instead of duplicating xdp_rxq_info_unreg() here? 
> > > + > > + rq->xsk.pool = pool; > > + > > + virtnet_rx_resume(vi, rq); > > + > > + return err; > > +} > > + > > +static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi, > > + struct send_queue *sq, > > + struct xsk_buff_pool *pool) > > +{ > > + int err, qindex; > > + > > + qindex = sq - vi->sq; > > + > > + virtnet_tx_pause(vi, sq); > > + > > + err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf); > > + if (err) > > + netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err); > > + else > > + err = virtnet_sq_set_premapped(sq, !!pool); > > + > > + if (err) > > + pool = NULL; > > + > > + sq->xsk.pool = pool; > > + > > + virtnet_tx_resume(vi, sq); > > + > > + return err; > > +} > > + > > +static int virtnet_xsk_pool_enable(struct net_device *dev, > > + struct xsk_buff_pool *pool, > > + u16 qid) > > +{ > > + struct virtnet_info *vi = netdev_priv(dev); > > + struct receive_queue *rq; > > + struct send_queue *sq; > > + struct device *dma_dev; > > + dma_addr_t hdr_dma; > > + int err; > > + > > + /* In big_packets mode, xdp cannot work, so there is no need to > > + * initialize xsk of rq. > > + * > > + * Support for small mode firstly. > > This comment is kind of confusing, I think mergeable mode is also > supported. If it's true, we can simply remove it. For the commit num limit of the net-next, I have to remove some commits. So the mergeable mode is not supported by this patch set. I plan to support the merge mode after this patch set. > > > + */ > > + if (vi->big_packets) > > + return -ENOENT; > > + > > + if (qid >= vi->curr_queue_pairs) > > + return -EINVAL; > > + > > + sq = &vi->sq[qid]; > > + rq = &vi->rq[qid]; > > + > > + /* xsk tx zerocopy depend on the tx napi. > > + * > > + * All xsk packets are actually consumed and sent out from the xsk tx > > + * queue under the tx napi mechanism. > > + */ > > + if (!sq->napi.weight) > > + return -EPERM; > > + > > + /* For the xsk, the tx and rx should have the same device. 
But > > + * vq->dma_dev allows every vq has the respective dma dev. So I check > > + * the dma dev of vq and sq is the same dev. > > + */ > > + if (virtqueue_dma_dev(rq->vq) != virtqueue_dma_dev(sq->vq)) > > + return -EPERM; > > I don't understand how a different DMA device matters here. It looks > like the code is using per virtqueue DMA below. The af-xdp may use one buffer to receive from the rx and reuse this buffer to send by the tx. So the dma dev of sq and rq should be the same one. Thanks. > > > + > > + dma_dev = virtqueue_dma_dev(rq->vq); > > + if (!dma_dev) > > + return -EPERM; > > + > > + hdr_dma = dma_map_single(dma_dev, &xsk_hdr, vi->hdr_len, DMA_TO_DEVICE); > > + if (dma_mapping_error(dma_dev, hdr_dma)) > > + return -ENOMEM; > > + > > + err = xsk_pool_dma_map(pool, dma_dev, 0); > > + if (err) > > + goto err_xsk_map; > > + > > + err = virtnet_rq_bind_xsk_pool(vi, rq, pool); > > + if (err) > > + goto err_rq; > > + > > + err = virtnet_sq_bind_xsk_pool(vi, sq, pool); > > + if (err) > > + goto err_sq; > > + > > + /* Now, we do not support tx offset, so all the tx virtnet hdr is zero. > > + * So all the tx packets can share a single hdr. 
> > + */ > > + sq->xsk.hdr_dma_address = hdr_dma; > > + > > + return 0; > > + > > +err_sq: > > + virtnet_rq_bind_xsk_pool(vi, rq, NULL); > > +err_rq: > > + xsk_pool_dma_unmap(pool, 0); > > +err_xsk_map: > > + dma_unmap_single(dma_dev, hdr_dma, vi->hdr_len, DMA_TO_DEVICE); > > + return err; > > +} > > + > > +static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid) > > +{ > > + struct virtnet_info *vi = netdev_priv(dev); > > + struct xsk_buff_pool *pool; > > + struct device *dma_dev; > > + struct receive_queue *rq; > > + struct send_queue *sq; > > + int err1, err2; > > + > > + if (qid >= vi->curr_queue_pairs) > > + return -EINVAL; > > + > > + sq = &vi->sq[qid]; > > + rq = &vi->rq[qid]; > > + > > + pool = sq->xsk.pool; > > + > > + err1 = virtnet_sq_bind_xsk_pool(vi, sq, NULL); > > + err2 = virtnet_rq_bind_xsk_pool(vi, rq, NULL); > > + > > + xsk_pool_dma_unmap(pool, 0); > > + > > + dma_dev = virtqueue_dma_dev(rq->vq); > > + > > + dma_unmap_single(dma_dev, sq->xsk.hdr_dma_address, vi->hdr_len, DMA_TO_DEVICE); > > + > > + return err1 | err2; > > +} > > + > > +static int virtnet_xsk_pool_setup(struct net_device *dev, struct netdev_bpf *xdp) > > +{ > > + if (xdp->xsk.pool) > > + return virtnet_xsk_pool_enable(dev, xdp->xsk.pool, > > + xdp->xsk.queue_id); > > + else > > + return virtnet_xsk_pool_disable(dev, xdp->xsk.queue_id); > > +} > > + > > static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, > > struct netlink_ext_ack *extack) > > { > > @@ -5302,6 +5499,8 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) > > switch (xdp->command) { > > case XDP_SETUP_PROG: > > return virtnet_xdp_set(dev, xdp->prog, xdp->extack); > > + case XDP_SETUP_XSK_POOL: > > + return virtnet_xsk_pool_setup(dev, xdp); > > default: > > return -EINVAL; > > } > > -- > > 2.32.0.3.g01195cf9f > > > > Thanks >
On Mon, Jun 17, 2024 at 3:49 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote: > > On Mon, 17 Jun 2024 14:19:10 +0800, Jason Wang <jasowang@redhat.com> wrote: > > On Fri, Jun 14, 2024 at 2:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote: > > > > > > This patch implement the logic of bind/unbind xsk pool to sq and rq. > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > > --- > > > drivers/net/virtio_net.c | 201 ++++++++++++++++++++++++++++++++++++++- > > > 1 file changed, 200 insertions(+), 1 deletion(-) > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > index 88ab9ea1646f..35fd8bca7fcf 100644 > > > --- a/drivers/net/virtio_net.c > > > +++ b/drivers/net/virtio_net.c > > > @@ -26,6 +26,7 @@ > > > #include <net/netdev_rx_queue.h> > > > #include <net/netdev_queues.h> > > > #include <uapi/linux/virtio_ring.h> > > > +#include <net/xdp_sock_drv.h> > > > > > > static int napi_weight = NAPI_POLL_WEIGHT; > > > module_param(napi_weight, int, 0444); > > > @@ -57,6 +58,8 @@ DECLARE_EWMA(pkt_len, 0, 64) > > > > > > #define VIRTNET_DRIVER_VERSION "1.0.0" > > > > > > +static struct virtio_net_hdr_mrg_rxbuf xsk_hdr; > > > > Does this mean AF_XDP only supports virtio_net_hdr_mrg_rxbuf but not others? > > Sorry, this is the old code. > > Should be virtio_net_common_hdr. > > Here we should use the max size of the virtio-net header. Better but still suboptimal, for example it could be extended as we're adding more features? 
> > > > > > + > > > static const unsigned long guest_offloads[] = { > > > VIRTIO_NET_F_GUEST_TSO4, > > > VIRTIO_NET_F_GUEST_TSO6, > > > @@ -321,6 +324,12 @@ struct send_queue { > > > bool premapped; > > > > > > struct virtnet_sq_dma_info dmainfo; > > > + > > > + struct { > > > + struct xsk_buff_pool *pool; > > > + > > > + dma_addr_t hdr_dma_address; > > > + } xsk; > > > }; > > > > > > /* Internal representation of a receive virtqueue */ > > > @@ -372,6 +381,13 @@ struct receive_queue { > > > > > > /* Record the last dma info to free after new pages is allocated. */ > > > struct virtnet_rq_dma *last_dma; > > > + > > > + struct { > > > + struct xsk_buff_pool *pool; > > > + > > > + /* xdp rxq used by xsk */ > > > + struct xdp_rxq_info xdp_rxq; > > > + } xsk; > > > }; > > > > > > /* This structure can contain rss message with maximum settings for indirection table and keysize > > > @@ -695,7 +711,7 @@ static void virtnet_sq_free_dma_meta(struct send_queue *sq) > > > /* This function must be called immediately after creating the vq, or after vq > > > * reset, and before adding any buffers to it. 
> > > */ > > > -static __maybe_unused int virtnet_sq_set_premapped(struct send_queue *sq, bool premapped) > > > +static int virtnet_sq_set_premapped(struct send_queue *sq, bool premapped) > > > { > > > if (premapped) { > > > int r; > > > @@ -5177,6 +5193,187 @@ static int virtnet_restore_guest_offloads(struct virtnet_info *vi) > > > return virtnet_set_guest_offloads(vi, offloads); > > > } > > > > > > +static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queue *rq, > > > + struct xsk_buff_pool *pool) > > > +{ > > > + int err, qindex; > > > + > > > + qindex = rq - vi->rq; > > > + > > > + if (pool) { > > > + err = xdp_rxq_info_reg(&rq->xsk.xdp_rxq, vi->dev, qindex, rq->napi.napi_id); > > > + if (err < 0) > > > + return err; > > > + > > > + err = xdp_rxq_info_reg_mem_model(&rq->xsk.xdp_rxq, > > > + MEM_TYPE_XSK_BUFF_POOL, NULL); > > > + if (err < 0) { > > > + xdp_rxq_info_unreg(&rq->xsk.xdp_rxq); > > > + return err; > > > + } > > > + > > > + xsk_pool_set_rxq_info(pool, &rq->xsk.xdp_rxq); > > > + } > > > + > > > + virtnet_rx_pause(vi, rq); > > > + > > > + err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf); > > > + if (err) { > > > + netdev_err(vi->dev, "reset rx fail: rx queue index: %d err: %d\n", qindex, err); > > > + > > > + pool = NULL; > > > + } > > > + > > > + if (!pool) > > > + xdp_rxq_info_unreg(&rq->xsk.xdp_rxq); > > > > Let's use err label instead of duplicating xdp_rxq_info_unreg() here? 
> > > > > + > > > + rq->xsk.pool = pool; > > > + > > > + virtnet_rx_resume(vi, rq); > > > + > > > + return err; > > > +} > > > + > > > +static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi, > > > + struct send_queue *sq, > > > + struct xsk_buff_pool *pool) > > > +{ > > > + int err, qindex; > > > + > > > + qindex = sq - vi->sq; > > > + > > > + virtnet_tx_pause(vi, sq); > > > + > > > + err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf); > > > + if (err) > > > + netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err); > > > + else > > > + err = virtnet_sq_set_premapped(sq, !!pool); > > > + > > > + if (err) > > > + pool = NULL; > > > + > > > + sq->xsk.pool = pool; > > > + > > > + virtnet_tx_resume(vi, sq); > > > + > > > + return err; > > > +} > > > + > > > +static int virtnet_xsk_pool_enable(struct net_device *dev, > > > + struct xsk_buff_pool *pool, > > > + u16 qid) > > > +{ > > > + struct virtnet_info *vi = netdev_priv(dev); > > > + struct receive_queue *rq; > > > + struct send_queue *sq; > > > + struct device *dma_dev; > > > + dma_addr_t hdr_dma; > > > + int err; > > > + > > > + /* In big_packets mode, xdp cannot work, so there is no need to > > > + * initialize xsk of rq. > > > + * > > > + * Support for small mode firstly. > > > > This comment is kind of confusing, I think mergeable mode is also > > supported. If it's true, we can simply remove it. > > For the commit num limit of the net-next, I have to remove some commits. > > So the mergeable mode is not supported by this patch set. > > I plan to support the merge mode after this patch set. Then, I'd suggest to split the patches into two series: 1) AF_XDP TX zerocopy 2) AF_XDP RX zerocopy And implement both small and mergeable in series 2). 
> > > > > > > + */ > > > + if (vi->big_packets) > > > + return -ENOENT; > > > + > > > + if (qid >= vi->curr_queue_pairs) > > > + return -EINVAL; > > > + > > > + sq = &vi->sq[qid]; > > > + rq = &vi->rq[qid]; > > > + > > > + /* xsk tx zerocopy depend on the tx napi. > > > + * > > > + * All xsk packets are actually consumed and sent out from the xsk tx > > > + * queue under the tx napi mechanism. > > > + */ > > > + if (!sq->napi.weight) > > > + return -EPERM; > > > + > > > + /* For the xsk, the tx and rx should have the same device. But > > > + * vq->dma_dev allows every vq has the respective dma dev. So I check > > > + * the dma dev of vq and sq is the same dev. > > > + */ > > > + if (virtqueue_dma_dev(rq->vq) != virtqueue_dma_dev(sq->vq)) > > > + return -EPERM; > > > > I don't understand how a different DMA device matters here. It looks > > like the code is using per virtqueue DMA below. > > The af-xdp may use one buffer to receive from the rx and reuse this buffer to > send by the tx. So the dma dev of sq and rq should be the same one. Right, let's tweak the comment to say something like this. Thanks
On Tue, 18 Jun 2024 09:04:03 +0800, Jason Wang <jasowang@redhat.com> wrote: > On Mon, Jun 17, 2024 at 3:49 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote: > > > > On Mon, 17 Jun 2024 14:19:10 +0800, Jason Wang <jasowang@redhat.com> wrote: > > > On Fri, Jun 14, 2024 at 2:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote: > > > > > > > > This patch implement the logic of bind/unbind xsk pool to sq and rq. > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > > > --- > > > > drivers/net/virtio_net.c | 201 ++++++++++++++++++++++++++++++++++++++- > > > > 1 file changed, 200 insertions(+), 1 deletion(-) > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > index 88ab9ea1646f..35fd8bca7fcf 100644 > > > > --- a/drivers/net/virtio_net.c > > > > +++ b/drivers/net/virtio_net.c > > > > @@ -26,6 +26,7 @@ > > > > #include <net/netdev_rx_queue.h> > > > > #include <net/netdev_queues.h> > > > > #include <uapi/linux/virtio_ring.h> > > > > +#include <net/xdp_sock_drv.h> > > > > > > > > static int napi_weight = NAPI_POLL_WEIGHT; > > > > module_param(napi_weight, int, 0444); > > > > @@ -57,6 +58,8 @@ DECLARE_EWMA(pkt_len, 0, 64) > > > > > > > > #define VIRTNET_DRIVER_VERSION "1.0.0" > > > > > > > > +static struct virtio_net_hdr_mrg_rxbuf xsk_hdr; > > > > > > Does this mean AF_XDP only supports virtio_net_hdr_mrg_rxbuf but not others? > > > > Sorry, this is the old code. > > > > Should be virtio_net_common_hdr. > > > > Here we should use the max size of the virtio-net header. > > Better but still suboptimal, for example it could be extended as we're > adding more features? When we use this, we will work with hdr_len. sg_fill_dma(sq->sg, sq->xsk.hdr_dma_address, vi->hdr_len); So, I think it is ok. 
> > > > > > > > > > + > > > > static const unsigned long guest_offloads[] = { > > > > VIRTIO_NET_F_GUEST_TSO4, > > > > VIRTIO_NET_F_GUEST_TSO6, > > > > @@ -321,6 +324,12 @@ struct send_queue { > > > > bool premapped; > > > > > > > > struct virtnet_sq_dma_info dmainfo; > > > > + > > > > + struct { > > > > + struct xsk_buff_pool *pool; > > > > + > > > > + dma_addr_t hdr_dma_address; > > > > + } xsk; > > > > }; > > > > > > > > /* Internal representation of a receive virtqueue */ > > > > @@ -372,6 +381,13 @@ struct receive_queue { > > > > > > > > /* Record the last dma info to free after new pages is allocated. */ > > > > struct virtnet_rq_dma *last_dma; > > > > + > > > > + struct { > > > > + struct xsk_buff_pool *pool; > > > > + > > > > + /* xdp rxq used by xsk */ > > > > + struct xdp_rxq_info xdp_rxq; > > > > + } xsk; > > > > }; > > > > > > > > /* This structure can contain rss message with maximum settings for indirection table and keysize > > > > @@ -695,7 +711,7 @@ static void virtnet_sq_free_dma_meta(struct send_queue *sq) > > > > /* This function must be called immediately after creating the vq, or after vq > > > > * reset, and before adding any buffers to it. 
> > > > */ > > > > -static __maybe_unused int virtnet_sq_set_premapped(struct send_queue *sq, bool premapped) > > > > +static int virtnet_sq_set_premapped(struct send_queue *sq, bool premapped) > > > > { > > > > if (premapped) { > > > > int r; > > > > @@ -5177,6 +5193,187 @@ static int virtnet_restore_guest_offloads(struct virtnet_info *vi) > > > > return virtnet_set_guest_offloads(vi, offloads); > > > > } > > > > > > > > +static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queue *rq, > > > > + struct xsk_buff_pool *pool) > > > > +{ > > > > + int err, qindex; > > > > + > > > > + qindex = rq - vi->rq; > > > > + > > > > + if (pool) { > > > > + err = xdp_rxq_info_reg(&rq->xsk.xdp_rxq, vi->dev, qindex, rq->napi.napi_id); > > > > + if (err < 0) > > > > + return err; > > > > + > > > > + err = xdp_rxq_info_reg_mem_model(&rq->xsk.xdp_rxq, > > > > + MEM_TYPE_XSK_BUFF_POOL, NULL); > > > > + if (err < 0) { > > > > + xdp_rxq_info_unreg(&rq->xsk.xdp_rxq); > > > > + return err; > > > > + } > > > > + > > > > + xsk_pool_set_rxq_info(pool, &rq->xsk.xdp_rxq); > > > > + } > > > > + > > > > + virtnet_rx_pause(vi, rq); > > > > + > > > > + err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf); > > > > + if (err) { > > > > + netdev_err(vi->dev, "reset rx fail: rx queue index: %d err: %d\n", qindex, err); > > > > + > > > > + pool = NULL; > > > > + } > > > > + > > > > + if (!pool) > > > > + xdp_rxq_info_unreg(&rq->xsk.xdp_rxq); > > > > > > Let's use err label instead of duplicating xdp_rxq_info_unreg() here? 
> > > > > > > + > > > > + rq->xsk.pool = pool; > > > > + > > > > + virtnet_rx_resume(vi, rq); > > > > + > > > > + return err; > > > > +} > > > > + > > > > +static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi, > > > > + struct send_queue *sq, > > > > + struct xsk_buff_pool *pool) > > > > +{ > > > > + int err, qindex; > > > > + > > > > + qindex = sq - vi->sq; > > > > + > > > > + virtnet_tx_pause(vi, sq); > > > > + > > > > + err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf); > > > > + if (err) > > > > + netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err); > > > > + else > > > > + err = virtnet_sq_set_premapped(sq, !!pool); > > > > + > > > > + if (err) > > > > + pool = NULL; > > > > + > > > > + sq->xsk.pool = pool; > > > > + > > > > + virtnet_tx_resume(vi, sq); > > > > + > > > > + return err; > > > > +} > > > > + > > > > +static int virtnet_xsk_pool_enable(struct net_device *dev, > > > > + struct xsk_buff_pool *pool, > > > > + u16 qid) > > > > +{ > > > > + struct virtnet_info *vi = netdev_priv(dev); > > > > + struct receive_queue *rq; > > > > + struct send_queue *sq; > > > > + struct device *dma_dev; > > > > + dma_addr_t hdr_dma; > > > > + int err; > > > > + > > > > + /* In big_packets mode, xdp cannot work, so there is no need to > > > > + * initialize xsk of rq. > > > > + * > > > > + * Support for small mode firstly. > > > > > > This comment is kind of confusing, I think mergeable mode is also > > > supported. If it's true, we can simply remove it. > > > > For the commit num limit of the net-next, I have to remove some commits. > > > > So the mergeable mode is not supported by this patch set. > > > > I plan to support the merge mode after this patch set. > > Then, I'd suggest to split the patches into two series: > > 1) AF_XDP TX zerocopy > 2) AF_XDP RX zerocopy > > And implement both small and mergeable in series 2). I am ok. 
> > > > > > > > > > > > + */ > > > > + if (vi->big_packets) > > > > + return -ENOENT; > > > > + > > > > + if (qid >= vi->curr_queue_pairs) > > > > + return -EINVAL; > > > > + > > > > + sq = &vi->sq[qid]; > > > > + rq = &vi->rq[qid]; > > > > + > > > > + /* xsk tx zerocopy depend on the tx napi. > > > > + * > > > > + * All xsk packets are actually consumed and sent out from the xsk tx > > > > + * queue under the tx napi mechanism. > > > > + */ > > > > + if (!sq->napi.weight) > > > > + return -EPERM; > > > > + > > > > + /* For the xsk, the tx and rx should have the same device. But > > > > + * vq->dma_dev allows every vq has the respective dma dev. So I check > > > > + * the dma dev of vq and sq is the same dev. > > > > + */ > > > > + if (virtqueue_dma_dev(rq->vq) != virtqueue_dma_dev(sq->vq)) > > > > + return -EPERM; > > > > > > I don't understand how a different DMA device matters here. It looks > > > like the code is using per virtqueue DMA below. > > > > The af-xdp may use one buffer to receive from the rx and reuse this buffer to > > send by the tx. So the dma dev of sq and rq should be the same one. > > Right, let's tweak the comment to say something like this. Will fix. Thanks. > > Thanks >
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 88ab9ea1646f..35fd8bca7fcf 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -26,6 +26,7 @@ #include <net/netdev_rx_queue.h> #include <net/netdev_queues.h> #include <uapi/linux/virtio_ring.h> +#include <net/xdp_sock_drv.h> static int napi_weight = NAPI_POLL_WEIGHT; module_param(napi_weight, int, 0444); @@ -57,6 +58,8 @@ DECLARE_EWMA(pkt_len, 0, 64) #define VIRTNET_DRIVER_VERSION "1.0.0" +static struct virtio_net_hdr_mrg_rxbuf xsk_hdr; + static const unsigned long guest_offloads[] = { VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, @@ -321,6 +324,12 @@ struct send_queue { bool premapped; struct virtnet_sq_dma_info dmainfo; + + struct { + struct xsk_buff_pool *pool; + + dma_addr_t hdr_dma_address; + } xsk; }; /* Internal representation of a receive virtqueue */ @@ -372,6 +381,13 @@ struct receive_queue { /* Record the last dma info to free after new pages is allocated. */ struct virtnet_rq_dma *last_dma; + + struct { + struct xsk_buff_pool *pool; + + /* xdp rxq used by xsk */ + struct xdp_rxq_info xdp_rxq; + } xsk; }; /* This structure can contain rss message with maximum settings for indirection table and keysize @@ -695,7 +711,7 @@ static void virtnet_sq_free_dma_meta(struct send_queue *sq) /* This function must be called immediately after creating the vq, or after vq * reset, and before adding any buffers to it. 
*/ -static __maybe_unused int virtnet_sq_set_premapped(struct send_queue *sq, bool premapped) +static int virtnet_sq_set_premapped(struct send_queue *sq, bool premapped) { if (premapped) { int r; @@ -5177,6 +5193,187 @@ static int virtnet_restore_guest_offloads(struct virtnet_info *vi) return virtnet_set_guest_offloads(vi, offloads); } +static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queue *rq, + struct xsk_buff_pool *pool) +{ + int err, qindex; + + qindex = rq - vi->rq; + + if (pool) { + err = xdp_rxq_info_reg(&rq->xsk.xdp_rxq, vi->dev, qindex, rq->napi.napi_id); + if (err < 0) + return err; + + err = xdp_rxq_info_reg_mem_model(&rq->xsk.xdp_rxq, + MEM_TYPE_XSK_BUFF_POOL, NULL); + if (err < 0) { + xdp_rxq_info_unreg(&rq->xsk.xdp_rxq); + return err; + } + + xsk_pool_set_rxq_info(pool, &rq->xsk.xdp_rxq); + } + + virtnet_rx_pause(vi, rq); + + err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf); + if (err) { + netdev_err(vi->dev, "reset rx fail: rx queue index: %d err: %d\n", qindex, err); + + pool = NULL; + } + + if (!pool) + xdp_rxq_info_unreg(&rq->xsk.xdp_rxq); + + rq->xsk.pool = pool; + + virtnet_rx_resume(vi, rq); + + return err; +} + +static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi, + struct send_queue *sq, + struct xsk_buff_pool *pool) +{ + int err, qindex; + + qindex = sq - vi->sq; + + virtnet_tx_pause(vi, sq); + + err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf); + if (err) + netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err); + else + err = virtnet_sq_set_premapped(sq, !!pool); + + if (err) + pool = NULL; + + sq->xsk.pool = pool; + + virtnet_tx_resume(vi, sq); + + return err; +} + +static int virtnet_xsk_pool_enable(struct net_device *dev, + struct xsk_buff_pool *pool, + u16 qid) +{ + struct virtnet_info *vi = netdev_priv(dev); + struct receive_queue *rq; + struct send_queue *sq; + struct device *dma_dev; + dma_addr_t hdr_dma; + int err; + + /* In big_packets mode, xdp cannot 
work, so there is no need to + * initialize xsk of rq. + * + * Support for small mode firstly. + */ + if (vi->big_packets) + return -ENOENT; + + if (qid >= vi->curr_queue_pairs) + return -EINVAL; + + sq = &vi->sq[qid]; + rq = &vi->rq[qid]; + + /* xsk tx zerocopy depend on the tx napi. + * + * All xsk packets are actually consumed and sent out from the xsk tx + * queue under the tx napi mechanism. + */ + if (!sq->napi.weight) + return -EPERM; + + /* For the xsk, the tx and rx should have the same device. But + * vq->dma_dev allows every vq has the respective dma dev. So I check + * the dma dev of vq and sq is the same dev. + */ + if (virtqueue_dma_dev(rq->vq) != virtqueue_dma_dev(sq->vq)) + return -EPERM; + + dma_dev = virtqueue_dma_dev(rq->vq); + if (!dma_dev) + return -EPERM; + + hdr_dma = dma_map_single(dma_dev, &xsk_hdr, vi->hdr_len, DMA_TO_DEVICE); + if (dma_mapping_error(dma_dev, hdr_dma)) + return -ENOMEM; + + err = xsk_pool_dma_map(pool, dma_dev, 0); + if (err) + goto err_xsk_map; + + err = virtnet_rq_bind_xsk_pool(vi, rq, pool); + if (err) + goto err_rq; + + err = virtnet_sq_bind_xsk_pool(vi, sq, pool); + if (err) + goto err_sq; + + /* Now, we do not support tx offset, so all the tx virtnet hdr is zero. + * So all the tx packets can share a single hdr. 
+ */ + sq->xsk.hdr_dma_address = hdr_dma; + + return 0; + +err_sq: + virtnet_rq_bind_xsk_pool(vi, rq, NULL); +err_rq: + xsk_pool_dma_unmap(pool, 0); +err_xsk_map: + dma_unmap_single(dma_dev, hdr_dma, vi->hdr_len, DMA_TO_DEVICE); + return err; +} + +static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid) +{ + struct virtnet_info *vi = netdev_priv(dev); + struct xsk_buff_pool *pool; + struct device *dma_dev; + struct receive_queue *rq; + struct send_queue *sq; + int err1, err2; + + if (qid >= vi->curr_queue_pairs) + return -EINVAL; + + sq = &vi->sq[qid]; + rq = &vi->rq[qid]; + + pool = sq->xsk.pool; + + err1 = virtnet_sq_bind_xsk_pool(vi, sq, NULL); + err2 = virtnet_rq_bind_xsk_pool(vi, rq, NULL); + + xsk_pool_dma_unmap(pool, 0); + + dma_dev = virtqueue_dma_dev(rq->vq); + + dma_unmap_single(dma_dev, sq->xsk.hdr_dma_address, vi->hdr_len, DMA_TO_DEVICE); + + return err1 | err2; +} + +static int virtnet_xsk_pool_setup(struct net_device *dev, struct netdev_bpf *xdp) +{ + if (xdp->xsk.pool) + return virtnet_xsk_pool_enable(dev, xdp->xsk.pool, + xdp->xsk.queue_id); + else + return virtnet_xsk_pool_disable(dev, xdp->xsk.queue_id); +} + static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { @@ -5302,6 +5499,8 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return virtnet_xdp_set(dev, xdp->prog, xdp->extack); + case XDP_SETUP_XSK_POOL: + return virtnet_xsk_pool_setup(dev, xdp); default: return -EINVAL; }
This patch implements the logic of binding/unbinding an xsk pool to the sq and rq. Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> --- drivers/net/virtio_net.c | 201 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 200 insertions(+), 1 deletion(-)