Message ID | 20240705073734.93905-10-xuanzhuo@linux.alibaba.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Series | virtio-net: support AF_XDP zero copy |
On Fri, Jul 5, 2024 at 3:38 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote: > > In the process: > 1. We may need to copy data to create skb for XDP_PASS. > 2. We may need to call xsk_buff_free() to release the buffer. > 3. The handle for xdp_buff is difference from the buffer. > > If we pushed this logic into existing receive handle(merge and small), > we would have to maintain code scattered inside merge and small (and big). > So I think it is a good choice for us to put the xsk code into an > independent function. > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > --- > > v7: > 1. rename xdp_construct_skb to xsk_construct_skb > 2. refactor virtnet_receive() > > drivers/net/virtio_net.c | 176 +++++++++++++++++++++++++++++++++++++-- > 1 file changed, 168 insertions(+), 8 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index 2b27f5ada64a..64d8cd481890 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -498,6 +498,12 @@ struct virtio_net_common_hdr { > }; > > static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf); > +static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp, > + struct net_device *dev, > + unsigned int *xdp_xmit, > + struct virtnet_rq_stats *stats); > +static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, > + struct sk_buff *skb, u8 flags); > > static bool is_xdp_frame(void *ptr) > { > @@ -1062,6 +1068,124 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len) > sg->length = len; > } > > +static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi, > + struct receive_queue *rq, void *buf, u32 len) > +{ > + struct xdp_buff *xdp; > + u32 bufsize; > + > + xdp = (struct xdp_buff *)buf; > + > + bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool) + vi->hdr_len; > + > + if (unlikely(len > bufsize)) { > + pr_debug("%s: rx error: len %u exceeds truesize %u\n", > + vi->dev->name, len, bufsize); > + DEV_STATS_INC(vi->dev, rx_length_errors); > + xsk_buff_free(xdp); > + return NULL; > + } > + > + xsk_buff_set_size(xdp, len); > + xsk_buff_dma_sync_for_cpu(xdp); > + > + return xdp; > +} > + > +static struct sk_buff *xsk_construct_skb(struct receive_queue *rq, > + struct xdp_buff *xdp) > +{ > + unsigned int metasize = xdp->data - xdp->data_meta; > + struct sk_buff *skb; > + unsigned int size; > + > + size = xdp->data_end - xdp->data_hard_start; > + skb = napi_alloc_skb(&rq->napi, size); > + if (unlikely(!skb)) { > + xsk_buff_free(xdp); > + return NULL; > + } > + > + skb_reserve(skb, xdp->data_meta - xdp->data_hard_start); > + > + size = xdp->data_end - xdp->data_meta; > + memcpy(__skb_put(skb, size), xdp->data_meta, size); > + > + if (metasize) { > + __skb_pull(skb, metasize); > + skb_metadata_set(skb, metasize); > + } > + > + xsk_buff_free(xdp); > + > + return skb; > +} > + > +static struct sk_buff *virtnet_receive_xsk_small(struct net_device *dev, struct virtnet_info *vi, > + struct receive_queue *rq, struct xdp_buff *xdp, > + unsigned int *xdp_xmit, > + struct virtnet_rq_stats *stats) > +{ > + struct bpf_prog *prog; > + u32 ret; > + > + ret = XDP_PASS; > + rcu_read_lock(); > + prog = rcu_dereference(rq->xdp_prog); > + if (prog) > + ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats); > + rcu_read_unlock(); > + > + switch (ret) { > + case XDP_PASS: > + return xsk_construct_skb(rq, xdp); > + > + case XDP_TX: > + case XDP_REDIRECT: > + return NULL; > + > + default: > + /* drop packet */ > + xsk_buff_free(xdp); > + 
u64_stats_inc(&stats->drops); > + return NULL; > + } > +} > + > +static void virtnet_receive_xsk_buf(struct virtnet_info *vi, struct receive_queue *rq, > + void *buf, u32 len, > + unsigned int *xdp_xmit, > + struct virtnet_rq_stats *stats) > +{ > + struct net_device *dev = vi->dev; > + struct sk_buff *skb = NULL; > + struct xdp_buff *xdp; > + u8 flags; > + > + len -= vi->hdr_len; > + > + u64_stats_add(&stats->bytes, len); > + > + xdp = buf_to_xdp(vi, rq, buf, len); > + if (!xdp) > + return; > + > + if (unlikely(len < ETH_HLEN)) { > + pr_debug("%s: short packet %i\n", dev->name, len); > + DEV_STATS_INC(dev, rx_length_errors); > + xsk_buff_free(xdp); > + return; > + } > + > + flags = ((struct virtio_net_common_hdr *)(xdp->data - vi->hdr_len))->hdr.flags; > + > + if (!vi->mergeable_rx_bufs) > + skb = virtnet_receive_xsk_small(dev, vi, rq, xdp, xdp_xmit, stats); I wonder if we add the mergeable support in the next patch would it be better to re-order the patch? For example, the xsk binding needs to be moved to the last patch, otherwise we break xsk with a mergeable buffer here? Or anything I missed here? Thanks
On Mon, 8 Jul 2024 15:00:50 +0800, Jason Wang <jasowang@redhat.com> wrote: > On Fri, Jul 5, 2024 at 3:38 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote: > > > > In the process: > > 1. We may need to copy data to create skb for XDP_PASS. > > 2. We may need to call xsk_buff_free() to release the buffer. > > 3. The handle for xdp_buff is difference from the buffer. > > > > If we pushed this logic into existing receive handle(merge and small), > > we would have to maintain code scattered inside merge and small (and big). > > So I think it is a good choice for us to put the xsk code into an > > independent function. > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > --- > > > > v7: > > 1. rename xdp_construct_skb to xsk_construct_skb > > 2. refactor virtnet_receive() > > > > drivers/net/virtio_net.c | 176 +++++++++++++++++++++++++++++++++++++-- > > 1 file changed, 168 insertions(+), 8 deletions(-) > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > index 2b27f5ada64a..64d8cd481890 100644 > > --- a/drivers/net/virtio_net.c > > +++ b/drivers/net/virtio_net.c > > @@ -498,6 +498,12 @@ struct virtio_net_common_hdr { > > }; > > > > static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf); > > +static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp, > > + struct net_device *dev, > > + unsigned int *xdp_xmit, > > + struct virtnet_rq_stats *stats); > > +static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, > > + struct sk_buff *skb, u8 flags); > > > > static bool is_xdp_frame(void *ptr) > > { > > @@ -1062,6 +1068,124 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len) > > sg->length = len; > > } > > > > +static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi, > > + struct receive_queue *rq, void *buf, u32 len) > > +{ > > + struct xdp_buff *xdp; > > + u32 bufsize; > > + > > + xdp = (struct xdp_buff *)buf; > > + > > + bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool) + vi->hdr_len; > > + > > + if (unlikely(len > bufsize)) { > > + pr_debug("%s: rx error: len %u exceeds truesize %u\n", > > + vi->dev->name, len, bufsize); > > + DEV_STATS_INC(vi->dev, rx_length_errors); > > + xsk_buff_free(xdp); > > + return NULL; > > + } > > + > > + xsk_buff_set_size(xdp, len); > > + xsk_buff_dma_sync_for_cpu(xdp); > > + > > + return xdp; > > +} > > + > > +static struct sk_buff *xsk_construct_skb(struct receive_queue *rq, > > + struct xdp_buff *xdp) > > +{ > > + unsigned int metasize = xdp->data - xdp->data_meta; > > + struct sk_buff *skb; > > + unsigned int size; > > + > > + size = xdp->data_end - xdp->data_hard_start; > > + skb = napi_alloc_skb(&rq->napi, size); > > + if (unlikely(!skb)) { > > + xsk_buff_free(xdp); > > + return NULL; > > + } > > + > > + skb_reserve(skb, xdp->data_meta - xdp->data_hard_start); > > + > > + size = xdp->data_end - xdp->data_meta; > > + memcpy(__skb_put(skb, size), xdp->data_meta, size); > > + > > + if (metasize) { > > + __skb_pull(skb, metasize); > > + skb_metadata_set(skb, metasize); > > + } > > + > > + xsk_buff_free(xdp); > > + > > + return skb; > > +} > > + > > +static struct sk_buff *virtnet_receive_xsk_small(struct net_device *dev, struct virtnet_info *vi, > > + struct receive_queue *rq, struct xdp_buff *xdp, > > + unsigned int *xdp_xmit, > > + struct virtnet_rq_stats *stats) > > +{ > > + struct bpf_prog *prog; > > + u32 ret; > > + > > + ret = XDP_PASS; > > + rcu_read_lock(); > > + prog = rcu_dereference(rq->xdp_prog); > > + if (prog) > > + ret = 
virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats); > > + rcu_read_unlock(); > > + > > + switch (ret) { > > + case XDP_PASS: > > + return xsk_construct_skb(rq, xdp); > > + > > + case XDP_TX: > > + case XDP_REDIRECT: > > + return NULL; > > + > > + default: > > + /* drop packet */ > > + xsk_buff_free(xdp); > > + u64_stats_inc(&stats->drops); > > + return NULL; > > + } > > +} > > + > > +static void virtnet_receive_xsk_buf(struct virtnet_info *vi, struct receive_queue *rq, > > + void *buf, u32 len, > > + unsigned int *xdp_xmit, > > + struct virtnet_rq_stats *stats) > > +{ > > + struct net_device *dev = vi->dev; > > + struct sk_buff *skb = NULL; > > + struct xdp_buff *xdp; > > + u8 flags; > > + > > + len -= vi->hdr_len; > > + > > + u64_stats_add(&stats->bytes, len); > > + > > + xdp = buf_to_xdp(vi, rq, buf, len); > > + if (!xdp) > > + return; > > + > > + if (unlikely(len < ETH_HLEN)) { > > + pr_debug("%s: short packet %i\n", dev->name, len); > > + DEV_STATS_INC(dev, rx_length_errors); > > + xsk_buff_free(xdp); > > + return; > > + } > > + > > + flags = ((struct virtio_net_common_hdr *)(xdp->data - vi->hdr_len))->hdr.flags; > > + > > + if (!vi->mergeable_rx_bufs) > > + skb = virtnet_receive_xsk_small(dev, vi, rq, xdp, xdp_xmit, stats); > > I wonder if we add the mergeable support in the next patch would it be > better to re-order the patch? For example, the xsk binding needs to be > moved to the last patch, otherwise we break xsk with a mergeable > buffer here? If you worry that the user works with this commit, I want to say you do not worry. Because the flags NETDEV_XDP_ACT_XSK_ZEROCOPY is not added. I plan to add that after the tx is completed. I do test by adding this flags locally. Thanks. > > Or anything I missed here? > > Thanks >
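For reference, advertising AF_XDP zero-copy to the XDP core is done through the netdev XDP feature flags. Below is a minimal sketch of the change Xuan describes as planned for after TX support lands; the helper name and its placement (e.g. called from virtnet_probe()) are assumptions for illustration, not part of this patch:

```c
/* Sketch only: this series intentionally does NOT advertise
 * NETDEV_XDP_ACT_XSK_ZEROCOPY yet; the flag is planned once AF_XDP TX
 * support is complete. This shows where such a change would typically
 * live (e.g. called from virtnet_probe()).
 */
#include <net/xdp.h>

static void virtnet_set_xdp_features(struct net_device *dev)
{
	xdp_set_features_flag(dev, NETDEV_XDP_ACT_BASIC |
				   NETDEV_XDP_ACT_REDIRECT |
				   NETDEV_XDP_ACT_NDO_XMIT |
				   NETDEV_XDP_ACT_XSK_ZEROCOPY);
}
```

Until a flag like this is set, the XDP core rejects zero-copy binds, which is why the intermediate patches cannot be hit by users even though the RX path already exists.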
On Mon, Jul 8, 2024 at 3:47 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote: > > On Mon, 8 Jul 2024 15:00:50 +0800, Jason Wang <jasowang@redhat.com> wrote: > > On Fri, Jul 5, 2024 at 3:38 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote: > > > > > > In the process: > > > 1. We may need to copy data to create skb for XDP_PASS. > > > 2. We may need to call xsk_buff_free() to release the buffer. > > > 3. The handle for xdp_buff is difference from the buffer. > > > > > > If we pushed this logic into existing receive handle(merge and small), > > > we would have to maintain code scattered inside merge and small (and big). > > > So I think it is a good choice for us to put the xsk code into an > > > independent function. > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > > --- > > > > > > v7: > > > 1. rename xdp_construct_skb to xsk_construct_skb > > > 2. refactor virtnet_receive() > > > > > > drivers/net/virtio_net.c | 176 +++++++++++++++++++++++++++++++++++++-- > > > 1 file changed, 168 insertions(+), 8 deletions(-) > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > index 2b27f5ada64a..64d8cd481890 100644 > > > --- a/drivers/net/virtio_net.c > > > +++ b/drivers/net/virtio_net.c > > > @@ -498,6 +498,12 @@ struct virtio_net_common_hdr { > > > }; > > > > > > static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf); > > > +static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp, > > > + struct net_device *dev, > > > + unsigned int *xdp_xmit, > > > + struct virtnet_rq_stats *stats); > > > +static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, > > > + struct sk_buff *skb, u8 flags); > > > > > > static bool is_xdp_frame(void *ptr) > > > { > > > @@ -1062,6 +1068,124 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len) > > > sg->length = len; > > > } > > > > > > +static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi, > > > + struct receive_queue *rq, void *buf, u32 len) > > > +{ > > > + struct xdp_buff *xdp; > > > + u32 bufsize; > > > + > > > + xdp = (struct xdp_buff *)buf; > > > + > > > + bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool) + vi->hdr_len; > > > + > > > + if (unlikely(len > bufsize)) { > > > + pr_debug("%s: rx error: len %u exceeds truesize %u\n", > > > + vi->dev->name, len, bufsize); > > > + DEV_STATS_INC(vi->dev, rx_length_errors); > > > + xsk_buff_free(xdp); > > > + return NULL; > > > + } > > > + > > > + xsk_buff_set_size(xdp, len); > > > + xsk_buff_dma_sync_for_cpu(xdp); > > > + > > > + return xdp; > > > +} > > > + > > > +static struct sk_buff *xsk_construct_skb(struct receive_queue *rq, > > > + struct xdp_buff *xdp) > > > +{ > > > + unsigned int metasize = xdp->data - xdp->data_meta; > > > + struct sk_buff *skb; > > > + unsigned int size; > > > + > > > + size = xdp->data_end - xdp->data_hard_start; > > > + skb = napi_alloc_skb(&rq->napi, size); > > > + if (unlikely(!skb)) { > > > + xsk_buff_free(xdp); > > > + return NULL; > > > + } > > > + > > > + skb_reserve(skb, xdp->data_meta - xdp->data_hard_start); > > > + > > > + size = xdp->data_end - xdp->data_meta; > > > + memcpy(__skb_put(skb, size), xdp->data_meta, size); > > > + > > > + if (metasize) { > > > + __skb_pull(skb, metasize); > > > + skb_metadata_set(skb, metasize); > > > + } > > > + > > > + xsk_buff_free(xdp); > > > + > > > + return skb; > > > +} > > > + > > > +static struct sk_buff *virtnet_receive_xsk_small(struct net_device *dev, struct virtnet_info *vi, > > > + struct 
receive_queue *rq, struct xdp_buff *xdp, > > > + unsigned int *xdp_xmit, > > > + struct virtnet_rq_stats *stats) > > > +{ > > > + struct bpf_prog *prog; > > > + u32 ret; > > > + > > > + ret = XDP_PASS; > > > + rcu_read_lock(); > > > + prog = rcu_dereference(rq->xdp_prog); > > > + if (prog) > > > + ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats); > > > + rcu_read_unlock(); > > > + > > > + switch (ret) { > > > + case XDP_PASS: > > > + return xsk_construct_skb(rq, xdp); > > > + > > > + case XDP_TX: > > > + case XDP_REDIRECT: > > > + return NULL; > > > + > > > + default: > > > + /* drop packet */ > > > + xsk_buff_free(xdp); > > > + u64_stats_inc(&stats->drops); > > > + return NULL; > > > + } > > > +} > > > + > > > +static void virtnet_receive_xsk_buf(struct virtnet_info *vi, struct receive_queue *rq, > > > + void *buf, u32 len, > > > + unsigned int *xdp_xmit, > > > + struct virtnet_rq_stats *stats) > > > +{ > > > + struct net_device *dev = vi->dev; > > > + struct sk_buff *skb = NULL; > > > + struct xdp_buff *xdp; > > > + u8 flags; > > > + > > > + len -= vi->hdr_len; > > > + > > > + u64_stats_add(&stats->bytes, len); > > > + > > > + xdp = buf_to_xdp(vi, rq, buf, len); > > > + if (!xdp) > > > + return; > > > + > > > + if (unlikely(len < ETH_HLEN)) { > > > + pr_debug("%s: short packet %i\n", dev->name, len); > > > + DEV_STATS_INC(dev, rx_length_errors); > > > + xsk_buff_free(xdp); > > > + return; > > > + } > > > + > > > + flags = ((struct virtio_net_common_hdr *)(xdp->data - vi->hdr_len))->hdr.flags; > > > + > > > + if (!vi->mergeable_rx_bufs) > > > + skb = virtnet_receive_xsk_small(dev, vi, rq, xdp, xdp_xmit, stats); > > > > I wonder if we add the mergeable support in the next patch would it be > > better to re-order the patch? For example, the xsk binding needs to be > > moved to the last patch, otherwise we break xsk with a mergeable > > buffer here? > > If you worry that the user works with this commit, I want to say you do not > worry. > > Because the flags NETDEV_XDP_ACT_XSK_ZEROCOPY is not added. I plan to add that > after the tx is completed. Ok, this is something I missed, it would be better to mention it somewhere (or it is already there but I miss it). > > I do test by adding this flags locally. > > Thanks. Acked-by: Jason Wang <jasowang@redhat.com> Thanks > > > > > Or anything I missed here? > > > > Thanks > > >
On Mon, 8 Jul 2024 16:08:44 +0800, Jason Wang <jasowang@redhat.com> wrote: > On Mon, Jul 8, 2024 at 3:47 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote: > > > > On Mon, 8 Jul 2024 15:00:50 +0800, Jason Wang <jasowang@redhat.com> wrote: > > > On Fri, Jul 5, 2024 at 3:38 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote: > > > > > > > > In the process: > > > > 1. We may need to copy data to create skb for XDP_PASS. > > > > 2. We may need to call xsk_buff_free() to release the buffer. > > > > 3. The handle for xdp_buff is difference from the buffer. > > > > > > > > If we pushed this logic into existing receive handle(merge and small), > > > > we would have to maintain code scattered inside merge and small (and big). > > > > So I think it is a good choice for us to put the xsk code into an > > > > independent function. > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com> > > > > --- > > > > > > > > v7: > > > > 1. rename xdp_construct_skb to xsk_construct_skb > > > > 2. refactor virtnet_receive() > > > > > > > > drivers/net/virtio_net.c | 176 +++++++++++++++++++++++++++++++++++++-- > > > > 1 file changed, 168 insertions(+), 8 deletions(-) > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > index 2b27f5ada64a..64d8cd481890 100644 > > > > --- a/drivers/net/virtio_net.c > > > > +++ b/drivers/net/virtio_net.c > > > > @@ -498,6 +498,12 @@ struct virtio_net_common_hdr { > > > > }; > > > > > > > > static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf); > > > > +static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp, > > > > + struct net_device *dev, > > > > + unsigned int *xdp_xmit, > > > > + struct virtnet_rq_stats *stats); > > > > +static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, > > > > + struct sk_buff *skb, u8 flags); > > > > > > > > static bool is_xdp_frame(void *ptr) > > > > { > > > > @@ -1062,6 +1068,124 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len) > > > > sg->length = len; > > > > } > > > > > > > > +static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi, > > > > + struct receive_queue *rq, void *buf, u32 len) > > > > +{ > > > > + struct xdp_buff *xdp; > > > > + u32 bufsize; > > > > + > > > > + xdp = (struct xdp_buff *)buf; > > > > + > > > > + bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool) + vi->hdr_len; > > > > + > > > > + if (unlikely(len > bufsize)) { > > > > + pr_debug("%s: rx error: len %u exceeds truesize %u\n", > > > > + vi->dev->name, len, bufsize); > > > > + DEV_STATS_INC(vi->dev, rx_length_errors); > > > > + xsk_buff_free(xdp); > > > > + return NULL; > > > > + } > > > > + > > > > + xsk_buff_set_size(xdp, len); > > > > + xsk_buff_dma_sync_for_cpu(xdp); > > > > + > > > > + return xdp; > > > > +} > > > > + > > > > +static struct sk_buff *xsk_construct_skb(struct receive_queue *rq, > > > > + struct xdp_buff *xdp) > > > > +{ > > > > + unsigned int metasize = xdp->data - xdp->data_meta; > > > > + struct sk_buff *skb; > > > > + unsigned int size; > > > > + > > > > + size = xdp->data_end - xdp->data_hard_start; > > > > + skb = napi_alloc_skb(&rq->napi, size); > > > > + if (unlikely(!skb)) { > > > > + xsk_buff_free(xdp); > > > > + return NULL; > > > > + } > > > > + > > > > + skb_reserve(skb, xdp->data_meta - xdp->data_hard_start); > > > > + > > > > + size = xdp->data_end - xdp->data_meta; > > > > + memcpy(__skb_put(skb, size), xdp->data_meta, size); > > > > + > > > > + if (metasize) { > > > > + __skb_pull(skb, metasize); > > > 
> + skb_metadata_set(skb, metasize); > > > > + } > > > > + > > > > + xsk_buff_free(xdp); > > > > + > > > > + return skb; > > > > +} > > > > + > > > > +static struct sk_buff *virtnet_receive_xsk_small(struct net_device *dev, struct virtnet_info *vi, > > > > + struct receive_queue *rq, struct xdp_buff *xdp, > > > > + unsigned int *xdp_xmit, > > > > + struct virtnet_rq_stats *stats) > > > > +{ > > > > + struct bpf_prog *prog; > > > > + u32 ret; > > > > + > > > > + ret = XDP_PASS; > > > > + rcu_read_lock(); > > > > + prog = rcu_dereference(rq->xdp_prog); > > > > + if (prog) > > > > + ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats); > > > > + rcu_read_unlock(); > > > > + > > > > + switch (ret) { > > > > + case XDP_PASS: > > > > + return xsk_construct_skb(rq, xdp); > > > > + > > > > + case XDP_TX: > > > > + case XDP_REDIRECT: > > > > + return NULL; > > > > + > > > > + default: > > > > + /* drop packet */ > > > > + xsk_buff_free(xdp); > > > > + u64_stats_inc(&stats->drops); > > > > + return NULL; > > > > + } > > > > +} > > > > + > > > > +static void virtnet_receive_xsk_buf(struct virtnet_info *vi, struct receive_queue *rq, > > > > + void *buf, u32 len, > > > > + unsigned int *xdp_xmit, > > > > + struct virtnet_rq_stats *stats) > > > > +{ > > > > + struct net_device *dev = vi->dev; > > > > + struct sk_buff *skb = NULL; > > > > + struct xdp_buff *xdp; > > > > + u8 flags; > > > > + > > > > + len -= vi->hdr_len; > > > > + > > > > + u64_stats_add(&stats->bytes, len); > > > > + > > > > + xdp = buf_to_xdp(vi, rq, buf, len); > > > > + if (!xdp) > > > > + return; > > > > + > > > > + if (unlikely(len < ETH_HLEN)) { > > > > + pr_debug("%s: short packet %i\n", dev->name, len); > > > > + DEV_STATS_INC(dev, rx_length_errors); > > > > + xsk_buff_free(xdp); > > > > + return; > > > > + } > > > > + > > > > + flags = ((struct virtio_net_common_hdr *)(xdp->data - vi->hdr_len))->hdr.flags; > > > > + > > > > + if (!vi->mergeable_rx_bufs) > > > > + skb = virtnet_receive_xsk_small(dev, vi, rq, xdp, xdp_xmit, stats); > > > > > > I wonder if we add the mergeable support in the next patch would it be > > > better to re-order the patch? For example, the xsk binding needs to be > > > moved to the last patch, otherwise we break xsk with a mergeable > > > buffer here? > > > > If you worry that the user works with this commit, I want to say you do not > > worry. > > > > Because the flags NETDEV_XDP_ACT_XSK_ZEROCOPY is not added. I plan to add that > > after the tx is completed. > > Ok, this is something I missed, it would be better to mention it > somewhere (or it is already there but I miss it). OK. I will add it to next version cover. Thanks. > > > > > I do test by adding this flags locally. > > > > Thanks. > > Acked-by: Jason Wang <jasowang@redhat.com> > > Thanks > > > > > > > > > Or anything I missed here? > > > > > > Thanks > > > > > >
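As a usage note on the local testing mentioned above: exercising this RX path end to end means binding an AF_XDP socket in zero-copy mode from userspace. A rough sketch using libxdp follows; the interface name "eth0" and queue id 0 are placeholder values:

```c
/* Rough userspace sketch (libxdp). Requesting XDP_ZEROCOPY explicitly makes
 * the bind fail (rather than silently falling back to copy mode) when the
 * driver does not advertise NETDEV_XDP_ACT_XSK_ZEROCOPY.
 */
#include <xdp/xsk.h>
#include <linux/if_xdp.h>

static int bind_zerocopy_xsk(struct xsk_umem *umem, struct xsk_ring_cons *rx,
			     struct xsk_ring_prod *tx, struct xsk_socket **xsk)
{
	const struct xsk_socket_config cfg = {
		.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
		.bind_flags = XDP_ZEROCOPY,	/* no copy-mode fallback */
	};

	return xsk_socket__create(xsk, "eth0", 0, umem, rx, tx, &cfg);
}
```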
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 2b27f5ada64a..64d8cd481890 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -498,6 +498,12 @@ struct virtio_net_common_hdr { }; static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf); +static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp, + struct net_device *dev, + unsigned int *xdp_xmit, + struct virtnet_rq_stats *stats); +static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, + struct sk_buff *skb, u8 flags); static bool is_xdp_frame(void *ptr) { @@ -1062,6 +1068,124 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len) sg->length = len; } +static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi, + struct receive_queue *rq, void *buf, u32 len) +{ + struct xdp_buff *xdp; + u32 bufsize; + + xdp = (struct xdp_buff *)buf; + + bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool) + vi->hdr_len; + + if (unlikely(len > bufsize)) { + pr_debug("%s: rx error: len %u exceeds truesize %u\n", + vi->dev->name, len, bufsize); + DEV_STATS_INC(vi->dev, rx_length_errors); + xsk_buff_free(xdp); + return NULL; + } + + xsk_buff_set_size(xdp, len); + xsk_buff_dma_sync_for_cpu(xdp); + + return xdp; +} + +static struct sk_buff *xsk_construct_skb(struct receive_queue *rq, + struct xdp_buff *xdp) +{ + unsigned int metasize = xdp->data - xdp->data_meta; + struct sk_buff *skb; + unsigned int size; + + size = xdp->data_end - xdp->data_hard_start; + skb = napi_alloc_skb(&rq->napi, size); + if (unlikely(!skb)) { + xsk_buff_free(xdp); + return NULL; + } + + skb_reserve(skb, xdp->data_meta - xdp->data_hard_start); + + size = xdp->data_end - xdp->data_meta; + memcpy(__skb_put(skb, size), xdp->data_meta, size); + + if (metasize) { + __skb_pull(skb, metasize); + skb_metadata_set(skb, metasize); + } + + xsk_buff_free(xdp); + + return skb; +} + +static struct sk_buff *virtnet_receive_xsk_small(struct net_device *dev, struct virtnet_info *vi, + struct receive_queue *rq, struct xdp_buff *xdp, + unsigned int *xdp_xmit, + struct virtnet_rq_stats *stats) +{ + struct bpf_prog *prog; + u32 ret; + + ret = XDP_PASS; + rcu_read_lock(); + prog = rcu_dereference(rq->xdp_prog); + if (prog) + ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats); + rcu_read_unlock(); + + switch (ret) { + case XDP_PASS: + return xsk_construct_skb(rq, xdp); + + case XDP_TX: + case XDP_REDIRECT: + return NULL; + + default: + /* drop packet */ + xsk_buff_free(xdp); + u64_stats_inc(&stats->drops); + return NULL; + } +} + +static void virtnet_receive_xsk_buf(struct virtnet_info *vi, struct receive_queue *rq, + void *buf, u32 len, + unsigned int *xdp_xmit, + struct virtnet_rq_stats *stats) +{ + struct net_device *dev = vi->dev; + struct sk_buff *skb = NULL; + struct xdp_buff *xdp; + u8 flags; + + len -= vi->hdr_len; + + u64_stats_add(&stats->bytes, len); + + xdp = buf_to_xdp(vi, rq, buf, len); + if (!xdp) + return; + + if (unlikely(len < ETH_HLEN)) { + pr_debug("%s: short packet %i\n", dev->name, len); + DEV_STATS_INC(dev, rx_length_errors); + xsk_buff_free(xdp); + return; + } + + flags = ((struct virtio_net_common_hdr *)(xdp->data - vi->hdr_len))->hdr.flags; + + if (!vi->mergeable_rx_bufs) + skb = virtnet_receive_xsk_small(dev, vi, rq, xdp, xdp_xmit, stats); + + if (skb) + virtnet_receive_done(vi, rq, skb, flags); +} + static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue *rq, struct xsk_buff_pool *pool, gfp_t gfp) { @@ -2392,32 +2516,68 @@ 
static void refill_work(struct work_struct *work) } } -static int virtnet_receive(struct receive_queue *rq, int budget, - unsigned int *xdp_xmit) +static int virtnet_receive_xsk_bufs(struct virtnet_info *vi, + struct receive_queue *rq, + int budget, + unsigned int *xdp_xmit, + struct virtnet_rq_stats *stats) +{ + unsigned int len; + int packets = 0; + void *buf; + + while (packets < budget) { + buf = virtqueue_get_buf(rq->vq, &len); + if (!buf) + break; + + virtnet_receive_xsk_buf(vi, rq, buf, len, xdp_xmit, stats); + packets++; + } + + return packets; +} + +static int virtnet_receive_packets(struct virtnet_info *vi, + struct receive_queue *rq, + int budget, + unsigned int *xdp_xmit, + struct virtnet_rq_stats *stats) { - struct virtnet_info *vi = rq->vq->vdev->priv; - struct virtnet_rq_stats stats = {}; unsigned int len; int packets = 0; void *buf; - int i; if (!vi->big_packets || vi->mergeable_rx_bufs) { void *ctx; - while (packets < budget && (buf = virtnet_rq_get_buf(rq, &len, &ctx))) { - receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats); + receive_buf(vi, rq, buf, len, ctx, xdp_xmit, stats); packets++; } } else { while (packets < budget && (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) { - receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats); + receive_buf(vi, rq, buf, len, NULL, xdp_xmit, stats); packets++; } } + return packets; +} + +static int virtnet_receive(struct receive_queue *rq, int budget, + unsigned int *xdp_xmit) +{ + struct virtnet_info *vi = rq->vq->vdev->priv; + struct virtnet_rq_stats stats = {}; + int i, packets; + + if (rq->xsk_pool) + packets = virtnet_receive_xsk_bufs(vi, rq, budget, xdp_xmit, &stats); + else + packets = virtnet_receive_packets(vi, rq, budget, xdp_xmit, &stats); + if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) { if (!try_fill_recv(vi, rq, GFP_ATOMIC)) { spin_lock(&vi->refill_lock);
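For readers joining the series at this patch: rq->xsk_pool, which the refactored virtnet_receive() dispatches on, is populated by the pool-binding patch earlier in the series through the standard ndo_bpf hook. A simplified sketch of that mechanism is shown below; the enable/disable helper names are illustrative stand-ins for the ones the earlier patch introduces:

```c
/* Simplified sketch: how rq->xsk_pool gets set. XDP_SETUP_XSK_POOL is the
 * standard netdev_bpf command used for AF_XDP pool binding; the two helpers
 * below are illustrative names for the enable/disable functions added by an
 * earlier patch in this series.
 */
#include <linux/netdevice.h>

static int virtnet_xdp_setup_pool(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_XSK_POOL:
		if (xdp->xsk.pool)
			return virtnet_xsk_pool_enable(dev, xdp->xsk.pool,
						       xdp->xsk.queue_id);
		return virtnet_xsk_pool_disable(dev, xdp->xsk.queue_id);
	default:
		return -EINVAL;
	}
}
```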
In the process:

1. We may need to copy data to create an skb for XDP_PASS.
2. We may need to call xsk_buff_free() to release the buffer.
3. The handle for the xdp_buff is different from the buffer.

If we pushed this logic into the existing receive handlers (merge and small), we would have to maintain code scattered across merge and small (and big). So I think it is a good choice for us to put the xsk code into an independent function.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
v7:
1. rename xdp_construct_skb to xsk_construct_skb
2. refactor virtnet_receive()

 drivers/net/virtio_net.c | 176 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 168 insertions(+), 8 deletions(-)

--
2.32.0.3.g01195cf9f