From patchwork Wed Oct 20 08:55:05 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Krishna Kumar X-Patchwork-Id: 268141 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter1.kernel.org (8.14.4/8.14.3) with ESMTP id o9K8uBnr010264 for ; Wed, 20 Oct 2010 08:56:12 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751334Ab0JTIzQ (ORCPT ); Wed, 20 Oct 2010 04:55:16 -0400 Received: from e23smtp01.au.ibm.com ([202.81.31.143]:42750 "EHLO e23smtp01.au.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750990Ab0JTIzN (ORCPT ); Wed, 20 Oct 2010 04:55:13 -0400 Received: from d23relay05.au.ibm.com (d23relay05.au.ibm.com [202.81.31.247]) by e23smtp01.au.ibm.com (8.14.4/8.13.1) with ESMTP id o9K8q4NX025140; Wed, 20 Oct 2010 19:52:04 +1100 Received: from d23av04.au.ibm.com (d23av04.au.ibm.com [9.190.235.139]) by d23relay05.au.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id o9K8tBx81851524; Wed, 20 Oct 2010 19:55:11 +1100 Received: from d23av04.au.ibm.com (loopback [127.0.0.1]) by d23av04.au.ibm.com (8.14.4/8.13.1/NCO v10.0 AVout) with ESMTP id o9K8tAHU000911; Wed, 20 Oct 2010 19:55:11 +1100 Received: from krkumar2.in.ibm.com ([9.124.209.222]) by d23av04.au.ibm.com (8.14.4/8.13.1/NCO v10.0 AVin) with ESMTP id o9K8t6XV000839; Wed, 20 Oct 2010 19:55:06 +1100 From: Krishna Kumar To: rusty@rustcorp.com.au, davem@davemloft.net, mst@redhat.com Cc: kvm@vger.kernel.org, arnd@arndb.de, netdev@vger.kernel.org, avi@redhat.com, anthony@codemonkey.ws, eric.dumazet@gmail.com, Krishna Kumar Date: Wed, 20 Oct 2010 14:25:05 +0530 Message-Id: <20101020085505.15579.94591.sendpatchset@krkumar2.in.ibm.com> In-Reply-To: <20101020085452.15579.76002.sendpatchset@krkumar2.in.ibm.com> References: <20101020085452.15579.76002.sendpatchset@krkumar2.in.ibm.com> Subject: [v3 RFC PATCH 2/4] Changes for virtio-net Sender: kvm-owner@vger.kernel.org Precedence: bulk 
List-ID: X-Mailing-List: kvm@vger.kernel.org X-Greylist: IP, sender and recipient auto-whitelisted, not delayed by milter-greylist-4.2.3 (demeter1.kernel.org [140.211.167.41]); Wed, 20 Oct 2010 08:56:12 +0000 (UTC) diff -ruNp org/include/linux/virtio_net.h new.dynamic.optimize_vhost/include/linux/virtio_net.h --- org/include/linux/virtio_net.h 2010-10-11 10:20:22.000000000 +0530 +++ new.dynamic.optimize_vhost/include/linux/virtio_net.h 2010-10-19 13:24:38.000000000 +0530 @@ -7,6 +7,9 @@ #include #include +/* Maximum number of TX queues supported */ +#define VIRTIO_MAX_SQ 32 + /* The feature bitmap for virtio net */ #define VIRTIO_NET_F_CSUM 0 /* Host handles pkts w/ partial csum */ #define VIRTIO_NET_F_GUEST_CSUM 1 /* Guest handles pkts w/ partial csum */ @@ -26,6 +29,7 @@ #define VIRTIO_NET_F_CTRL_RX 18 /* Control channel RX mode support */ #define VIRTIO_NET_F_CTRL_VLAN 19 /* Control channel VLAN filtering */ #define VIRTIO_NET_F_CTRL_RX_EXTRA 20 /* Extra RX mode control support */ +#define VIRTIO_NET_F_NUMTXQS 21 /* Device supports multiple TX queues */ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ @@ -34,6 +38,8 @@ struct virtio_net_config { __u8 mac[6]; /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */ __u16 status; + /* number of transmit queues */ + __u16 numtxqs; } __attribute__((packed)); /* This is the first element of the scatter-gather list. 
If you don't diff -ruNp org/drivers/net/virtio_net.c new.dynamic.optimize_vhost/drivers/net/virtio_net.c --- org/drivers/net/virtio_net.c 2010-10-11 10:20:02.000000000 +0530 +++ new.dynamic.optimize_vhost/drivers/net/virtio_net.c 2010-10-19 17:01:53.000000000 +0530 @@ -40,11 +40,24 @@ module_param(gso, bool, 0444); #define VIRTNET_SEND_COMMAND_SG_MAX 2 +/* Our representation of a send virtqueue */ +struct send_queue { + struct virtqueue *svq; + + /* TX: fragments + linear part + virtio header */ + struct scatterlist tx_sg[MAX_SKB_FRAGS + 2]; +}; + struct virtnet_info { + struct send_queue **sq; + struct napi_struct napi ____cacheline_aligned_in_smp; + + /* read-mostly variables */ + int numtxqs ____cacheline_aligned_in_smp; struct virtio_device *vdev; - struct virtqueue *rvq, *svq, *cvq; + struct virtqueue *rvq; + struct virtqueue *cvq; struct net_device *dev; - struct napi_struct napi; unsigned int status; /* Number of input buffers, and max we've ever had. */ @@ -62,9 +75,8 @@ struct virtnet_info { /* Chain pages by the private ptr. */ struct page *pages; - /* fragments + linear part + virtio header */ + /* RX: fragments + linear part + virtio header */ struct scatterlist rx_sg[MAX_SKB_FRAGS + 2]; - struct scatterlist tx_sg[MAX_SKB_FRAGS + 2]; }; struct skb_vnet_hdr { @@ -120,12 +132,13 @@ static struct page *get_a_page(struct vi static void skb_xmit_done(struct virtqueue *svq) { struct virtnet_info *vi = svq->vdev->priv; + int qnum = svq->queue_index - 1; /* 0 is RX vq */ /* Suppress further interrupts. */ virtqueue_disable_cb(svq); /* We were probably waiting for more output buffers. 
*/ - netif_wake_queue(vi->dev); + netif_wake_subqueue(vi->dev, qnum); } static void set_skb_frag(struct sk_buff *skb, struct page *page, @@ -495,12 +508,13 @@ again: return received; } -static unsigned int free_old_xmit_skbs(struct virtnet_info *vi) +static unsigned int free_old_xmit_skbs(struct virtnet_info *vi, + struct virtqueue *svq) { struct sk_buff *skb; unsigned int len, tot_sgs = 0; - while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) { + while ((skb = virtqueue_get_buf(svq, &len)) != NULL) { pr_debug("Sent skb %p\n", skb); vi->dev->stats.tx_bytes += skb->len; vi->dev->stats.tx_packets++; @@ -510,7 +524,8 @@ static unsigned int free_old_xmit_skbs(s return tot_sgs; } -static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb) +static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb, + struct virtqueue *svq, struct scatterlist *tx_sg) { struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb); const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; @@ -548,12 +563,12 @@ static int xmit_skb(struct virtnet_info /* Encode metadata header at front. */ if (vi->mergeable_rx_bufs) - sg_set_buf(vi->tx_sg, &hdr->mhdr, sizeof hdr->mhdr); + sg_set_buf(tx_sg, &hdr->mhdr, sizeof hdr->mhdr); else - sg_set_buf(vi->tx_sg, &hdr->hdr, sizeof hdr->hdr); + sg_set_buf(tx_sg, &hdr->hdr, sizeof hdr->hdr); - hdr->num_sg = skb_to_sgvec(skb, vi->tx_sg + 1, 0, skb->len) + 1; - return virtqueue_add_buf(vi->svq, vi->tx_sg, hdr->num_sg, + hdr->num_sg = skb_to_sgvec(skb, tx_sg + 1, 0, skb->len) + 1; + return virtqueue_add_buf(svq, tx_sg, hdr->num_sg, 0, skb); } @@ -561,31 +576,34 @@ static netdev_tx_t start_xmit(struct sk_ { struct virtnet_info *vi = netdev_priv(dev); int capacity; + int qnum = skb_get_queue_mapping(skb); + struct virtqueue *svq = vi->sq[qnum]->svq; /* Free up any pending old buffers before queueing new ones. 
*/ - free_old_xmit_skbs(vi); + free_old_xmit_skbs(vi, svq); /* Try to transmit */ - capacity = xmit_skb(vi, skb); + capacity = xmit_skb(vi, skb, svq, vi->sq[qnum]->tx_sg); /* This can happen with OOM and indirect buffers. */ if (unlikely(capacity < 0)) { if (net_ratelimit()) { if (likely(capacity == -ENOMEM)) { dev_warn(&dev->dev, - "TX queue failure: out of memory\n"); + "TXQ (%d) failure: out of memory\n", + qnum); } else { dev->stats.tx_fifo_errors++; dev_warn(&dev->dev, - "Unexpected TX queue failure: %d\n", - capacity); + "Unexpected TXQ (%d) failure: %d\n", + qnum, capacity); } } dev->stats.tx_dropped++; kfree_skb(skb); return NETDEV_TX_OK; } - virtqueue_kick(vi->svq); + virtqueue_kick(svq); /* Don't wait up for transmitted skbs to be freed. */ skb_orphan(skb); @@ -594,13 +612,13 @@ static netdev_tx_t start_xmit(struct sk_ /* Apparently nice girls don't return TX_BUSY; stop the queue * before it gets out of hand. Naturally, this wastes entries. */ if (capacity < 2+MAX_SKB_FRAGS) { - netif_stop_queue(dev); - if (unlikely(!virtqueue_enable_cb(vi->svq))) { + netif_stop_subqueue(dev, qnum); + if (unlikely(!virtqueue_enable_cb(svq))) { /* More just got used, free them then recheck. 
*/ - capacity += free_old_xmit_skbs(vi); + capacity += free_old_xmit_skbs(vi, svq); if (capacity >= 2+MAX_SKB_FRAGS) { - netif_start_queue(dev); - virtqueue_disable_cb(vi->svq); + netif_start_subqueue(dev, qnum); + virtqueue_disable_cb(svq); } } } @@ -871,10 +889,10 @@ static void virtnet_update_status(struct if (vi->status & VIRTIO_NET_S_LINK_UP) { netif_carrier_on(vi->dev); - netif_wake_queue(vi->dev); + netif_tx_wake_all_queues(vi->dev); } else { netif_carrier_off(vi->dev); - netif_stop_queue(vi->dev); + netif_tx_stop_all_queues(vi->dev); } } @@ -885,18 +903,122 @@ static void virtnet_config_changed(struc virtnet_update_status(vi); } +#define MAX_DEVICE_NAME 16 +static int initialize_vqs(struct virtnet_info *vi, int numtxqs) +{ + vq_callback_t **callbacks; + struct virtqueue **vqs; + int i, err = -ENOMEM; + int totalvqs; + char **names; + + vi->sq = kzalloc(numtxqs * sizeof(*vi->sq), GFP_KERNEL); + if (!vi->sq) + goto out; + for (i = 0; i < numtxqs; i++) { + vi->sq[i] = kzalloc(sizeof(*vi->sq[i]), GFP_KERNEL); + if (!vi->sq[i]) + goto out; + } + + /* setup initial send queue parameters */ + for (i = 0; i < numtxqs; i++) + sg_init_table(vi->sq[i]->tx_sg, ARRAY_SIZE(vi->sq[i]->tx_sg)); + + /* + * We expect 1 RX virtqueue followed by 'numtxqs' TX virtqueues, and + * optionally one control virtqueue. 
+ */ + totalvqs = 1 + numtxqs + + virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ); + + /* Setup parameters for find_vqs */ + vqs = kmalloc(totalvqs * sizeof(*vqs), GFP_KERNEL); + callbacks = kmalloc(totalvqs * sizeof(*callbacks), GFP_KERNEL); + names = kzalloc(totalvqs * sizeof(*names), GFP_KERNEL); + if (!vqs || !callbacks || !names) + goto free_mem; + + /* Parameters for recv virtqueue */ + callbacks[0] = skb_recv_done; + names[0] = "input"; + + /* Parameters for send virtqueues */ + for (i = 1; i <= numtxqs; i++) { + callbacks[i] = skb_xmit_done; + names[i] = kmalloc(MAX_DEVICE_NAME * sizeof(*names[i]), + GFP_KERNEL); + if (!names[i]) + goto free_mem; + sprintf(names[i], "output.%d", i - 1); + } + + /* Parameters for control virtqueue, if any */ + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) { + callbacks[i] = NULL; + names[i] = "control"; + } + + err = vi->vdev->config->find_vqs(vi->vdev, totalvqs, vqs, callbacks, + (const char **)names); + if (err) + goto free_mem; + + vi->rvq = vqs[0]; + for (i = 0; i < numtxqs; i++) + vi->sq[i]->svq = vqs[i + 1]; + + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) { + vi->cvq = vqs[i + 1]; + + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN)) + vi->dev->features |= NETIF_F_HW_VLAN_FILTER; + } + +free_mem: + if (names) { + for (i = 1; i <= numtxqs; i++) + kfree(names[i]); + kfree(names); + } + + kfree(callbacks); + kfree(vqs); + +out: + if (err) { + for (i = 0; i < numtxqs; i++) + kfree(vi->sq[i]); + kfree(vi->sq); + } + + return err; +} + static int virtnet_probe(struct virtio_device *vdev) { - int err; + int i, err; + u16 numtxqs; struct net_device *dev; struct virtnet_info *vi; - struct virtqueue *vqs[3]; - vq_callback_t *callbacks[] = { skb_recv_done, skb_xmit_done, NULL}; - const char *names[] = { "input", "output", "control" }; - int nvqs; + + /* + * Find if host passed the number of transmit queues supported + * by the device + */ + err = virtio_config_val(vdev, VIRTIO_NET_F_NUMTXQS, + 
offsetof(struct virtio_net_config, numtxqs), + &numtxqs); + + /* We need at least one txq */ + if (err || !numtxqs) + numtxqs = 1; + + if (numtxqs > VIRTIO_MAX_SQ) + return -EINVAL; /* Allocate ourselves a network device with room for our info */ - dev = alloc_etherdev(sizeof(struct virtnet_info)); + dev = alloc_etherdev_mq(sizeof(struct virtnet_info), numtxqs); if (!dev) return -ENOMEM; @@ -940,9 +1062,9 @@ static int virtnet_probe(struct virtio_d vi->vdev = vdev; vdev->priv = vi; vi->pages = NULL; + vi->numtxqs = numtxqs; INIT_DELAYED_WORK(&vi->refill, refill_work); sg_init_table(vi->rx_sg, ARRAY_SIZE(vi->rx_sg)); - sg_init_table(vi->tx_sg, ARRAY_SIZE(vi->tx_sg)); /* If we can receive ANY GSO packets, we must allocate large ones. */ if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || @@ -953,23 +1075,10 @@ static int virtnet_probe(struct virtio_d if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) vi->mergeable_rx_bufs = true; - /* We expect two virtqueues, receive then send, - * and optionally control. */ - nvqs = virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ? 
3 : 2; - - err = vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names); + /* Initialize our rx/tx queue parameters, and invoke find_vqs */ + err = initialize_vqs(vi, numtxqs); if (err) - goto free; - - vi->rvq = vqs[0]; - vi->svq = vqs[1]; - - if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) { - vi->cvq = vqs[2]; - - if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN)) - dev->features |= NETIF_F_HW_VLAN_FILTER; - } + goto free_netdev; err = register_netdev(dev); if (err) { @@ -986,6 +1095,9 @@ static int virtnet_probe(struct virtio_d goto unregister; } + dev_info(&dev->dev, "(virtio-net) Allocated 1 RX and %d TX vq's\n", + numtxqs); + vi->status = VIRTIO_NET_S_LINK_UP; virtnet_update_status(vi); netif_carrier_on(dev); @@ -998,7 +1110,10 @@ unregister: cancel_delayed_work_sync(&vi->refill); free_vqs: vdev->config->del_vqs(vdev); -free: + for (i = 0; i < numtxqs; i++) + kfree(vi->sq[i]); + kfree(vi->sq); +free_netdev: free_netdev(dev); return err; } @@ -1006,12 +1121,21 @@ free: static void free_unused_bufs(struct virtnet_info *vi) { void *buf; - while (1) { - buf = virtqueue_detach_unused_buf(vi->svq); - if (!buf) - break; - dev_kfree_skb(buf); + int i; + + for (i = 0; i < vi->numtxqs; i++) { + struct virtqueue *svq = vi->sq[i]->svq; + + while (1) { + buf = virtqueue_detach_unused_buf(svq); + if (!buf) + break; + dev_kfree_skb(buf); + } + kfree(vi->sq[i]); } + kfree(vi->sq); + while (1) { buf = virtqueue_detach_unused_buf(vi->rvq); if (!buf) @@ -1059,7 +1183,7 @@ static unsigned int features[] = { VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, - VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, + VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, VIRTIO_NET_F_NUMTXQS, }; static struct virtio_driver virtio_net_driver = {