From patchwork Fri Apr  9 09:37:45 2010
X-Patchwork-Submitter: "Xin, Xiaohui"
X-Patchwork-Id: 91677
From: xiaohui.xin@intel.com
To: netdev@vger.kernel.org, kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	mst@redhat.com, mingo@elte.hu, davem@davemloft.net, jdike@linux.intel.com
Cc: Xin Xiaohui
Subject: [RFC][PATCH v3 3/3] Let host NIC driver DMA to guest user space.
Date: Fri,  9 Apr 2010 17:37:45 +0800
Message-Id: <1270805865-16901-4-git-send-email-xiaohui.xin@intel.com>
In-Reply-To: <1270805865-16901-3-git-send-email-xiaohui.xin@intel.com>
References: <1270805865-16901-1-git-send-email-xiaohui.xin@intel.com>
	<1270805865-16901-2-git-send-email-xiaohui.xin@intel.com>
	<1270805865-16901-3-git-send-email-xiaohui.xin@intel.com>

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 94958c1..ba48eb0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -485,6 +485,17 @@ struct netdev_queue {
 	unsigned long		tx_dropped;
 } ____cacheline_aligned_in_smp;
 
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+struct mpassthru_port {
+	int		hdr_len;
+	int		data_len;
+	int		npages;
+	unsigned	flags;
+	struct socket	*sock;
+	struct skb_user_page *(*ctor)(struct mpassthru_port *,
+				      struct sk_buff *, int);
+};
+#endif
 /*
  * This structure defines the management hooks for network devices.
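To illustrate what a packet-split NIC driver is expected to provide through the new hook, here is a sketch (not part of the patch; the example_ prefix marks hypothetical code, and the sizes simply mirror the igb-style defaults that netdev_mp_port_prep() below falls back to):

/* Illustrative sketch only: a packet-split driver reports the
 * header/payload layout it programs into its RX descriptors.
 */
static int example_ndo_mp_port_prep(struct net_device *dev,
				    struct mpassthru_port *port)
{
	port->hdr_len  = 128;	/* header bytes per RX descriptor */
	port->data_len = 2048;	/* payload bytes per RX descriptor */
	port->npages   = 1;	/* pages backing one payload buffer */
	return 0;
}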
@@ -636,6 +647,10 @@ struct net_device_ops {
 	int			(*ndo_fcoe_ddp_done)(struct net_device *dev,
 						     u16 xid);
 #endif
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+	int			(*ndo_mp_port_prep)(struct net_device *dev,
+						struct mpassthru_port *port);
+#endif
 };
 
 /*
@@ -891,7 +906,8 @@ struct net_device
 	struct macvlan_port	*macvlan_port;
 	/* GARP */
 	struct garp_port	*garp_port;
-
+	/* mpassthru */
+	struct mpassthru_port	*mp_port;
 	/* class/net/name entry */
 	struct device		dev;
 	/* space for optional statistics and wireless sysfs groups */
@@ -2013,6 +2029,55 @@ static inline u32 dev_ethtool_get_flags(struct net_device *dev)
 		return 0;
 	return dev->ethtool_ops->get_flags(dev);
 }
-#endif /* __KERNEL__ */
 
+/* To support zero-copy between a user-space application and the NIC
+ * driver, we ask the NIC driver for the capabilities it can provide,
+ * especially for packet-split mode.  For now we only ask for the
+ * header size and the payload size a single descriptor may carry.
+ */
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+static inline int netdev_mp_port_prep(struct net_device *dev,
+				      struct mpassthru_port *port)
+{
+	int rc;
+	int npages, data_len;
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	/* needed by packet split */
+	if (ops->ndo_mp_port_prep) {
+		rc = ops->ndo_mp_port_prep(dev, port);
+		if (rc)
+			return rc;
+	} else {
+		/* If the NIC driver does not report its parameters,
+		 * assume igb-like defaults.
+		 */
+		port->hdr_len = 128;
+		port->data_len = 2048;
+		port->npages = 1;
+	}
+
+	if (port->hdr_len <= 0)
+		goto err;
+
+	npages = port->npages;
+	data_len = port->data_len;
+	if (npages <= 0 || npages > MAX_SKB_FRAGS ||
+	    (data_len < PAGE_SIZE * (npages - 1) ||
+	     data_len > PAGE_SIZE * npages))
+		goto err;
+
+	return 0;
+err:
+	dev_warn(&dev->dev, "invalid page constructor parameters\n");
+
+	return -EINVAL;
+}
+
+extern int netdev_mp_port_attach(struct net_device *dev,
+				 struct mpassthru_port *port);
+extern void netdev_mp_port_detach(struct net_device *dev);
+#endif /* CONFIG_VHOST_PASSTHRU */
+
+#endif /* __KERNEL__ */
 #endif /* _LINUX_NETDEVICE_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index df7b23a..e59fa57 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -209,6 +209,13 @@ struct skb_shared_info {
 	void *		destructor_arg;
 };
 
+struct skb_user_page {
+	u8			*start;
+	int			size;
+	struct skb_frag_struct	*frags;
+	struct skb_shared_info	*ushinfo;
+	void			(*dtor)(struct skb_user_page *);
+};
 /* We divide dataref into two halves.  The higher 16 bits hold references
  * to the payload part of skb->data.  The lower 16 bits hold references to
  * the entire skb->data.  A clone of a headerless skb holds the length of
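A sketch of the attach sequence the mpassthru backend is expected to follow with the hooks declared above (assumed usage, not part of the patch; mp_mutex is the backend's own lock and example_attach() is hypothetical):

static DEFINE_MUTEX(mp_mutex);	/* backend-side lock (assumption) */

static int example_attach(struct net_device *dev,
			  struct mpassthru_port *port)
{
	int rc;

	rc = netdev_mp_port_prep(dev, port);	/* query + validate split */
	if (rc)
		return rc;

	mutex_lock(&mp_mutex);
	rc = netdev_mp_port_attach(dev, port);	/* publish under RCU */
	mutex_unlock(&mp_mutex);
	return rc;
}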
@@ -441,17 +448,18 @@ extern void kfree_skb(struct sk_buff *skb);
 extern void	       consume_skb(struct sk_buff *skb);
 extern void	       __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-				   gfp_t priority, int fclone, int node);
+				   gfp_t priority, int fclone,
+				   int node, struct net_device *dev);
 static inline struct sk_buff *alloc_skb(unsigned int size,
 					gfp_t priority)
 {
-	return __alloc_skb(size, priority, 0, -1);
+	return __alloc_skb(size, priority, 0, -1, NULL);
 }
 
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 					       gfp_t priority)
 {
-	return __alloc_skb(size, priority, 1, -1);
+	return __alloc_skb(size, priority, 1, -1, NULL);
 }
 
 extern int skb_recycle_check(struct sk_buff *skb, int skb_size);
@@ -1509,6 +1517,22 @@ static inline void netdev_free_page(struct net_device *dev, struct page *page)
 	__free_page(page);
 }
 
+extern struct skb_user_page *netdev_alloc_user_pages(struct net_device *dev,
+			struct sk_buff *skb, int npages);
+
+static inline struct skb_user_page *netdev_alloc_user_page(
+			struct net_device *dev,
+			struct sk_buff *skb, unsigned int size)
+{
+	struct skb_user_page *user;
+	int npages = (size < PAGE_SIZE) ? 1 : (size / PAGE_SIZE);
+
+	user = netdev_alloc_user_pages(dev, skb, npages);
+	if (likely(user))
+		return user;
+	return NULL;
+}
+
 /**
  *	skb_clone_writable - is the header of a clone writable
  *	@skb: buffer to check
diff --git a/net/core/dev.c b/net/core/dev.c
index b8f74cf..b50bdcb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2265,6 +2265,61 @@ void netif_nit_deliver(struct sk_buff *skb)
 	rcu_read_unlock();
 }
 
+/* Add a hook to intercept zero-copy packets and queue them directly
+ * on the attached socket's receive queue.
+ */
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+int netdev_mp_port_attach(struct net_device *dev,
+			  struct mpassthru_port *port)
+{
+	/* locked by mp_mutex */
+	if (rcu_dereference(dev->mp_port))
+		return -EBUSY;
+
+	rcu_assign_pointer(dev->mp_port, port);
+
+	return 0;
+}
+EXPORT_SYMBOL(netdev_mp_port_attach);
+
+void netdev_mp_port_detach(struct net_device *dev)
+{
+	/* locked by mp_mutex */
+	if (!rcu_dereference(dev->mp_port))
+		return;
+
+	rcu_assign_pointer(dev->mp_port, NULL);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL(netdev_mp_port_detach);
+
+static inline struct sk_buff *handle_mpassthru(struct sk_buff *skb,
+					       struct packet_type **pt_prev,
+					       int *ret,
+					       struct net_device *orig_dev)
+{
+	struct mpassthru_port *mp_port = NULL;
+	struct sock *sk = NULL;
+
+	if (skb->dev)
+		mp_port = skb->dev->mp_port;
+	if (!mp_port)
+		return skb;
+
+	if (*pt_prev) {
+		*ret = deliver_skb(skb, *pt_prev, orig_dev);
+		*pt_prev = NULL;
+	}
+
+	sk = mp_port->sock->sk;
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+	sk->sk_data_ready(sk, skb->len);
+
+	return NULL;
+}
+#else
+#define handle_mpassthru(skb, pt_prev, ret, orig_dev)	(skb)
+#endif
+
 /**
  *	netif_receive_skb - process receive buffer from network
  *	@skb: buffer to process
@@ -2342,6 +2397,9 @@ int netif_receive_skb(struct sk_buff *skb)
 		goto out;
 ncls:
 #endif
+	skb = handle_mpassthru(skb, &pt_prev, &ret, orig_dev);
+	if (!skb)
+		goto out;
 
 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
 	if (!skb)
@@ -2455,6 +2513,11 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	if (skb_is_gso(skb) || skb_has_frags(skb))
 		goto normal;
 
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+	if (skb->dev && skb->dev->mp_port)
+		goto normal;
+#endif
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, head, list) {
 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
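The page constructor itself lives in the mpassthru device (patch 2/3). Purely as a shape sketch of what the ctor callback, invoked by netdev_alloc_user_pages() in net/core/skbuff.c below, is expected to return (all example_ helpers are hypothetical):

/* hypothetical helpers, provided by the mpassthru device */
static struct skb_user_page *example_get_posted_buffer(struct mpassthru_port *port,
							int npages);
static void example_dtor(struct skb_user_page *user);

/* Shape-only sketch of a page constructor: return a buffer whose
 * header area (start/size) and payload pages (frags) were pinned from
 * guest user space when the guest posted its RX buffers.  On failure
 * __alloc_skb() falls back to a normal kmalloc() buffer.
 */
static struct skb_user_page *example_ctor(struct mpassthru_port *port,
					  struct sk_buff *skb, int npages)
{
	struct skb_user_page *user;

	user = example_get_posted_buffer(port, npages);
	if (!user)
		return NULL;

	user->dtor = example_dtor;	/* invoked from skb_release_data() */
	return user;
}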
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 80a9616..e684898 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -170,13 +170,15 @@ EXPORT_SYMBOL(skb_under_panic);
  *	%GFP_ATOMIC.
  */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-			    int fclone, int node)
+			    int fclone, int node, struct net_device *dev)
 {
 	struct kmem_cache *cache;
 	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	u8 *data;
-
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+	struct skb_user_page *user = NULL;
+#endif
 	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
 
 	/* Get the HEAD */
@@ -185,8 +187,26 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 		goto out;
 
 	size = SKB_DATA_ALIGN(size);
-	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
-			gfp_mask, node);
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+	if (!dev || !dev->mp_port) {	/* legacy allocation path */
+#endif
+		data = kmalloc_node_track_caller(
+				size + sizeof(struct skb_shared_info),
+				gfp_mask, node);
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+	} else {	/* allocation may come from the device's page constructor */
+		user = netdev_alloc_user_page(dev, skb, size);
+		if (!user) {
+			data = kmalloc_node_track_caller(
+					size + sizeof(struct skb_shared_info),
+					gfp_mask, node);
+			printk(KERN_INFO "can't alloc user buffer.\n");
+		} else {
+			data = user->start;
+			size = SKB_DATA_ALIGN(user->size);
+		}
+	}
+#endif
 	if (!data)
 		goto nodata;
 
@@ -208,6 +228,11 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	skb->mac_header = ~0U;
 #endif
 
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+	if (user)
+		memcpy(user->ushinfo, skb_shinfo(skb),
+		       sizeof(struct skb_shared_info));
+#endif
 	/* make sure we initialize shinfo sequentially */
 	shinfo = skb_shinfo(skb);
 	atomic_set(&shinfo->dataref, 1);
@@ -231,6 +256,10 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 		child->fclone = SKB_FCLONE_UNAVAILABLE;
 	}
 
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+	shinfo->destructor_arg = user;
+#endif
+
 out:
 	return skb;
 nodata:
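On the release side, the destructor recorded in skb_user_page runs from the skb_release_data() change further below. A hedged sketch, assuming the constructor pinned the guest pages with get_user_pages() and embeds skb_user_page in its own tracking structure (all example_ names hypothetical):

struct example_buffer_info {		/* hypothetical container */
	struct skb_user_page	user;
	struct page		*pages[MAX_SKB_FRAGS];
	int			nr_pages;
};

/* Illustrative destructor: unpin the guest payload pages and let the
 * backend signal RX completion to the guest.
 */
static void example_dtor(struct skb_user_page *user)
{
	struct example_buffer_info *info =
		container_of(user, struct example_buffer_info, user);
	int i;

	for (i = 0; i < info->nr_pages; i++)
		put_page(info->pages[i]);	/* undo get_user_pages() */

	/* ... notify the guest that this RX buffer has completed ... */
}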
@@ -259,7 +288,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
 	struct sk_buff *skb;
 
-	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
+	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node, dev);
 	if (likely(skb)) {
 		skb_reserve(skb, NET_SKB_PAD);
 		skb->dev = dev;
@@ -278,6 +307,24 @@ struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(__netdev_alloc_page);
 
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+struct skb_user_page *netdev_alloc_user_pages(struct net_device *dev,
+			struct sk_buff *skb, int npages)
+{
+	struct mpassthru_port *ctor;
+	struct skb_user_page *user = NULL;
+
+	ctor = rcu_dereference(dev->mp_port);
+	if (!ctor)
+		goto out;
+	BUG_ON(npages > ctor->npages);
+	user = ctor->ctor(ctor, skb, npages);
+out:
+	return user;
+}
+EXPORT_SYMBOL(netdev_alloc_user_pages);
+#endif
+
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 		     int size)
 {
@@ -338,6 +385,10 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 
 static void skb_release_data(struct sk_buff *skb)
 {
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+	struct skb_user_page *user = skb_shinfo(skb)->destructor_arg;
+#endif
+
 	if (!skb->cloned ||
 	    !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
 			       &skb_shinfo(skb)->dataref)) {
@@ -349,7 +400,10 @@ static void skb_release_data(struct sk_buff *skb)
 
 		if (skb_has_frags(skb))
 			skb_drop_fraglist(skb);
-
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+		if (skb->dev && skb->dev->mp_port && user && user->dtor)
+			user->dtor(user);
+#endif
 		kfree(skb->head);
 	}
 }
@@ -503,8 +557,14 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size)
 	if (skb_shared(skb) || skb_cloned(skb))
 		return 0;
 
-	skb_release_head_state(skb);
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+	if (skb->dev && skb->dev->mp_port)
+		return 0;
+#endif
+
 	shinfo = skb_shinfo(skb);
+
+	skb_release_head_state(skb);
 	atomic_set(&shinfo->dataref, 1);
 	shinfo->nr_frags = 0;
 	shinfo->gso_size = 0;
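Finally, the teardown counterpart to the attach sketch earlier, under the same assumed mp_mutex; that port->sock came from sockfd_lookup() is an assumption about the mpassthru device, and example_detach() is hypothetical:

/* Illustrative teardown: unpublish the port, wait out RCU readers,
 * then release the backing socket and the port itself.
 */
static void example_detach(struct net_device *dev,
			   struct mpassthru_port *port)
{
	mutex_lock(&mp_mutex);
	netdev_mp_port_detach(dev);	/* clears dev->mp_port + synchronize_rcu() */
	mutex_unlock(&mp_mutex);

	/* no RX-path reader can see the port any more */
	sockfd_put(port->sock);		/* assuming sockfd_lookup() reference */
	kfree(port);
}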