@@ -485,6 +485,17 @@ struct netdev_queue {
unsigned long tx_dropped;
} ____cacheline_aligned_in_smp;
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+struct mpassthru_port {
+ int hdr_len;
+ int data_len;
+ int npages;
+ unsigned flags;
+ struct socket *sock;
+ struct skb_user_page *(*ctor)(struct mpassthru_port *,
+ struct sk_buff *, int);
+};
+#endif
/*
* This structure defines the management hooks for network devices.
@@ -636,6 +647,10 @@ struct net_device_ops {
int (*ndo_fcoe_ddp_done)(struct net_device *dev,
u16 xid);
#endif
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ int (*ndo_mp_port_prep)(struct net_device *dev,
+ struct mpassthru_port *port);
+#endif
};
/*
@@ -891,7 +906,8 @@ struct net_device
struct macvlan_port *macvlan_port;
/* GARP */
struct garp_port *garp_port;
-
+ /* mpassthru */
+ struct mpassthru_port *mp_port;
/* class/net/name entry */
struct device dev;
/* space for optional statistics and wireless sysfs groups */
@@ -2013,6 +2029,55 @@ static inline u32 dev_ethtool_get_flags(struct net_device *dev)
return 0;
return dev->ethtool_ops->get_flags(dev);
}
-#endif /* __KERNEL__ */
+/* To support zero-copy between user space application and NIC driver,
+ * we'd better ask NIC driver for the capability it can provide, especially
+ * for packet split mode, now we only ask for the header size, and the
+ * payload once a descriptor may carry.
+ */
+
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+static inline int netdev_mp_port_prep(struct net_device *dev,
+ struct mpassthru_port *port)
+{
+ int rc;
+ int npages, data_len;
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ /* needed by packet split */
+ if (ops->ndo_mp_port_prep) {
+ rc = ops->ndo_mp_port_prep(dev, port);
+ if (rc)
+ return rc;
+ } else {
+ /* If the NIC driver did not report this,
+ * then we try to use it as igb driver.
+ */
+ port->hdr_len = 128;
+ port->data_len = 2048;
+ port->npages = 1;
+ }
+
+ if (port->hdr_len <= 0)
+ goto err;
+
+ npages = port->npages;
+ data_len = port->data_len;
+ if (npages <= 0 || npages > MAX_SKB_FRAGS ||
+ (data_len < PAGE_SIZE * (npages - 1) ||
+ data_len > PAGE_SIZE * npages))
+ goto err;
+
+ return 0;
+err:
+ dev_warn(&dev->dev, "invalid page constructor parameters\n");
+
+ return -EINVAL;
+}
+
+extern int netdev_mp_port_attach(struct net_device *dev,
+ struct mpassthru_port *port);
+extern void netdev_mp_port_detach(struct net_device *dev);
+#endif /* CONFIG_VHOST_PASSTHRU */
+#endif /* __KERNEL__ */
#endif /* _LINUX_NETDEVICE_H */
@@ -209,6 +209,13 @@ struct skb_shared_info {
void * destructor_arg;
};
+struct skb_user_page {
+ u8 *start;
+ int size;
+ struct skb_frag_struct *frags;
+ struct skb_shared_info *ushinfo;
+ void (*dtor)(struct skb_user_page *);
+};
/* We divide dataref into two halves. The higher 16 bits hold references
* to the payload part of skb->data. The lower 16 bits hold references to
* the entire skb->data. A clone of a headerless skb holds the length of
@@ -441,17 +448,18 @@ extern void kfree_skb(struct sk_buff *skb);
extern void consume_skb(struct sk_buff *skb);
extern void __kfree_skb(struct sk_buff *skb);
extern struct sk_buff *__alloc_skb(unsigned int size,
- gfp_t priority, int fclone, int node);
+ gfp_t priority, int fclone,
+ int node, struct net_device *dev);
static inline struct sk_buff *alloc_skb(unsigned int size,
gfp_t priority)
{
- return __alloc_skb(size, priority, 0, -1);
+ return __alloc_skb(size, priority, 0, -1, NULL);
}
static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
gfp_t priority)
{
- return __alloc_skb(size, priority, 1, -1);
+ return __alloc_skb(size, priority, 1, -1, NULL);
}
extern int skb_recycle_check(struct sk_buff *skb, int skb_size);
@@ -1509,6 +1517,22 @@ static inline void netdev_free_page(struct net_device *dev, struct page *page)
__free_page(page);
}
+extern struct skb_user_page *netdev_alloc_user_pages(struct net_device *dev,
+ struct sk_buff *skb, int npages);
+
+static inline struct skb_user_page *netdev_alloc_user_page(
+ struct net_device *dev,
+ struct sk_buff *skb, unsigned int size)
+{
+ struct skb_user_page *user;
+ int npages = (size < PAGE_SIZE) ? 1 : (size / PAGE_SIZE);
+
+ user = netdev_alloc_user_pages(dev, skb, npages);
+ if (likely(user))
+ return user;
+ return NULL;
+}
+
/**
* skb_clone_writable - is the header of a clone writable
* @skb: buffer to check
@@ -2265,6 +2265,61 @@ void netif_nit_deliver(struct sk_buff *skb)
rcu_read_unlock();
}
+/* Add a hook to intercept zero-copy packets, and insert it
+ * to the socket queue specially.
+ */
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+int netdev_mp_port_attach(struct net_device *dev,
+ struct mpassthru_port *port)
+{
+ /* locked by mp_mutex */
+ if (rcu_dereference(dev->mp_port))
+ return -EBUSY;
+
+ rcu_assign_pointer(dev->mp_port, port);
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_mp_port_attach);
+
+void netdev_mp_port_detach(struct net_device *dev)
+{
+ /* locked by mp_mutex */
+ if (!rcu_dereference(dev->mp_port))
+ return;
+
+ rcu_assign_pointer(dev->mp_port, NULL);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(netdev_mp_port_detach);
+
+static inline struct sk_buff *handle_mpassthru(struct sk_buff *skb,
+ struct packet_type **pt_prev,
+ int *ret, struct net_device *orig_dev)
+{
+ struct mpassthru_port *mp_port = NULL;
+ struct sock *sk = NULL;
+
+ if (skb->dev)
+ mp_port = skb->dev->mp_port;
+ if (!mp_port)
+ return skb;
+
+ if (*pt_prev) {
+ *ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = NULL;
+ }
+
+ sk = mp_port->sock->sk;
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_data_ready(sk, skb->len);
+
+ return NULL;
+}
+#else
+#define handle_mpassthru(skb, pt_prev, ret, orig_dev) (skb)
+#endif
+
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
@@ -2342,6 +2397,9 @@ int netif_receive_skb(struct sk_buff *skb)
goto out;
ncls:
#endif
+ skb = handle_mpassthru(skb, &pt_prev, &ret, orig_dev);
+ if (!skb)
+ goto out;
skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
if (!skb)
@@ -2455,6 +2513,11 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
if (skb_is_gso(skb) || skb_has_frags(skb))
goto normal;
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ if (skb->dev && skb->dev->mp_port)
+ goto normal;
+#endif
+
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
if (ptype->type != type || ptype->dev || !ptype->gro_receive)
@@ -170,13 +170,15 @@ EXPORT_SYMBOL(skb_under_panic);
* %GFP_ATOMIC.
*/
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
- int fclone, int node)
+ int fclone, int node, struct net_device *dev)
{
struct kmem_cache *cache;
struct skb_shared_info *shinfo;
struct sk_buff *skb;
u8 *data;
-
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ struct skb_user_page *user = NULL;
+#endif
cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
/* Get the HEAD */
@@ -185,8 +187,26 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
goto out;
size = SKB_DATA_ALIGN(size);
- data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
- gfp_mask, node);
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ if (!dev || !dev->mp_port) { /* Legacy alloc func */
+#endif
+ data = kmalloc_node_track_caller(
+ size + sizeof(struct skb_shared_info),
+ gfp_mask, node);
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ } else { /* Allocation may from page constructor of device */
+ user = netdev_alloc_user_page(dev, skb, size);
+ if (!user) {
+ data = kmalloc_node_track_caller(
+ size + sizeof(struct skb_shared_info),
+ gfp_mask, node);
+ printk(KERN_INFO "can't alloc user buffer.\n");
+ } else {
+ data = user->start;
+ size = SKB_DATA_ALIGN(user->size);
+ }
+ }
+#endif
if (!data)
goto nodata;
@@ -208,6 +228,11 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
skb->mac_header = ~0U;
#endif
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ if (user)
+ memcpy(user->ushinfo, skb_shinfo(skb),
+ sizeof(struct skb_shared_info));
+#endif
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
atomic_set(&shinfo->dataref, 1);
@@ -231,6 +256,10 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
child->fclone = SKB_FCLONE_UNAVAILABLE;
}
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ shinfo->destructor_arg = user;
+#endif
+
out:
return skb;
nodata:
@@ -259,7 +288,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
struct sk_buff *skb;
- skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
+ skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node, dev);
if (likely(skb)) {
skb_reserve(skb, NET_SKB_PAD);
skb->dev = dev;
@@ -278,6 +307,24 @@ struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
}
EXPORT_SYMBOL(__netdev_alloc_page);
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+struct skb_user_page *netdev_alloc_user_pages(struct net_device *dev,
+ struct sk_buff *skb, int npages)
+{
+ struct mpassthru_port *ctor;
+ struct skb_user_page *user = NULL;
+
+ ctor = rcu_dereference(dev->mp_port);
+ if (!ctor)
+ goto out;
+ BUG_ON(npages > ctor->npages);
+ user = ctor->ctor(ctor, skb, npages);
+out:
+ return user;
+}
+EXPORT_SYMBOL(netdev_alloc_user_pages);
+#endif
+
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
int size)
{
@@ -338,6 +385,10 @@ static void skb_clone_fraglist(struct sk_buff *skb)
static void skb_release_data(struct sk_buff *skb)
{
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ struct skb_user_page *user = skb_shinfo(skb)->destructor_arg;
+#endif
+
if (!skb->cloned ||
!atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
&skb_shinfo(skb)->dataref)) {
@@ -349,7 +400,10 @@ static void skb_release_data(struct sk_buff *skb)
if (skb_has_frags(skb))
skb_drop_fraglist(skb);
-
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ if (skb->dev && skb->dev->mp_port && user && user->dtor)
+ user->dtor(user);
+#endif
kfree(skb->head);
}
}
@@ -503,8 +557,14 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size)
if (skb_shared(skb) || skb_cloned(skb))
return 0;
- skb_release_head_state(skb);
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ if (skb->dev && skb->dev->mp_port)
+ return 0;
+#endif
+
shinfo = skb_shinfo(skb);
+
+ skb_release_head_state(skb);
atomic_set(&shinfo->dataref, 1);
shinfo->nr_frags = 0;
shinfo->gso_size = 0;