Message ID | 1306610588.5180.87.camel@localhost.localdomain (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hello Dave, To support skb zero-copy, a pointer is needed to add to skb share info. Do you agree with this approach? If not, do you have any other suggestions? Thanks Shirley On Sat, 2011-05-28 at 12:23 -0700, Shirley Ma wrote: > From: > Shirley Ma <mashirle@us.ibm.com> > To: > David Miller <davem@davemloft.net>, > mst@redhat.com, Eric Dumazet > <eric.dumazet@gmail.com>, Avi > Kivity <avi@redhat.com>, Arnd > Bergmann <arnd@arndb.de> > Cc: > netdev@vger.kernel.org, > kvm@vger.kernel.org, > linux-kernel@vger.kernel.org > Subject: > [PATCH V7 2/4 net-next] skbuff: Add > userspace zero-copy buffers in skb > Date: > 05/28/2011 12:23:08 PM > > > This patch adds userspace buffers support in skb shared info. A new > struct skb_ubuf_info is needed to maintain the userspace buffers > argument and index, a callback is used to notify userspace to release > the buffers once lower device has done DMA (Last reference to that skb > has gone). > > If there is any userspace apps to reference these userspace buffers, > then these userspaces buffers will be copied into kernel. This way we > can prevent userspace apps to hold these userspace buffers too long. > > One userspace buffer info pointer is added in skb_shared_info. Is that > safe to use destructor_arg? From the comments, this destructor_arg is > used for destructor, destructor calls doesn't wait for last reference > to > that skb is gone. 
> > Signed-off-by: Shirley Ma <xma@us.ibm.com> > --- > include/linux/skbuff.h | 24 ++++++++++++++++ > net/core/skbuff.c | 73 > ++++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 97 insertions(+), 0 deletions(-) > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index e8b78ce..37a2cb4 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -189,6 +189,17 @@ enum { > SKBTX_DRV_NEEDS_SK_REF = 1 << 3, > }; > > +/* > + * The callback notifies userspace to release buffers when skb DMA is > done in > + * lower device, the skb last reference should be 0 when calling > this. > + * The desc is used to track userspace buffer index. > + */ > +struct ubuf_info { > + void (*callback)(void *); > + void *arg; > + unsigned long desc; > +}; > + > /* This data is invariant across clones and lives at > * the end of the header data, ie. at skb->end. > */ > @@ -203,6 +214,9 @@ struct skb_shared_info { > struct sk_buff *frag_list; > struct skb_shared_hwtstamps hwtstamps; > > + /* DMA mapping from/to userspace buffers info */ > + void *ubuf_arg; > + > /* > * Warning : all fields before dataref are cleared in > __alloc_skb() > */ > @@ -2261,5 +2275,15 @@ static inline void > skb_checksum_none_assert(struct sk_buff *skb) > } > > bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); > + > +/* > + * skb *uarg - is the buffer from userspace > + * @skb: buffer to check > + */ > +static inline int skb_ubuf(const struct sk_buff *skb) > +{ > + return skb_shinfo(skb)->ubuf_arg != NULL; > +} > + > #endif /* __KERNEL__ */ > #endif /* _LINUX_SKBUFF_H */ > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index 46cbd28..115c5ca 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -329,6 +329,19 @@ static void skb_release_data(struct sk_buff *skb) > put_page(skb_shinfo(skb)->frags[i].page); > } > > + /* > + * If skb buf is from userspace, we need to notify the > caller > + * the lower device DMA has done; > + */ > + if 
(skb_ubuf(skb)) { > + struct ubuf_info *uarg; > + > + uarg = (struct ubuf_info > *)skb_shinfo(skb)->ubuf_arg; > + > + if (uarg->callback) > + uarg->callback(uarg); > + } > + > if (skb_has_frag_list(skb)) > skb_drop_fraglist(skb); > > @@ -481,6 +494,9 @@ bool skb_recycle_check(struct sk_buff *skb, int > skb_size) > if (irqs_disabled()) > return false; > > + if (skb_ubuf(skb)) > + return false; > + > if (skb_is_nonlinear(skb) || skb->fclone != > SKB_FCLONE_UNAVAILABLE) > return false; > > @@ -573,6 +589,7 @@ static struct sk_buff *__skb_clone(struct sk_buff > *n, struct sk_buff *skb) > atomic_set(&n->users, 1); > > atomic_inc(&(skb_shinfo(skb)->dataref)); > + skb_shinfo(skb)->ubuf_arg = NULL; > skb->cloned = 1; > > return n; > @@ -596,6 +613,51 @@ struct sk_buff *skb_morph(struct sk_buff *dst, > struct sk_buff *src) > } > EXPORT_SYMBOL_GPL(skb_morph); > > +/* skb frags copy userspace buffers to kernel */ > +static int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) > +{ > + int i; > + int num_frags = skb_shinfo(skb)->nr_frags; > + struct page *page, *head = NULL; > + struct ubuf_info *uarg = skb_shinfo(skb)->ubuf_arg; > + > + for (i = 0; i < num_frags; i++) { > + u8 *vaddr; > + skb_frag_t *f = &skb_shinfo(skb)->frags[i]; > + > + page = alloc_page(GFP_ATOMIC); > + if (!page) { > + while (head) { > + put_page(head); > + head = (struct page *)head->private; > + } > + return -ENOMEM; > + } > + vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); > + memcpy(page_address(page), > + vaddr + f->page_offset, f->size); > + kunmap_skb_frag(vaddr); > + page->private = (unsigned long)head; > + head = page; > + } > + > + /* skb frags release userspace buffers */ > + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) > + put_page(skb_shinfo(skb)->frags[i].page); > + > + uarg->callback(uarg); > + skb_shinfo(skb)->ubuf_arg = NULL; > + > + /* skb frags point to kernel buffers */ > + for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) { > + skb_shinfo(skb)->frags[i - 1].page_offset = 0; > + 
skb_shinfo(skb)->frags[i - 1].page = head; > + head = (struct page *)head->private; > + } > + return 0; > +} > + > + > /** > * skb_clone - duplicate an sk_buff > * @skb: buffer to clone > @@ -614,6 +676,11 @@ struct sk_buff *skb_clone(struct sk_buff *skb, > gfp_t gfp_mask) > { > struct sk_buff *n; > > + if (skb_ubuf(skb)) { > + if (skb_copy_ubufs(skb, gfp_mask)) > + return NULL; > + } > + > n = skb + 1; > if (skb->fclone == SKB_FCLONE_ORIG && > n->fclone == SKB_FCLONE_UNAVAILABLE) { > @@ -731,6 +798,12 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, > gfp_t gfp_mask) > if (skb_shinfo(skb)->nr_frags) { > int i; > > + if (skb_ubuf(skb)) { > + if (skb_copy_ubufs(skb, gfp_mask)) { > + kfree(n); > + goto out; > + } > + } > for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { > skb_shinfo(n)->frags[i] = > skb_shinfo(skb)->frags[i]; > get_page(skb_shinfo(n)->frags[i].page); -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Shirley Ma <mashirle@us.ibm.com> Date: Mon, 27 Jun 2011 08:45:10 -0700 > To support skb zero-copy, a pointer is needed to add to skb share info. > Do you agree with this approach? If not, do you have any other > suggestions? I really can't form an opinion unless I am shown the complete implementation, what this give us in return, what the impact is, etc. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2011-06-27 at 15:54 -0700, David Miller wrote: > From: Shirley Ma <mashirle@us.ibm.com> > Date: Mon, 27 Jun 2011 08:45:10 -0700 > > > To support skb zero-copy, a pointer is needed to add to skb share > info. > > Do you agree with this approach? If not, do you have any other > > suggestions? > > I really can't form an opinion unless I am shown the complete > implementation, what this give us in return, what the impact is, etc. zero-copy skb buffers can save significant CPUs. Right now, I only implements macvtap/vhost zero-copy between KVM guest and host. The performance is as follow: Single TCP_STREAM 120 secs test results 2.6.39-rc3 over ixgbe 10Gb NIC results: Message BW(Gb/s)qemu-kvm (NumCPU)vhost-net(NumCPU) PerfTop irq/s 4K 7408.57 92.1% 22.6% 1229 4K(Orig)4913.17 118.1% 84.1% 2086 8K 9129.90 89.3% 23.3% 1141 8K(Orig)7094.55 115.9% 84.7% 2157 16K 9178.81 89.1% 23.3% 1139 16K(Orig)8927.1 118.7% 83.4% 2262 64K 9171.43 88.4% 24.9% 1253 64K(Orig)9085.85 115.9% 82.4% 2229 You can see the overall CPU saved 50% w/i zero-copy. The impact is every skb allocation consumed one more pointer in skb share info, and a pointer check in skb release when last reference is gone. For skb clone, skb expand private head and skb copy, it still keeps copy the buffers to kernel, so we can avoid user application, like tcpdump to hold the user-space buffers too long. Thanks Shirley -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/28/2011 09:51 AM, Shirley Ma wrote: > On Mon, 2011-06-27 at 15:54 -0700, David Miller wrote: >> From: Shirley Ma<mashirle@us.ibm.com> >> Date: Mon, 27 Jun 2011 08:45:10 -0700 >> >>> To support skb zero-copy, a pointer is needed to add to skb share >> info. >>> Do you agree with this approach? If not, do you have any other >>> suggestions? >> >> I really can't form an opinion unless I am shown the complete >> implementation, what this give us in return, what the impact is, etc. > > zero-copy skb buffers can save significant CPUs. Right now, I only > implements macvtap/vhost zero-copy between KVM guest and host. The > performance is as follow: > > Single TCP_STREAM 120 secs test results 2.6.39-rc3 over ixgbe 10Gb NIC > results: > > Message BW(Gb/s)qemu-kvm (NumCPU)vhost-net(NumCPU) PerfTop irq/s > 4K 7408.57 92.1% 22.6% 1229 > 4K(Orig)4913.17 118.1% 84.1% 2086 > > 8K 9129.90 89.3% 23.3% 1141 > 8K(Orig)7094.55 115.9% 84.7% 2157 > > 16K 9178.81 89.1% 23.3% 1139 > 16K(Orig)8927.1 118.7% 83.4% 2262 > > 64K 9171.43 88.4% 24.9% 1253 > 64K(Orig)9085.85 115.9% 82.4% 2229 > > You can see the overall CPU saved 50% w/i zero-copy. While this isn't the copy between netperf and the stack, at some point you may want to enable netperf's "DIRTY" mode (./configure --enable-dirty) to cause it to start either dirtying buffers before send, or reading from buffers after receive. I cannot guarantee that there hasn't been bitrot in that area of netperf though :) Particularly in a TCP_MAERTS test. The "DIRTY" mode code will not do anything in a TCP_SENDFILE test. A simple sanity check of the effect of the changes on a TCP_RR test would probably be goodness as well. happy benchmarking, rick jones one of these days I'll have to find a good way to get accurate overall CPU utilization from within a guest and teach netperf about it. > > The impact is every skb allocation consumed one more pointer in skb > share info, and a pointer check in skb release when last reference is > gone. 
> > For skb clone, skb expand private head and skb copy, it still keeps copy > the buffers to kernel, so we can avoid user application, like tcpdump to > hold the user-space buffers too long. > > Thanks > Shirley > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Shirley Ma <mashirle@us.ibm.com> Date: Tue, 28 Jun 2011 09:51:32 -0700 > On Mon, 2011-06-27 at 15:54 -0700, David Miller wrote: >> From: Shirley Ma <mashirle@us.ibm.com> >> Date: Mon, 27 Jun 2011 08:45:10 -0700 >> >> > To support skb zero-copy, a pointer is needed to add to skb share >> info. >> > Do you agree with this approach? If not, do you have any other >> > suggestions? >> >> I really can't form an opinion unless I am shown the complete >> implementation, what this give us in return, what the impact is, etc. .. > You can see the overall CPU saved 50% w/i zero-copy. > > The impact is every skb allocation consumed one more pointer in skb > share info, and a pointer check in skb release when last reference is > gone. > > For skb clone, skb expand private head and skb copy, it still keeps copy > the buffers to kernel, so we can avoid user application, like tcpdump to > hold the user-space buffers too long. Ok, now show me the "complete implementation". I'm as interested in the code as I am in the numbers, that's why I asked for both. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
I have submitted this patchset from Version 1 to Version 7 already in the past few months. Here is the link to the patchset: http://lists.openwall.net/netdev/2011/05/28/ I am working on V8 now. Thanks Shirley On Tue, 2011-06-28 at 16:35 -0700, David Miller wrote: > From: Shirley Ma <mashirle@us.ibm.com> > Date: Tue, 28 Jun 2011 09:51:32 -0700 > > > On Mon, 2011-06-27 at 15:54 -0700, David Miller wrote: > >> From: Shirley Ma <mashirle@us.ibm.com> > >> Date: Mon, 27 Jun 2011 08:45:10 -0700 > >> > >> > To support skb zero-copy, a pointer is needed to add to skb share > >> info. > >> > Do you agree with this approach? If not, do you have any other > >> > suggestions? > >> > >> I really can't form an opinion unless I am shown the complete > >> implementation, what this give us in return, what the impact is, > etc. > .. > > You can see the overall CPU saved 50% w/i zero-copy. > > > > The impact is every skb allocation consumed one more pointer in skb > > share info, and a pointer check in skb release when last reference > is > > gone. > > > > For skb clone, skb expand private head and skb copy, it still keeps > copy > > the buffers to kernel, so we can avoid user application, like > tcpdump to > > hold the user-space buffers too long. > > Ok, now show me the "complete implementation". I'm as interested in > the code as I am in the numbers, that's why I asked for both. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e8b78ce..37a2cb4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -189,6 +189,17 @@ enum { SKBTX_DRV_NEEDS_SK_REF = 1 << 3, }; +/* + * The callback notifies userspace to release buffers when skb DMA is done in + * lower device, the skb last reference should be 0 when calling this. + * The desc is used to track userspace buffer index. + */ +struct ubuf_info { + void (*callback)(void *); + void *arg; + unsigned long desc; +}; + /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ @@ -203,6 +214,9 @@ struct skb_shared_info { struct sk_buff *frag_list; struct skb_shared_hwtstamps hwtstamps; + /* DMA mapping from/to userspace buffers info */ + void *ubuf_arg; + /* * Warning : all fields before dataref are cleared in __alloc_skb() */ @@ -2261,5 +2275,15 @@ static inline void skb_checksum_none_assert(struct sk_buff *skb) } bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); + +/* + * skb *uarg - is the buffer from userspace + * @skb: buffer to check + */ +static inline int skb_ubuf(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->ubuf_arg != NULL; +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 46cbd28..115c5ca 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -329,6 +329,19 @@ static void skb_release_data(struct sk_buff *skb) put_page(skb_shinfo(skb)->frags[i].page); } + /* + * If skb buf is from userspace, we need to notify the caller + * the lower device DMA has done; + */ + if (skb_ubuf(skb)) { + struct ubuf_info *uarg; + + uarg = (struct ubuf_info *)skb_shinfo(skb)->ubuf_arg; + + if (uarg->callback) + uarg->callback(uarg); + } + if (skb_has_frag_list(skb)) skb_drop_fraglist(skb); @@ -481,6 +494,9 @@ bool skb_recycle_check(struct sk_buff *skb, int skb_size) if (irqs_disabled()) return false; + if (skb_ubuf(skb)) + return false; + if 
(skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE) return false; @@ -573,6 +589,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) atomic_set(&n->users, 1); atomic_inc(&(skb_shinfo(skb)->dataref)); + skb_shinfo(skb)->ubuf_arg = NULL; skb->cloned = 1; return n; @@ -596,6 +613,51 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) } EXPORT_SYMBOL_GPL(skb_morph); +/* skb frags copy userspace buffers to kernel */ +static int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) +{ + int i; + int num_frags = skb_shinfo(skb)->nr_frags; + struct page *page, *head = NULL; + struct ubuf_info *uarg = skb_shinfo(skb)->ubuf_arg; + + for (i = 0; i < num_frags; i++) { + u8 *vaddr; + skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + + page = alloc_page(GFP_ATOMIC); + if (!page) { + while (head) { + put_page(head); + head = (struct page *)head->private; + } + return -ENOMEM; + } + vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); + memcpy(page_address(page), + vaddr + f->page_offset, f->size); + kunmap_skb_frag(vaddr); + page->private = (unsigned long)head; + head = page; + } + + /* skb frags release userspace buffers */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + put_page(skb_shinfo(skb)->frags[i].page); + + uarg->callback(uarg); + skb_shinfo(skb)->ubuf_arg = NULL; + + /* skb frags point to kernel buffers */ + for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) { + skb_shinfo(skb)->frags[i - 1].page_offset = 0; + skb_shinfo(skb)->frags[i - 1].page = head; + head = (struct page *)head->private; + } + return 0; +} + + /** * skb_clone - duplicate an sk_buff * @skb: buffer to clone @@ -614,6 +676,11 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *n; + if (skb_ubuf(skb)) { + if (skb_copy_ubufs(skb, gfp_mask)) + return NULL; + } + n = skb + 1; if (skb->fclone == SKB_FCLONE_ORIG && n->fclone == SKB_FCLONE_UNAVAILABLE) { @@ -731,6 +798,12 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, 
gfp_t gfp_mask) if (skb_shinfo(skb)->nr_frags) { int i; + if (skb_ubuf(skb)) { + if (skb_copy_ubufs(skb, gfp_mask)) { + kfree(n); + goto out; + } + } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; get_page(skb_shinfo(n)->frags[i].page);
This patch adds userspace buffer support in skb shared info. A new struct skb_ubuf_info is needed to maintain the userspace buffer argument and index; a callback is used to notify userspace to release the buffers once the lower device has finished DMA (the last reference to that skb has gone). If any userspace app references these userspace buffers, then these userspace buffers will be copied into the kernel. This way we can prevent userspace apps from holding these userspace buffers too long. One userspace buffer info pointer is added in skb_shared_info. Is it safe to use destructor_arg? From the comments, this destructor_arg is used for the destructor, and destructor calls don't wait for the last reference to that skb to be gone. Signed-off-by: Shirley Ma <xma@us.ibm.com> --- include/linux/skbuff.h | 24 ++++++++++++++++ net/core/skbuff.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 0 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html