Message ID | 1306438593.5180.33.camel@localhost.localdomain (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Le jeudi 26 mai 2011 à 12:36 -0700, Shirley Ma a écrit : > This patch adds userspace buffers support in skb shared info. A new > struct skb_ubuf_info is needed to maintain the userspace buffers > argument and index, a callback is used to notify userspace to release > the buffers once lower device has done DMA (Last reference to that skb > has gone). > > If there is any userspace apps to reference these userspace buffers, > then these userspaces buffers will be copied into kernel. This way we > can prevent userspace apps to hold these userspace buffers too long. > > Signed-off-by: Shirley Ma <xma@us.ibm.com> > --- > > include/linux/skbuff.h | 26 +++++++++++++++ > net/core/skbuff.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++- > 2 files changed, 104 insertions(+), 2 deletions(-) > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index d0ae90a..025de5c 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -189,6 +189,18 @@ enum { > SKBTX_DRV_NEEDS_SK_REF = 1 << 3, > }; > > +/* > + * The callback notifies userspace to release buffers when skb DMA is done in > + * lower device, the skb last reference should be 0 when calling this. > + * The desc is used to track userspace buffer index. > + */ > +struct skb_ubuf_info { > + /* support buffers allocation from userspace */ > + void (*callback)(struct sk_buff *); > + void *arg; > + size_t desc; > +}; Thats 24 bytes on each skb... desc for example is not used in this patch (yes, its used later in patch 4/4) But still... could you instead use one pointer only in skb ? > + > /* This data is invariant across clones and lives at > * the end of the header data, ie. at skb->end. > */ > @@ -211,6 +223,10 @@ struct skb_shared_info { > /* Intermediate layers must ensure that destructor_arg > * remains valid until skb destructor */ > void * destructor_arg; > + > + /* DMA mapping from/to userspace buffers */ > + struct skb_ubuf_info ubuf; > + > /* must be last field, see pskb_expand_head() */ > skb_frag_t frags[MAX_SKB_FRAGS]; > }; > @@ -2261,5 +2277,15 @@ static inline void skb_checksum_none_assert(struct sk_buff *skb) > } > > bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); > + > +/* > + * skb_ubuf - is the buffer from userspace > + * @skb: buffer to check > + */ > +static inline int skb_ubuf(const struct sk_buff *skb) > +{ > + return (skb_shinfo(skb)->ubuf.callback != NULL); return skb_shinfo(skb)->ubuf.callback != NULL; > +} > + > #endif /* __KERNEL__ */ > #endif /* _LINUX_SKBUFF_H */ > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index 7ebeed0..890447c 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -210,6 +210,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, > shinfo = skb_shinfo(skb); > memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); > atomic_set(&shinfo->dataref, 1); > + shinfo->ubuf.callback = NULL; > + shinfo->ubuf.arg = NULL; if you put ubuf ptr before dataref, no need to add this (the memset() clear all shared_info up to dataref) > kmemcheck_annotate_variable(shinfo->destructor_arg); > > if (fclone) { > @@ -328,6 +330,14 @@ static void skb_release_data(struct sk_buff *skb) > put_page(skb_shinfo(skb)->frags[i].page); > } > > + /* > + * if skb buf is from userspace, we need to notify the caller > + * the lower device DMA has done; > + */ > + if (skb_ubuf(skb)) { > + skb_shinfo(skb)->ubuf.callback(skb); > + skb_shinfo(skb)->ubuf.callback = NULL; > + } > if (skb_has_frag_list(skb)) > skb_drop_fraglist(skb); > > @@ -480,6 +490,9 @@ bool skb_recycle_check(struct sk_buff *skb, int skb_size) > if (irqs_disabled()) > return false; > > + if (skb_ubuf(skb)) > + return false; > + > if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE) > return false; > > @@ -572,6 +585,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) > atomic_set(&n->users, 1); > > atomic_inc(&(skb_shinfo(skb)->dataref)); > + skb_shinfo(skb)->ubuf.callback = NULL; > skb->cloned = 1; > > return n; > @@ -595,6 +609,48 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) > } > EXPORT_SYMBOL_GPL(skb_morph); > > +/* skb frags copy userspace buffers to kernel */ > +static int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) > +{ > + int i; > + int num_frags = skb_shinfo(skb)->nr_frags; > + struct page *page, *head = NULL; > + > + for (i = 0; i < num_frags; i++) { > + u8 *vaddr; > + skb_frag_t *f = &skb_shinfo(skb)->frags[i]; > + > + page = alloc_page(GFP_ATOMIC); > + if (!page) { > + while (head) { > + put_page(head); > + head = (struct page *)head->private; > + } > + return -ENOMEM; > + } > + vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); > + memcpy(page_address(page), vaddr + f->page_offset, f->size); > + kunmap_skb_frag(vaddr); > + page->private = (unsigned long)head; > + head = page; > + } > + > + /* skb frags release userspace buffers */ > + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) > + put_page(skb_shinfo(skb)->frags[i].page); > + skb_shinfo(skb)->ubuf.callback(skb); > + skb_shinfo(skb)->ubuf.callback = NULL; > + > + /* skb frags point to kernel buffers */ > + for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) { > + skb_shinfo(skb)->frags[i - 1].page_offset = 0; > + skb_shinfo(skb)->frags[i - 1].page = head; > + head = (struct page *)head->private; > + } > + return 0; > +} > + > + > /** > * skb_clone - duplicate an sk_buff > * @skb: buffer to clone > @@ -613,6 +669,11 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) > { > struct sk_buff *n; > > + if (skb_ubuf(skb)) { > + if (skb_copy_ubufs(skb, gfp_mask)) > + return NULL; > + } > + > n = skb + 1; > if (skb->fclone == SKB_FCLONE_ORIG && > n->fclone == SKB_FCLONE_UNAVAILABLE) { > @@ -730,6 +791,12 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) > if (skb_shinfo(skb)->nr_frags) { > int i; > > + if (skb_ubuf(skb)) { > + if (skb_copy_ubufs(skb, gfp_mask)) { > + kfree(n); > + goto out; > + } > + } > for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { > skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; > get_page(skb_shinfo(n)->frags[i].page); > @@ -743,6 +810,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) > } > > copy_skb_header(n, skb); > + please dont add new lines like this in your patch > out: > return n; > } > @@ -787,7 +855,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, > fastpath = true; > else { > int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1; > - and dont delete this one as well. > fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta; > } > > @@ -818,14 +885,19 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, > if (fastpath) { > kfree(skb->head); > } else { > + /* copy this zero copy skb frags */ > + if (skb_ubuf(skb)) { > + if (skb_copy_ubufs(skb, gfp_mask)) > + goto nofrags; > + } > for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) > get_page(skb_shinfo(skb)->frags[i].page); > - ditto > if (skb_has_frag_list(skb)) > skb_clone_fraglist(skb); > > skb_release_data(skb); > } > + ditto > off = (data + nhead) - skb->head; > > skb->head = data; > @@ -852,6 +924,8 @@ adjust_others: > atomic_set(&skb_shinfo(skb)->dataref, 1); > return 0; > > +nofrags: > + kfree(data); > nodata: > return -ENOMEM; > } > @@ -1353,6 +1427,8 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) > } > start = end; > } > + skb_shinfo(skb)->ubuf.callback = NULL; > + > if (!len) > return 0; > > > Thanks -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2011-05-26 at 22:04 +0200, Eric Dumazet wrote: > Le jeudi 26 mai 2011 à 12:36 -0700, Shirley Ma a écrit : > > This patch adds userspace buffers support in skb shared info. A new > > struct skb_ubuf_info is needed to maintain the userspace buffers > > argument and index, a callback is used to notify userspace to > release > > the buffers once lower device has done DMA (Last reference to that > skb > > has gone). > > > > If there is any userspace apps to reference these userspace buffers, > > then these userspaces buffers will be copied into kernel. This way > we > > can prevent userspace apps to hold these userspace buffers too long. > > > > Signed-off-by: Shirley Ma <xma@us.ibm.com> > > --- > > > > include/linux/skbuff.h | 26 +++++++++++++++ > > net/core/skbuff.c | 80 > ++++++++++++++++++++++++++++++++++++++++++++++- > > 2 files changed, 104 insertions(+), 2 deletions(-) > > > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > > index d0ae90a..025de5c 100644 > > --- a/include/linux/skbuff.h > > +++ b/include/linux/skbuff.h > > @@ -189,6 +189,18 @@ enum { > > SKBTX_DRV_NEEDS_SK_REF = 1 << 3, > > }; > > > > +/* > > + * The callback notifies userspace to release buffers when skb DMA > is done in > > + * lower device, the skb last reference should be 0 when calling > this. > > + * The desc is used to track userspace buffer index. > > + */ > > +struct skb_ubuf_info { > > + /* support buffers allocation from userspace */ > > + void (*callback)(struct sk_buff *); > > + void *arg; > > + size_t desc; > > +}; > > Thats 24 bytes on each skb... desc for example is not used in this > patch (yes, its used later in patch 4/4) > > But still... could you instead use one pointer only in skb ? I could reduce callback pointer by moving it to *arg, but not desc, this indicates that which buffer DMA hasn't done yet in *arg. > > > + > > /* This data is invariant across clones and lives at > > * the end of the header data, ie. at skb->end. > > */ > > @@ -211,6 +223,10 @@ struct skb_shared_info { > > /* Intermediate layers must ensure that destructor_arg > > * remains valid until skb destructor */ > > void * destructor_arg; > > + > > + /* DMA mapping from/to userspace buffers */ > > + struct skb_ubuf_info ubuf; > > + > > /* must be last field, see pskb_expand_head() */ > > skb_frag_t frags[MAX_SKB_FRAGS]; > > }; > > @@ -2261,5 +2277,15 @@ static inline void > skb_checksum_none_assert(struct sk_buff *skb) > > } > > > > bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); > > + > > +/* > > + * skb_ubuf - is the buffer from userspace > > + * @skb: buffer to check > > + */ > > +static inline int skb_ubuf(const struct sk_buff *skb) > > +{ > > + return (skb_shinfo(skb)->ubuf.callback != NULL); > > return skb_shinfo(skb)->ubuf.callback != NULL; > > > +} > > + > > #endif /* __KERNEL__ */ > > #endif /* _LINUX_SKBUFF_H */ > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > > index 7ebeed0..890447c 100644 > > --- a/net/core/skbuff.c > > +++ b/net/core/skbuff.c > > @@ -210,6 +210,8 @@ struct sk_buff *__alloc_skb(unsigned int size, > gfp_t gfp_mask, > > shinfo = skb_shinfo(skb); > > memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); > > atomic_set(&shinfo->dataref, 1); > > + shinfo->ubuf.callback = NULL; > > + shinfo->ubuf.arg = NULL; > > if you put ubuf ptr before dataref, no need to add this (the memset() > clear all shared_info up to dataref) Ok, will remove it. > > kmemcheck_annotate_variable(shinfo->destructor_arg); > > > > if (fclone) { > > @@ -328,6 +330,14 @@ static void skb_release_data(struct sk_buff > *skb) > > put_page(skb_shinfo(skb)->frags[i].page); > > } > > > > + /* > > + * if skb buf is from userspace, we need to notify the > caller > > + * the lower device DMA has done; > > + */ > > + if (skb_ubuf(skb)) { > > + skb_shinfo(skb)->ubuf.callback(skb); > > + skb_shinfo(skb)->ubuf.callback = NULL; > > + } > > if (skb_has_frag_list(skb)) > > skb_drop_fraglist(skb); > > > > @@ -480,6 +490,9 @@ bool skb_recycle_check(struct sk_buff *skb, int > skb_size) > > if (irqs_disabled()) > > return false; > > > > + if (skb_ubuf(skb)) > > + return false; > > + > > if (skb_is_nonlinear(skb) || skb->fclone != > SKB_FCLONE_UNAVAILABLE) > > return false; > > > > @@ -572,6 +585,7 @@ static struct sk_buff *__skb_clone(struct > sk_buff *n, struct sk_buff *skb) > > atomic_set(&n->users, 1); > > > > atomic_inc(&(skb_shinfo(skb)->dataref)); > > + skb_shinfo(skb)->ubuf.callback = NULL; > > skb->cloned = 1; > > > > return n; > > @@ -595,6 +609,48 @@ struct sk_buff *skb_morph(struct sk_buff *dst, > struct sk_buff *src) > > } > > EXPORT_SYMBOL_GPL(skb_morph); > > > > +/* skb frags copy userspace buffers to kernel */ > > +static int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) > > +{ > > + int i; > > + int num_frags = skb_shinfo(skb)->nr_frags; > > + struct page *page, *head = NULL; > > + > > + for (i = 0; i < num_frags; i++) { > > + u8 *vaddr; > > + skb_frag_t *f = &skb_shinfo(skb)->frags[i]; > > + > > + page = alloc_page(GFP_ATOMIC); > > + if (!page) { > > + while (head) { > > + put_page(head); > > + head = (struct page *)head->private; > > + } > > + return -ENOMEM; > > + } > > + vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); > > + memcpy(page_address(page), vaddr + f->page_offset, > f->size); > > + kunmap_skb_frag(vaddr); > > + page->private = (unsigned long)head; > > + head = page; > > + } > > + > > + /* skb frags release userspace buffers */ > > + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) > > + put_page(skb_shinfo(skb)->frags[i].page); > > + skb_shinfo(skb)->ubuf.callback(skb); > > + skb_shinfo(skb)->ubuf.callback = NULL; > > + > > + /* skb frags point to kernel buffers */ > > + for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) { > > + skb_shinfo(skb)->frags[i - 1].page_offset = 0; > > + skb_shinfo(skb)->frags[i - 1].page = head; > > + head = (struct page *)head->private; > > + } > > + return 0; > > +} > > + > > + > > /** > > * skb_clone - duplicate an sk_buff > > * @skb: buffer to clone > > @@ -613,6 +669,11 @@ struct sk_buff *skb_clone(struct sk_buff *skb, > gfp_t gfp_mask) > > { > > struct sk_buff *n; > > > > + if (skb_ubuf(skb)) { > > + if (skb_copy_ubufs(skb, gfp_mask)) > > + return NULL; > > + } > > + > > n = skb + 1; > > if (skb->fclone == SKB_FCLONE_ORIG && > > n->fclone == SKB_FCLONE_UNAVAILABLE) { > > @@ -730,6 +791,12 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, > gfp_t gfp_mask) > > if (skb_shinfo(skb)->nr_frags) { > > int i; > > > > + if (skb_ubuf(skb)) { > > + if (skb_copy_ubufs(skb, gfp_mask)) { > > + kfree(n); > > + goto out; > > + } > > + } > > for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { > > skb_shinfo(n)->frags[i] = > skb_shinfo(skb)->frags[i]; > > get_page(skb_shinfo(n)->frags[i].page); > > @@ -743,6 +810,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, > gfp_t gfp_mask) > > } > > > > copy_skb_header(n, skb); > > + > > please dont add new lines like this in your patch checkpatch doesn't check this kind of line, will remove it. > > out: > > return n; > > } > > @@ -787,7 +855,6 @@ int pskb_expand_head(struct sk_buff *skb, int > nhead, int ntail, > > fastpath = true; > > else { > > int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + > 1 : 1; > > - > > and dont delete this one as well. > > > fastpath = atomic_read(&skb_shinfo(skb)->dataref) == > delta; > > } > > > > @@ -818,14 +885,19 @@ int pskb_expand_head(struct sk_buff *skb, int > nhead, int ntail, > > if (fastpath) { > > kfree(skb->head); > > } else { > > + /* copy this zero copy skb frags */ > > + if (skb_ubuf(skb)) { > > + if (skb_copy_ubufs(skb, gfp_mask)) > > + goto nofrags; > > + } > > for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) > > get_page(skb_shinfo(skb)->frags[i].page); > > - > > ditto > > > if (skb_has_frag_list(skb)) > > skb_clone_fraglist(skb); > > > > skb_release_data(skb); > > } > > + > > ditto > > > off = (data + nhead) - skb->head; > > > > skb->head = data; > > @@ -852,6 +924,8 @@ adjust_others: > > atomic_set(&skb_shinfo(skb)->dataref, 1); > > return 0; > > > > +nofrags: > > + kfree(data); > > nodata: > > return -ENOMEM; > > } > > @@ -1353,6 +1427,8 @@ int skb_copy_bits(const struct sk_buff *skb, > int offset, void *to, int len) > > } > > start = end; > > } > > + skb_shinfo(skb)->ubuf.callback = NULL; > > + > > if (!len) > > return 0; > > > > > > > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le jeudi 26 mai 2011 à 13:24 -0700, Shirley Ma a écrit : > I could reduce callback pointer by moving it to *arg, but not desc, this > indicates that which buffer DMA hasn't done yet in *arg. I guess you dont need to use skb itself to hold all your states ? I understand its convenient for you, but I believe its worth the pain to use only one pointer to a (small) object where you put all your stuff. Some machines alloc/free millions of skbs per second. If/when most skb uses are for userspace zero-copy buffers we can embbed your small object in skb itself ;) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2011-05-26 at 22:55 +0200, Eric Dumazet wrote: > Le jeudi 26 mai 2011 à 13:24 -0700, Shirley Ma a écrit : > > > I could reduce callback pointer by moving it to *arg, but not desc, > this > > indicates that which buffer DMA hasn't done yet in *arg. > > > I guess you dont need to use skb itself to hold all your states ? > > I understand its convenient for you, but I believe its worth the pain > to > use only one pointer to a (small) object where you put all your stuff. > > Some machines alloc/free millions of skbs per second. > > If/when most skb uses are for userspace zero-copy buffers we can > embbed > your small object in skb itself ;) You are right, w/o this desc, there will be lots of alloc/dealloc in the caller side. To have better performance, maybe I should have changed most skbs to use zero-copy buffers. :) Ok, for now let me try to move it to caller, just leave one pointer *uarg here. Thanks Shirley -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d0ae90a..025de5c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -189,6 +189,18 @@ enum { SKBTX_DRV_NEEDS_SK_REF = 1 << 3, }; +/* + * The callback notifies userspace to release buffers when skb DMA is done in + * lower device, the skb last reference should be 0 when calling this. + * The desc is used to track userspace buffer index. + */ +struct skb_ubuf_info { + /* support buffers allocation from userspace */ + void (*callback)(struct sk_buff *); + void *arg; + size_t desc; +}; + /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ @@ -211,6 +223,10 @@ struct skb_shared_info { /* Intermediate layers must ensure that destructor_arg * remains valid until skb destructor */ void * destructor_arg; + + /* DMA mapping from/to userspace buffers */ + struct skb_ubuf_info ubuf; + /* must be last field, see pskb_expand_head() */ skb_frag_t frags[MAX_SKB_FRAGS]; }; @@ -2261,5 +2277,15 @@ static inline void skb_checksum_none_assert(struct sk_buff *skb) } bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); + +/* + * skb_ubuf - is the buffer from userspace + * @skb: buffer to check + */ +static inline int skb_ubuf(const struct sk_buff *skb) +{ + return (skb_shinfo(skb)->ubuf.callback != NULL); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7ebeed0..890447c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -210,6 +210,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); + shinfo->ubuf.callback = NULL; + shinfo->ubuf.arg = NULL; kmemcheck_annotate_variable(shinfo->destructor_arg); if (fclone) { @@ -328,6 +330,14 @@ static void skb_release_data(struct sk_buff *skb) put_page(skb_shinfo(skb)->frags[i].page); } + /* + * if skb buf is from userspace, we need to notify the caller + * the lower device DMA has done; + */ + if (skb_ubuf(skb)) { + skb_shinfo(skb)->ubuf.callback(skb); + skb_shinfo(skb)->ubuf.callback = NULL; + } if (skb_has_frag_list(skb)) skb_drop_fraglist(skb); @@ -480,6 +490,9 @@ bool skb_recycle_check(struct sk_buff *skb, int skb_size) if (irqs_disabled()) return false; + if (skb_ubuf(skb)) + return false; + if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE) return false; @@ -572,6 +585,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) atomic_set(&n->users, 1); atomic_inc(&(skb_shinfo(skb)->dataref)); + skb_shinfo(skb)->ubuf.callback = NULL; skb->cloned = 1; return n; @@ -595,6 +609,48 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) } EXPORT_SYMBOL_GPL(skb_morph); +/* skb frags copy userspace buffers to kernel */ +static int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) +{ + int i; + int num_frags = skb_shinfo(skb)->nr_frags; + struct page *page, *head = NULL; + + for (i = 0; i < num_frags; i++) { + u8 *vaddr; + skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + + page = alloc_page(GFP_ATOMIC); + if (!page) { + while (head) { + put_page(head); + head = (struct page *)head->private; + } + return -ENOMEM; + } + vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); + memcpy(page_address(page), vaddr + f->page_offset, f->size); + kunmap_skb_frag(vaddr); + page->private = (unsigned long)head; + head = page; + } + + /* skb frags release userspace buffers */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + put_page(skb_shinfo(skb)->frags[i].page); + skb_shinfo(skb)->ubuf.callback(skb); + skb_shinfo(skb)->ubuf.callback = NULL; + + /* skb frags point to kernel buffers */ + for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) { + skb_shinfo(skb)->frags[i - 1].page_offset = 0; + skb_shinfo(skb)->frags[i - 1].page = head; + head = (struct page *)head->private; + } + return 0; +} + + /** * skb_clone - duplicate an sk_buff * @skb: buffer to clone @@ -613,6 +669,11 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *n; + if (skb_ubuf(skb)) { + if (skb_copy_ubufs(skb, gfp_mask)) + return NULL; + } + n = skb + 1; if (skb->fclone == SKB_FCLONE_ORIG && n->fclone == SKB_FCLONE_UNAVAILABLE) { @@ -730,6 +791,12 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) if (skb_shinfo(skb)->nr_frags) { int i; + if (skb_ubuf(skb)) { + if (skb_copy_ubufs(skb, gfp_mask)) { + kfree(n); + goto out; + } + } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; get_page(skb_shinfo(n)->frags[i].page); @@ -743,6 +810,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) } copy_skb_header(n, skb); + out: return n; } @@ -787,7 +855,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, fastpath = true; else { int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1; - fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta; } @@ -818,14 +885,19 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (fastpath) { kfree(skb->head); } else { + /* copy this zero copy skb frags */ + if (skb_ubuf(skb)) { + if (skb_copy_ubufs(skb, gfp_mask)) + goto nofrags; + } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) get_page(skb_shinfo(skb)->frags[i].page); - if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); skb_release_data(skb); } + off = (data + nhead) - skb->head; skb->head = data; @@ -852,6 +924,8 @@ adjust_others: atomic_set(&skb_shinfo(skb)->dataref, 1); return 0; +nofrags: + kfree(data); nodata: return -ENOMEM; } @@ -1353,6 +1427,8 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) } start = end; } + skb_shinfo(skb)->ubuf.callback = NULL; + if (!len) return 0;
This patch adds userspace buffers support in skb shared info. A new struct skb_ubuf_info is needed to maintain the userspace buffers argument and index, a callback is used to notify userspace to release the buffers once lower device has done DMA (Last reference to that skb has gone). If there is any userspace apps to reference these userspace buffers, then these userspaces buffers will be copied into kernel. This way we can prevent userspace apps to hold these userspace buffers too long. Signed-off-by: Shirley Ma <xma@us.ibm.com> --- include/linux/skbuff.h | 26 +++++++++++++++ net/core/skbuff.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 104 insertions(+), 2 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html