Message ID | 20240419214819.671536-3-zijianzhang@bytedance.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Series | net: A lightweight zero-copy notification |
zijianzhang@ wrote:
> From: Zijian Zhang <zijianzhang@bytedance.com>
>
> The MSG_ZEROCOPY flag enables copy avoidance for socket send calls.
> However, zerocopy is not a free lunch. Apart from the management of user
> pages, the combination of poll + recvmsg to receive notifications incurs
> unignorable overhead in the applications. The overhead of such sometimes
> might be more than the CPU savings from zerocopy. We try to solve this
> problem with a new notification mechanism based on msgcontrol.
> This new mechanism aims to reduce the overhead associated with receiving
> notifications by embedding them directly into user arguments passed with
> each sendmsg control message. By doing so, we can significantly reduce
> the complexity and overhead for managing notifications. In an ideal
> pattern, the user will keep calling sendmsg with SO_ZC_NOTIFICATION
> msg_control, and the notification will be delivered as soon as possible.
>
> Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
> Signed-off-by: Xiaochun Lu <xiaochun.lu@bytedance.com>
> ---
>  arch/alpha/include/uapi/asm/socket.h  |  2 +
>  arch/mips/include/uapi/asm/socket.h   |  2 +
>  arch/parisc/include/uapi/asm/socket.h |  2 +
>  arch/sparc/include/uapi/asm/socket.h  |  2 +
>  include/uapi/asm-generic/socket.h     |  2 +
>  include/uapi/linux/socket.h           | 16 ++++++
>  net/core/sock.c                       | 70 +++++++++++++++++++++++++++
>  7 files changed, 96 insertions(+)
>
> diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
> index e94f621903fe..b24622a9cd47 100644
> --- a/arch/alpha/include/uapi/asm/socket.h
> +++ b/arch/alpha/include/uapi/asm/socket.h
> @@ -140,6 +140,8 @@
>  #define SO_PASSPIDFD		76
>  #define SO_PEERPIDFD		77
>
> +#define SO_ZC_NOTIFICATION	78
> +

SCM_ for cmsgs

> /*
>  * Desired design of maximum size and alignment (see RFC2553)
>  */
> @@ -35,4 +37,18 @@ struct __kernel_sockaddr_storage {
>  #define SOCK_TXREHASH_DISABLED	0
>  #define SOCK_TXREHASH_ENABLED	1
>
> +#define SOCK_ZC_INFO_MAX 256
> +
> +struct zc_info_elem {
> +	__u32 lo;
> +	__u32 hi;
> +	__u8 zerocopy;
> +};
> +
> +struct zc_info_usr {
> +	__u64 usr_addr;
> +	unsigned int length;
> +	struct zc_info_elem info[];
> +};
> +

Don't pass a pointer to user memory, just have msg_control point to an
array of zc_info_elem.

> #endif /* _UAPI_LINUX_SOCKET_H */
> diff --git a/net/core/sock.c b/net/core/sock.c
> index fe9195186c13..13f06480f2d8 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -2809,6 +2809,13 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
>  		     struct sockcm_cookie *sockc)
>  {
>  	u32 tsflags;
> +	int ret, zc_info_size, i = 0;
> +	unsigned long flags;
> +	struct sk_buff_head *q, local_q;
> +	struct sk_buff *skb, *tmp;
> +	struct sock_exterr_skb *serr;
> +	struct zc_info_usr *zc_info_usr_p, *zc_info_kern_p;
> +	void __user *usr_addr;

Please wrap the case in parentheses and define variables in that scope
(Since there are so many variables for this case only.)

>
>  	switch (cmsg->cmsg_type) {
>  	case SO_MARK:
> @@ -2842,6 +2849,69 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
>  	case SCM_RIGHTS:
>  	case SCM_CREDENTIALS:
>  		break;
> +	case SO_ZC_NOTIFICATION:
> +		if (!sock_flag(sk, SOCK_ZEROCOPY) || sk->sk_family == PF_RDS)
> +			return -EINVAL;
> +

Why allow PF_RDS without the sock flag set?

> +		zc_info_usr_p = (struct zc_info_usr *)CMSG_DATA(cmsg);
> +		if (zc_info_usr_p->length <= 0 || zc_info_usr_p->length > SOCK_ZC_INFO_MAX)
> +			return -EINVAL;
> +
> +		zc_info_size = struct_size(zc_info_usr_p, info, zc_info_usr_p->length);
> +		if (cmsg->cmsg_len != CMSG_LEN(zc_info_size))
> +			return -EINVAL;

By passing a straightforward array, the array len can be inferred from
cmsg_len, simplifying all these checks.

See for instance how SO_DEVMEM_DONTNEED returns an array of tokens to
the kernel.

> +
> +		usr_addr = (void *)(uintptr_t)(zc_info_usr_p->usr_addr);
> +		if (!access_ok(usr_addr, zc_info_size))
> +			return -EFAULT;
> +
> +		zc_info_kern_p = kmalloc(zc_info_size, GFP_KERNEL);
> +		if (!zc_info_kern_p)
> +			return -ENOMEM;
> +
> +		q = &sk->sk_error_queue;
> +		skb_queue_head_init(&local_q);
> +		spin_lock_irqsave(&q->lock, flags);
> +		skb = skb_peek(q);
> +		while (skb && i < zc_info_usr_p->length) {
> +			struct sk_buff *skb_next = skb_peek_next(skb, q);
> +
> +			serr = SKB_EXT_ERR(skb);
> +			if (serr->ee.ee_errno == 0 &&
> +			    serr->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY) {
> +				zc_info_kern_p->info[i].hi = serr->ee.ee_data;
> +				zc_info_kern_p->info[i].lo = serr->ee.ee_info;
> +				zc_info_kern_p->info[i].zerocopy = !(serr->ee.ee_code
> +					& SO_EE_CODE_ZEROCOPY_COPIED);
> +				__skb_unlink(skb, q);
> +				__skb_queue_tail(&local_q, skb);
> +				i++;
> +			}
> +			skb = skb_next;
> +		}
> +		spin_unlock_irqrestore(&q->lock, flags);

In almost all sane cases, all outstanding notifications can be passed
to userspace.

It may be interesting to experiment with briefly taking the lock to
move to a private list. See for instance net_rx_action.

Then if userspace cannot handle all notifications, the rest have to be
spliced back. This can reorder notifications. But rare reordering is
not a correctness issue.

I would choose the more complex splice approach only if it shows
benefit, i.e., if taking the lock does contend with error enqueue
events.

> +
> +		zc_info_kern_p->usr_addr = zc_info_usr_p->usr_addr;
> +		zc_info_kern_p->length = i;
> +
> +		ret = copy_to_user(usr_addr,
> +				   zc_info_kern_p,
> +				   struct_size(zc_info_kern_p, info, i));

You'll still need to support the gnarly MSG_CMSG_COMPAT version too.

Wait, is this the reason to pass a usr_addr explicitly? To get around
any compat issues?

Or even the entire issue of having to copy msg_sys->msg_control to
user if !msg_control_is_user.

I suppose this simplifies a lot in terms of development. If making the
user interface uglier.

IMHO the sane interface should be used eventually. There may also be
other uses of passing msg_control data up to userspace from sendmsg.

But this approach works for now for evaluation and discussion.
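To make the review suggestions concrete, here is a rough sketch (editorial illustration, not code from the thread) of what the handler could look like if msg_control carried a plain array of struct zc_info_elem: the element count is inferred from cmsg_len, as with SO_DEVMEM_DONTNEED's token array, and the case body gets its own scope for its locals. SCM_ZC_NOTIFICATION is a placeholder name following the "SCM_ for cmsgs" comment.

	case SCM_ZC_NOTIFICATION: {
		/* Assumed layout: CMSG_DATA() is just an array of
		 * struct zc_info_elem, no embedded user pointer or length.
		 */
		struct zc_info_elem *zc_info = (struct zc_info_elem *)CMSG_DATA(cmsg);
		int len;

		if (!sock_flag(sk, SOCK_ZEROCOPY) || sk->sk_family == PF_RDS)
			return -EINVAL;

		/* Infer the array length from cmsg_len instead of carrying a
		 * separate length field in the payload.
		 */
		if (cmsg->cmsg_len < CMSG_LEN(sizeof(*zc_info)))
			return -EINVAL;
		len = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(*zc_info);
		if (len > SOCK_ZC_INFO_MAX)
			return -EINVAL;

		/* ... drain up to len zerocopy notifications from
		 * sk->sk_error_queue into the zc_info entries; how the result
		 * reaches userspace (cmsg copy-back vs. a user pointer) is
		 * exactly the open question discussed above ...
		 */
		break;
	}

With this layout the only validation left is the cmsg_len arithmetic; the SOCK_ZC_INFO_MAX bound and the drain loop would stay as in the posted patch.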
Hi,

kernel test robot noticed the following build warnings:

[auto build test WARNING on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/zijianzhang-bytedance-com/selftests-fix-OOM-problem-in-msg_zerocopy-selftest/20240420-055035
base:   net-next/main
patch link:    https://lore.kernel.org/r/20240419214819.671536-3-zijianzhang%40bytedance.com
patch subject: [PATCH net-next v2 2/3] sock: add MSG_ZEROCOPY notification mechanism based on msg_control
config: i386-randconfig-061-20240423 (https://download.01.org/0day-ci/archive/20240423/202404231145.7NfmE7Hn-lkp@intel.com/config)
compiler: gcc-13 (Ubuntu 13.2.0-4ubuntu3) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240423/202404231145.7NfmE7Hn-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202404231145.7NfmE7Hn-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
>> net/core/sock.c:2864:26: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected void [noderef] __user *usr_addr @@ got void * @@
   net/core/sock.c:2864:26: sparse:     expected void [noderef] __user *usr_addr
   net/core/sock.c:2864:26: sparse:     got void *
   net/core/sock.c:2393:9: sparse: sparse: context imbalance in 'sk_clone_lock' - wrong count at exit
   net/core/sock.c:2397:6: sparse: sparse: context imbalance in 'sk_free_unlock_clone' - unexpected unlock
   net/core/sock.c:4103:13: sparse: sparse: context imbalance in 'proto_seq_start' - wrong count at exit
   net/core/sock.c:4115:13: sparse: sparse: context imbalance in 'proto_seq_stop' - wrong count at exit

vim +2864 net/core/sock.c

  2807	
  2808	int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
  2809			     struct sockcm_cookie *sockc)
  2810	{
  2811		u32 tsflags;
  2812		int ret, zc_info_size, i = 0;
  2813		unsigned long flags;
  2814		struct sk_buff_head *q, local_q;
  2815		struct sk_buff *skb, *tmp;
  2816		struct sock_exterr_skb *serr;
  2817		struct zc_info_usr *zc_info_usr_p, *zc_info_kern_p;
  2818		void __user *usr_addr;
  2819	
  2820		switch (cmsg->cmsg_type) {
  2821		case SO_MARK:
  2822			if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
  2823			    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
  2824				return -EPERM;
  2825			if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
  2826				return -EINVAL;
  2827			sockc->mark = *(u32 *)CMSG_DATA(cmsg);
  2828			break;
  2829		case SO_TIMESTAMPING_OLD:
  2830		case SO_TIMESTAMPING_NEW:
  2831			if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
  2832				return -EINVAL;
  2833	
  2834			tsflags = *(u32 *)CMSG_DATA(cmsg);
  2835			if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
  2836				return -EINVAL;
  2837	
  2838			sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
  2839			sockc->tsflags |= tsflags;
  2840			break;
  2841		case SCM_TXTIME:
  2842			if (!sock_flag(sk, SOCK_TXTIME))
  2843				return -EINVAL;
  2844			if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
  2845				return -EINVAL;
  2846			sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
  2847			break;
  2848		/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
  2849		case SCM_RIGHTS:
  2850		case SCM_CREDENTIALS:
  2851			break;
  2852		case SO_ZC_NOTIFICATION:
  2853			if (!sock_flag(sk, SOCK_ZEROCOPY) || sk->sk_family == PF_RDS)
  2854				return -EINVAL;
  2855	
  2856			zc_info_usr_p = (struct zc_info_usr *)CMSG_DATA(cmsg);
  2857			if (zc_info_usr_p->length <= 0 || zc_info_usr_p->length > SOCK_ZC_INFO_MAX)
  2858				return -EINVAL;
  2859	
  2860			zc_info_size = struct_size(zc_info_usr_p, info, zc_info_usr_p->length);
  2861			if (cmsg->cmsg_len != CMSG_LEN(zc_info_size))
  2862				return -EINVAL;
  2863	
> 2864			usr_addr = (void *)(uintptr_t)(zc_info_usr_p->usr_addr);
  2865			if (!access_ok(usr_addr, zc_info_size))
  2866				return -EFAULT;
  2867	
  2868			zc_info_kern_p = kmalloc(zc_info_size, GFP_KERNEL);
  2869			if (!zc_info_kern_p)
  2870				return -ENOMEM;
  2871	
  2872			q = &sk->sk_error_queue;
  2873			skb_queue_head_init(&local_q);
  2874			spin_lock_irqsave(&q->lock, flags);
  2875			skb = skb_peek(q);
  2876			while (skb && i < zc_info_usr_p->length) {
  2877				struct sk_buff *skb_next = skb_peek_next(skb, q);
  2878	
  2879				serr = SKB_EXT_ERR(skb);
  2880				if (serr->ee.ee_errno == 0 &&
  2881				    serr->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY) {
  2882					zc_info_kern_p->info[i].hi = serr->ee.ee_data;
  2883					zc_info_kern_p->info[i].lo = serr->ee.ee_info;
  2884					zc_info_kern_p->info[i].zerocopy = !(serr->ee.ee_code
  2885						& SO_EE_CODE_ZEROCOPY_COPIED);
  2886					__skb_unlink(skb, q);
  2887					__skb_queue_tail(&local_q, skb);
  2888					i++;
  2889				}
  2890				skb = skb_next;
  2891			}
  2892			spin_unlock_irqrestore(&q->lock, flags);
  2893	
  2894			zc_info_kern_p->usr_addr = zc_info_usr_p->usr_addr;
  2895			zc_info_kern_p->length = i;
  2896	
  2897			ret = copy_to_user(usr_addr,
  2898					   zc_info_kern_p,
  2899					   struct_size(zc_info_kern_p, info, i));
  2900			kfree(zc_info_kern_p);
  2901	
  2902			if (unlikely(ret)) {
  2903				spin_lock_irqsave(&q->lock, flags);
  2904				skb_queue_reverse_walk_safe(&local_q, skb, tmp) {
  2905					__skb_unlink(skb, &local_q);
  2906					__skb_queue_head(q, skb);
  2907				}
  2908				spin_unlock_irqrestore(&q->lock, flags);
  2909				return -EFAULT;
  2910			}
  2911	
  2912			while ((skb = __skb_dequeue(&local_q)))
  2913				consume_skb(skb);
  2914			break;
  2915		default:
  2916			return -EINVAL;
  2917		}
  2918		return 0;
  2919	}
  2920	EXPORT_SYMBOL(__sock_cmsg_send);
  2921	
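The sparse warning is about the plain (void *) cast at line 2864, which drops the __user address-space annotation. If the usr_addr field survives into a later revision, one minimal fix (a sketch, not something posted in the thread) is the kernel's u64_to_user_ptr() helper:

		/* Flagged by sparse: the cast loses the __user annotation.
		 *	usr_addr = (void *)(uintptr_t)(zc_info_usr_p->usr_addr);
		 *
		 * Converting the __u64 received from userspace with
		 * u64_to_user_ptr() keeps the annotation intact:
		 */
		usr_addr = u64_to_user_ptr(zc_info_usr_p->usr_addr);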
On 4/21/24 8:45 AM, Willem de Bruijn wrote:
> zijianzhang@ wrote:
>> From: Zijian Zhang <zijianzhang@bytedance.com>
>>
>> The MSG_ZEROCOPY flag enables copy avoidance for socket send calls.
>> However, zerocopy is not a free lunch. Apart from the management of user
>> pages, the combination of poll + recvmsg to receive notifications incurs
>> unignorable overhead in the applications. The overhead of such sometimes
>> might be more than the CPU savings from zerocopy. We try to solve this
>> problem with a new notification mechanism based on msgcontrol.
>> This new mechanism aims to reduce the overhead associated with receiving
>> notifications by embedding them directly into user arguments passed with
>> each sendmsg control message. By doing so, we can significantly reduce
>> the complexity and overhead for managing notifications. In an ideal
>> pattern, the user will keep calling sendmsg with SO_ZC_NOTIFICATION
>> msg_control, and the notification will be delivered as soon as possible.
>>
>> Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
>> Signed-off-by: Xiaochun Lu <xiaochun.lu@bytedance.com>
>> ---
>>  arch/alpha/include/uapi/asm/socket.h  |  2 +
>>  arch/mips/include/uapi/asm/socket.h   |  2 +
>>  arch/parisc/include/uapi/asm/socket.h |  2 +
>>  arch/sparc/include/uapi/asm/socket.h  |  2 +
>>  include/uapi/asm-generic/socket.h     |  2 +
>>  include/uapi/linux/socket.h           | 16 ++++++
>>  net/core/sock.c                       | 70 +++++++++++++++++++++++++++
>>  7 files changed, 96 insertions(+)
>>
>> diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
>> index e94f621903fe..b24622a9cd47 100644
>> --- a/arch/alpha/include/uapi/asm/socket.h
>> +++ b/arch/alpha/include/uapi/asm/socket.h
>> @@ -140,6 +140,8 @@
>>  #define SO_PASSPIDFD		76
>>  #define SO_PEERPIDFD		77
>>
>> +#define SO_ZC_NOTIFICATION	78
>> +
>
> SCM_ for cmsgs
>

Ack.

>> /*
>>  * Desired design of maximum size and alignment (see RFC2553)
>>  */
>> @@ -35,4 +37,18 @@ struct __kernel_sockaddr_storage {
>>  #define SOCK_TXREHASH_DISABLED	0
>>  #define SOCK_TXREHASH_ENABLED	1
>>
>> +#define SOCK_ZC_INFO_MAX 256
>> +
>> +struct zc_info_elem {
>> +	__u32 lo;
>> +	__u32 hi;
>> +	__u8 zerocopy;
>> +};
>> +
>> +struct zc_info_usr {
>> +	__u64 usr_addr;
>> +	unsigned int length;
>> +	struct zc_info_elem info[];
>> +};
>> +
>
> Don't pass a pointer to user memory, just have msg_control point to an
> array of zc_info_elem.
>

Ack.

>> #endif /* _UAPI_LINUX_SOCKET_H */
>> diff --git a/net/core/sock.c b/net/core/sock.c
>> index fe9195186c13..13f06480f2d8 100644
>> --- a/net/core/sock.c
>> +++ b/net/core/sock.c
>> @@ -2809,6 +2809,13 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
>>  		     struct sockcm_cookie *sockc)
>>  {
>>  	u32 tsflags;
>> +	int ret, zc_info_size, i = 0;
>> +	unsigned long flags;
>> +	struct sk_buff_head *q, local_q;
>> +	struct sk_buff *skb, *tmp;
>> +	struct sock_exterr_skb *serr;
>> +	struct zc_info_usr *zc_info_usr_p, *zc_info_kern_p;
>> +	void __user *usr_addr;
>
> Please wrap the case in parentheses and define variables in that scope
> (Since there are so many variables for this case only.)
>
>>
>>  	switch (cmsg->cmsg_type) {
>>  	case SO_MARK:
>> @@ -2842,6 +2849,69 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
>>  	case SCM_RIGHTS:
>>  	case SCM_CREDENTIALS:
>>  		break;
>> +	case SO_ZC_NOTIFICATION:
>> +		if (!sock_flag(sk, SOCK_ZEROCOPY) || sk->sk_family == PF_RDS)
>> +			return -EINVAL;
>> +
>
> Why allow PF_RDS without the sock flag set?
>

PF_RDS uses POLLIN instead of POLLERR, so this mechanism cannot work for
PF_RDS; I am rejecting any PF_RDS socket here.

>> +		zc_info_usr_p = (struct zc_info_usr *)CMSG_DATA(cmsg);
>> +		if (zc_info_usr_p->length <= 0 || zc_info_usr_p->length > SOCK_ZC_INFO_MAX)
>> +			return -EINVAL;
>> +
>> +		zc_info_size = struct_size(zc_info_usr_p, info, zc_info_usr_p->length);
>> +		if (cmsg->cmsg_len != CMSG_LEN(zc_info_size))
>> +			return -EINVAL;
>
> By passing a straightforward array, the array len can be inferred from
> cmsg_len, simplifying all these checks.
>
> See for instance how SO_DEVMEM_DONTNEED returns an array of tokens to
> the kernel.
>

Ack.

>> +
>> +		usr_addr = (void *)(uintptr_t)(zc_info_usr_p->usr_addr);
>> +		if (!access_ok(usr_addr, zc_info_size))
>> +			return -EFAULT;
>> +
>> +		zc_info_kern_p = kmalloc(zc_info_size, GFP_KERNEL);
>> +		if (!zc_info_kern_p)
>> +			return -ENOMEM;
>> +
>> +		q = &sk->sk_error_queue;
>> +		skb_queue_head_init(&local_q);
>> +		spin_lock_irqsave(&q->lock, flags);
>> +		skb = skb_peek(q);
>> +		while (skb && i < zc_info_usr_p->length) {
>> +			struct sk_buff *skb_next = skb_peek_next(skb, q);
>> +
>> +			serr = SKB_EXT_ERR(skb);
>> +			if (serr->ee.ee_errno == 0 &&
>> +			    serr->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY) {
>> +				zc_info_kern_p->info[i].hi = serr->ee.ee_data;
>> +				zc_info_kern_p->info[i].lo = serr->ee.ee_info;
>> +				zc_info_kern_p->info[i].zerocopy = !(serr->ee.ee_code
>> +					& SO_EE_CODE_ZEROCOPY_COPIED);
>> +				__skb_unlink(skb, q);
>> +				__skb_queue_tail(&local_q, skb);
>> +				i++;
>> +			}
>> +			skb = skb_next;
>> +		}
>> +		spin_unlock_irqrestore(&q->lock, flags);
>
> In almost all sane cases, all outstanding notifications can be passed
> to userspace.
>
> It may be interesting to experiment with briefly taking the lock to
> move to a private list. See for instance net_rx_action.
>

Nice catch.

> Then if userspace cannot handle all notifications, the rest have to be
> spliced back. This can reorder notifications. But rare reordering is
> not a correctness issue.
>
> I would choose the more complex splice approach only if it shows
> benefit, i.e., if taking the lock does contend with error enqueue
> events.
>

Maybe when the network is very busy, it will contend with
__msg_zerocopy_callback (where new notifications are added)? I think
splice is a better idea.

>> +
>> +		zc_info_kern_p->usr_addr = zc_info_usr_p->usr_addr;
>> +		zc_info_kern_p->length = i;
>> +
>> +		ret = copy_to_user(usr_addr,
>> +				   zc_info_kern_p,
>> +				   struct_size(zc_info_kern_p, info, i));
>
> You'll still need to support the gnarly MSG_CMSG_COMPAT version too.
>

Assuming users pass in a zc_info_elem array pointer here, I may use the
in_compat_syscall function to check whether it is a compat call. If so,
I can use compat_ptr to convert it. Is that correct?

> Wait, is this the reason to pass a usr_addr explicitly? To get around
> any compat issues?
>

Yes, I tried to make it a u64 regardless of whether the program is
32-bit or 64-bit, but it is indeed ugly.

> Or even the entire issue of having to copy msg_sys->msg_control to
> user if !msg_control_is_user.
>
> I suppose this simplifies a lot in terms of development. If making the
> user interface uglier.
>
> IMHO the sane interface should be used eventually. There may also be
> other uses of passing msg_control data up to userspace from sendmsg.
>
> But this approach works for now for evaluation and discussion.
>
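The splice idea both sides converge on could look roughly as follows. This is an editorial sketch under the thread's assumptions: `len` stands for the caller-supplied capacity, the per-entry copy is elided, and in the real handler freeing would happen only after the copy to userspace succeeds (or the skbs would be requeued, as in the posted patch). The error-queue lock is held only long enough to move everything to a private list, similar to how net_rx_action handles its poll list; whatever is not consumed is spliced back, accepting the rare reordering discussed above.

		struct sk_buff_head local_q, keep_q;
		struct sock_exterr_skb *serr;
		struct sk_buff *skb;
		unsigned long flags;
		int i = 0;

		__skb_queue_head_init(&local_q);
		__skb_queue_head_init(&keep_q);

		/* Briefly take the lock only to move all pending skbs to a
		 * private list.
		 */
		spin_lock_irqsave(&sk->sk_error_queue.lock, flags);
		skb_queue_splice_init(&sk->sk_error_queue, &local_q);
		spin_unlock_irqrestore(&sk->sk_error_queue.lock, flags);

		/* Walk the private list without the lock held. */
		while ((skb = __skb_dequeue(&local_q))) {
			serr = SKB_EXT_ERR(skb);
			if (i < len && serr->ee.ee_errno == 0 &&
			    serr->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY) {
				/* ... record serr->ee.ee_info / ee_data and
				 * the SO_EE_CODE_ZEROCOPY_COPIED bit into
				 * entry i of the user-visible array ...
				 */
				consume_skb(skb);
				i++;
			} else {
				/* Not a zerocopy notification, or no room
				 * left in the user array: keep it.
				 */
				__skb_queue_tail(&keep_q, skb);
			}
		}

		/* Splice anything kept back onto the error queue. */
		if (!skb_queue_empty(&keep_q)) {
			spin_lock_irqsave(&sk->sk_error_queue.lock, flags);
			skb_queue_splice(&keep_q, &sk->sk_error_queue);
			spin_unlock_irqrestore(&sk->sk_error_queue.lock, flags);
		}

Whether this wins over the per-skb unlink under the lock depends on the contention with the error-enqueue path (__msg_zerocopy_callback) that the thread speculates about, so it would need to be measured.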
diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index e94f621903fe..b24622a9cd47 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -140,6 +140,8 @@
 #define SO_PASSPIDFD		76
 #define SO_PEERPIDFD		77
 
+#define SO_ZC_NOTIFICATION	78
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 60ebaed28a4c..638a4ebbffa7 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -151,6 +151,8 @@
 #define SO_PASSPIDFD		76
 #define SO_PEERPIDFD		77
 
+#define SO_ZC_NOTIFICATION	78
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index be264c2b1a11..393f1a6e9562 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -132,6 +132,8 @@
 #define SO_PASSPIDFD		0x404A
 #define SO_PEERPIDFD		0x404B
 
+#define SO_ZC_NOTIFICATION	0x404C
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 682da3714686..4fe127b0682b 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -133,6 +133,8 @@
 #define SO_PASSPIDFD		0x0055
 #define SO_PEERPIDFD		0x0056
 
+#define SO_ZC_NOTIFICATION	0x0057
+
 #if !defined(__KERNEL__)
 
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 8ce8a39a1e5f..acbbbe7ac06a 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -135,6 +135,8 @@
 #define SO_PASSPIDFD		76
 #define SO_PEERPIDFD		77
 
+#define SO_ZC_NOTIFICATION	78
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h
index d3fcd3b5ec53..60e4db759d49 100644
--- a/include/uapi/linux/socket.h
+++ b/include/uapi/linux/socket.h
@@ -2,6 +2,8 @@
 #ifndef _UAPI_LINUX_SOCKET_H
 #define _UAPI_LINUX_SOCKET_H
 
+#include <linux/types.h>
+
 /*
  * Desired design of maximum size and alignment (see RFC2553)
  */
@@ -35,4 +37,18 @@ struct __kernel_sockaddr_storage {
 #define SOCK_TXREHASH_DISABLED	0
 #define SOCK_TXREHASH_ENABLED	1
 
+#define SOCK_ZC_INFO_MAX 256
+
+struct zc_info_elem {
+	__u32 lo;
+	__u32 hi;
+	__u8 zerocopy;
+};
+
+struct zc_info_usr {
+	__u64 usr_addr;
+	unsigned int length;
+	struct zc_info_elem info[];
+};
+
 #endif /* _UAPI_LINUX_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index fe9195186c13..13f06480f2d8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2809,6 +2809,13 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
 		     struct sockcm_cookie *sockc)
 {
 	u32 tsflags;
+	int ret, zc_info_size, i = 0;
+	unsigned long flags;
+	struct sk_buff_head *q, local_q;
+	struct sk_buff *skb, *tmp;
+	struct sock_exterr_skb *serr;
+	struct zc_info_usr *zc_info_usr_p, *zc_info_kern_p;
+	void __user *usr_addr;
 
 	switch (cmsg->cmsg_type) {
 	case SO_MARK:
@@ -2842,6 +2849,69 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
 	case SCM_RIGHTS:
 	case SCM_CREDENTIALS:
 		break;
+	case SO_ZC_NOTIFICATION:
+		if (!sock_flag(sk, SOCK_ZEROCOPY) || sk->sk_family == PF_RDS)
+			return -EINVAL;
+
+		zc_info_usr_p = (struct zc_info_usr *)CMSG_DATA(cmsg);
+		if (zc_info_usr_p->length <= 0 || zc_info_usr_p->length > SOCK_ZC_INFO_MAX)
+			return -EINVAL;
+
+		zc_info_size = struct_size(zc_info_usr_p, info, zc_info_usr_p->length);
+		if (cmsg->cmsg_len != CMSG_LEN(zc_info_size))
+			return -EINVAL;
+
+		usr_addr = (void *)(uintptr_t)(zc_info_usr_p->usr_addr);
+		if (!access_ok(usr_addr, zc_info_size))
+			return -EFAULT;
+
+		zc_info_kern_p = kmalloc(zc_info_size, GFP_KERNEL);
+		if (!zc_info_kern_p)
+			return -ENOMEM;
+
+		q = &sk->sk_error_queue;
+		skb_queue_head_init(&local_q);
+		spin_lock_irqsave(&q->lock, flags);
+		skb = skb_peek(q);
+		while (skb && i < zc_info_usr_p->length) {
+			struct sk_buff *skb_next = skb_peek_next(skb, q);
+
+			serr = SKB_EXT_ERR(skb);
+			if (serr->ee.ee_errno == 0 &&
+			    serr->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY) {
+				zc_info_kern_p->info[i].hi = serr->ee.ee_data;
+				zc_info_kern_p->info[i].lo = serr->ee.ee_info;
+				zc_info_kern_p->info[i].zerocopy = !(serr->ee.ee_code
+					& SO_EE_CODE_ZEROCOPY_COPIED);
+				__skb_unlink(skb, q);
+				__skb_queue_tail(&local_q, skb);
+				i++;
+			}
+			skb = skb_next;
+		}
+		spin_unlock_irqrestore(&q->lock, flags);
+
+		zc_info_kern_p->usr_addr = zc_info_usr_p->usr_addr;
+		zc_info_kern_p->length = i;
+
+		ret = copy_to_user(usr_addr,
+				   zc_info_kern_p,
+				   struct_size(zc_info_kern_p, info, i));
+		kfree(zc_info_kern_p);
+
+		if (unlikely(ret)) {
+			spin_lock_irqsave(&q->lock, flags);
+			skb_queue_reverse_walk_safe(&local_q, skb, tmp) {
+				__skb_unlink(skb, &local_q);
+				__skb_queue_head(q, skb);
+			}
+			spin_unlock_irqrestore(&q->lock, flags);
+			return -EFAULT;
+		}
+
+		while ((skb = __skb_dequeue(&local_q)))
+			consume_skb(skb);
+		break;
 	default:
 		return -EINVAL;
 	}
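For completeness, this is roughly how an application would drive the interface as posted in v2 — the "keep calling sendmsg with SO_ZC_NOTIFICATION msg_control" pattern from the commit message. It is an editorial sketch against the v2 uapi (which the review expects to change to an SCM_ name and a plain element array); the function name and batch size are placeholders, and the patched <linux/socket.h> headers are assumed to be installed.

#include <stdint.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/types.h>
#include <linux/socket.h>	/* struct zc_info_usr / zc_info_elem from the patched uapi */

#define ZC_BATCH 64		/* placeholder; must be <= SOCK_ZC_INFO_MAX */

static ssize_t send_zc_with_notification(int fd, struct iovec *iov, size_t iovlen)
{
	union {
		char buf[CMSG_SPACE(sizeof(struct zc_info_usr) +
				    ZC_BATCH * sizeof(struct zc_info_elem))];
		struct cmsghdr align;	/* keep the control buffer aligned */
	} control = {};
	struct msghdr msg = {};
	struct cmsghdr *cmsg;
	struct zc_info_usr *zc;

	msg.msg_iov = iov;
	msg.msg_iovlen = iovlen;
	msg.msg_control = control.buf;
	msg.msg_controllen = sizeof(control.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SO_ZC_NOTIFICATION;
	cmsg->cmsg_len = CMSG_LEN(sizeof(struct zc_info_usr) +
				  ZC_BATCH * sizeof(struct zc_info_elem));

	/* Point usr_addr back at this buffer: the kernel copies the harvested
	 * notifications there and rewrites length to the number of filled
	 * entries, each carrying a [lo, hi] completion range and a flag that
	 * says whether the send was actually zero-copy.
	 */
	zc = (struct zc_info_usr *)CMSG_DATA(cmsg);
	zc->usr_addr = (__u64)(uintptr_t)zc;
	zc->length = ZC_BATCH;

	return sendmsg(fd, &msg, MSG_ZEROCOPY);
}

The caller is expected to have enabled SO_ZEROCOPY on the socket beforehand and to inspect zc->length and the returned zc_info_elem entries after each call, instead of polling the error queue with recvmsg(MSG_ERRQUEUE).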