diff mbox series

[net-next] support SO_PRIORITY cmsg

Message ID 20241029144142.31382-1-annaemesenyiri@gmail.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [net-next] support SO_PRIORITY cmsg | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next, async
netdev/apply fail Patch does not apply to net-next-0

Commit Message

Anna Nyiri Oct. 29, 2024, 2:41 p.m. UTC
The Linux socket API currently supports setting SO_PRIORITY at the socket
level, which applies a uniform priority to all packets sent through that
socket. The only exception is IP_TOS, if that is specified as ancillary
data, the packet does not inherit the socket's priority. Instead, the
priority value is computed when handling the ancillary data (as implemented
in commit <f02db315b8d888570cb0d4496cfbb7e4acb047cb>: "ipv4: IP_TOS
and IP_TTL can be specified as ancillary data").

Currently, there is no option to set the priority directly from userspace
on a per-packet basis. The following changes allow SO_PRIORITY to be set
through control messages (CMSG), giving userspace applications more
granular control over packet priorities.

This patch enables setting skb->priority using CMSG. If SO_PRIORITY is
specified as ancillary data, the packet is sent with the priority value
set through sockc->priority_cmsg_value, overriding the socket-level
values set via the traditional setsockopt() method. This is analogous to
existing support for SO_MARK (as implemented in commit
<c6af0c227a22bb6bb8ff72f043e0fb6d99fd6515>, “ip: support SO_MARK
cmsg”).

Suggested-by: Ferenc Fejes <fejes@inf.elte.hu>
Signed-off-by: Anna Emese Nyiri <annaemesenyiri@gmail.com>
---
 include/net/inet_sock.h |  2 ++
 include/net/sock.h      |  5 ++++-
 net/can/raw.c           |  6 +++++-
 net/core/sock.c         | 12 ++++++++++++
 net/ipv4/ip_output.c    | 11 ++++++++++-
 net/ipv4/raw.c          |  5 ++++-
 net/ipv6/ip6_output.c   |  8 +++++++-
 net/ipv6/raw.c          |  6 +++++-
 net/packet/af_packet.c  |  6 +++++-
 9 files changed, 54 insertions(+), 7 deletions(-)

Comments

Willem de Bruijn Oct. 29, 2024, 3:15 p.m. UTC | #1
Anna Emese Nyiri wrote:
> The Linux socket API currently supports setting SO_PRIORITY at the socket
> level, which applies a uniform priority to all packets sent through that
> socket. The only exception is IP_TOS, if that is specified as ancillary
> data, the packet does not inherit the socket's priority. Instead, the
> priority value is computed when handling the ancillary data (as implemented
> in commit <f02db315b8d888570cb0d4496cfbb7e4acb047cb>: "ipv4: IP_TOS
> and IP_TTL can be specified as ancillary data").

Please use commit format <$SHA1:12> ("subject"). Checkpatch might also
flag this.
 
> Currently, there is no option to set the priority directly from userspace
> on a per-packet basis. The following changes allow SO_PRIORITY to be set
> through control messages (CMSG), giving userspace applications more
> granular control over packet priorities.
> 
> This patch enables setting skb->priority using CMSG. If SO_PRIORITY is
> specified as ancillary data, the packet is sent with the priority value
> set through sockc->priority_cmsg_value, overriding the socket-level
> values set via the traditional setsockopt() method.

Please also describe how this interacts with priority set from IP_TOS or
IPV6_TCLASS.

> This is analogous to
> existing support for SO_MARK (as implemented in commit
> <c6af0c227a22bb6bb8ff72f043e0fb6d99fd6515>, “ip: support SO_MARK
> cmsg”).
> 
> Suggested-by: Ferenc Fejes <fejes@inf.elte.hu>
> Signed-off-by: Anna Emese Nyiri <annaemesenyiri@gmail.com>
> ---
>  include/net/inet_sock.h |  2 ++
>  include/net/sock.h      |  5 ++++-
>  net/can/raw.c           |  6 +++++-
>  net/core/sock.c         | 12 ++++++++++++
>  net/ipv4/ip_output.c    | 11 ++++++++++-
>  net/ipv4/raw.c          |  5 ++++-
>  net/ipv6/ip6_output.c   |  8 +++++++-
>  net/ipv6/raw.c          |  6 +++++-
>  net/packet/af_packet.c  |  6 +++++-
>  9 files changed, 54 insertions(+), 7 deletions(-)
> 
> diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
> index f9ddd47dc4f8..9d4e4e2a8232 100644
> --- a/include/net/inet_sock.h
> +++ b/include/net/inet_sock.h
> @@ -175,6 +175,8 @@ struct inet_cork {
>  	__u16			gso_size;
>  	u64			transmit_time;
>  	u32			mark;
> +	__u8		priority_cmsg_set;
> +	u32			priority_cmsg_value;

Just priority, drop the cmsg value.

Instead of an explicit "is set" bit, preferred is to initialize the
cookie field from the sock. See sockcm_init(), below, and also
ipcm_init_sk(). That also avoids the branches later in the datapath.

>  };
>  
>  struct inet_cork_full {
> diff --git a/include/net/sock.h b/include/net/sock.h
> index cce23ac4d514..e02170977165 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1794,13 +1794,16 @@ struct sockcm_cookie {
>  	u64 transmit_time;
>  	u32 mark;
>  	u32 tsflags;
> +	u32 priority_cmsg_value;
> +	u8 priority_cmsg_set;
>  };
>  
>  static inline void sockcm_init(struct sockcm_cookie *sockc,
>  			       const struct sock *sk)
>  {
>  	*sockc = (struct sockcm_cookie) {
> -		.tsflags = READ_ONCE(sk->sk_tsflags)
> +		.tsflags = READ_ONCE(sk->sk_tsflags),
> +		.priority_cmsg_set = 0
>  	};
>  }
>  
> diff --git a/net/can/raw.c b/net/can/raw.c
> index 00533f64d69d..cf7e7ae64cde 100644
> --- a/net/can/raw.c
> +++ b/net/can/raw.c
> @@ -962,7 +962,11 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
>  	}
>  
>  	skb->dev = dev;
> -	skb->priority = READ_ONCE(sk->sk_priority);
> +	if (sockc.priority_cmsg_set)
> +		skb->priority = sockc.priority_cmsg_value;
> +	else
> +		skb->priority = READ_ONCE(sk->sk_priority);
> +
>  	skb->mark = READ_ONCE(sk->sk_mark);
>  	skb->tstamp = sockc.transmit_time;
>  
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 9abc4fe25953..899bf850b52a 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -2863,6 +2863,18 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
>  	case SCM_RIGHTS:
>  	case SCM_CREDENTIALS:
>  		break;
> +	case SO_PRIORITY:
> +		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
> +			return -EINVAL;
> +
> +		if ((*(u32 *)CMSG_DATA(cmsg) >= 0 && *(u32 *)CMSG_DATA(cmsg) <= 6) ||
> +		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
> +		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
> +			sockc->priority_cmsg_value = *(u32 *)CMSG_DATA(cmsg);
> +			sockc->priority_cmsg_set = 1;
> +			break;
> +		}

What is the magic constant 6 here?
Anna Nyiri Oct. 31, 2024, 12:39 p.m. UTC | #2
Willem de Bruijn <willemdebruijn.kernel@gmail.com> ezt írta (időpont:
2024. okt. 29., K, 16:15):
>
> Anna Emese Nyiri wrote:
> > The Linux socket API currently supports setting SO_PRIORITY at the socket
> > level, which applies a uniform priority to all packets sent through that
> > socket. The only exception is IP_TOS, if that is specified as ancillary
> > data, the packet does not inherit the socket's priority. Instead, the
> > priority value is computed when handling the ancillary data (as implemented
> > in commit <f02db315b8d888570cb0d4496cfbb7e4acb047cb>: "ipv4: IP_TOS
> > and IP_TTL can be specified as ancillary data").
>
> Please use commit format <$SHA1:12> ("subject"). Checkpatch might also
> flag this.
>
> > Currently, there is no option to set the priority directly from userspace
> > on a per-packet basis. The following changes allow SO_PRIORITY to be set
> > through control messages (CMSG), giving userspace applications more
> > granular control over packet priorities.
> >
> > This patch enables setting skb->priority using CMSG. If SO_PRIORITY is
> > specified as ancillary data, the packet is sent with the priority value
> > set through sockc->priority_cmsg_value, overriding the socket-level
> > values set via the traditional setsockopt() method.
>
> Please also describe how this interacts with priority set from IP_TOS or
> IPV6_TCLASS.
>
> > This is analogous to
> > existing support for SO_MARK (as implemented in commit
> > <c6af0c227a22bb6bb8ff72f043e0fb6d99fd6515>, “ip: support SO_MARK
> > cmsg”).
> >
> > Suggested-by: Ferenc Fejes <fejes@inf.elte.hu>
> > Signed-off-by: Anna Emese Nyiri <annaemesenyiri@gmail.com>
> > ---
> >  include/net/inet_sock.h |  2 ++
> >  include/net/sock.h      |  5 ++++-
> >  net/can/raw.c           |  6 +++++-
> >  net/core/sock.c         | 12 ++++++++++++
> >  net/ipv4/ip_output.c    | 11 ++++++++++-
> >  net/ipv4/raw.c          |  5 ++++-
> >  net/ipv6/ip6_output.c   |  8 +++++++-
> >  net/ipv6/raw.c          |  6 +++++-
> >  net/packet/af_packet.c  |  6 +++++-
> >  9 files changed, 54 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
> > index f9ddd47dc4f8..9d4e4e2a8232 100644
> > --- a/include/net/inet_sock.h
> > +++ b/include/net/inet_sock.h
> > @@ -175,6 +175,8 @@ struct inet_cork {
> >       __u16                   gso_size;
> >       u64                     transmit_time;
> >       u32                     mark;
> > +     __u8            priority_cmsg_set;
> > +     u32                     priority_cmsg_value;
>
> Just priority, drop the cmsg value.
>
> Instead of an explicit "is set" bit, preferred is to initialize the
> cookie field from the sock. See sockcm_init(), below, and also
> ipcm_init_sk(). That also avoids the branches later in the datapath.
>
> >  };
> >
> >  struct inet_cork_full {
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index cce23ac4d514..e02170977165 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -1794,13 +1794,16 @@ struct sockcm_cookie {
> >       u64 transmit_time;
> >       u32 mark;
> >       u32 tsflags;
> > +     u32 priority_cmsg_value;
> > +     u8 priority_cmsg_set;
> >  };
> >
> >  static inline void sockcm_init(struct sockcm_cookie *sockc,
> >                              const struct sock *sk)
> >  {
> >       *sockc = (struct sockcm_cookie) {
> > -             .tsflags = READ_ONCE(sk->sk_tsflags)
> > +             .tsflags = READ_ONCE(sk->sk_tsflags),
> > +             .priority_cmsg_set = 0
> >       };
> >  }
> >
> > diff --git a/net/can/raw.c b/net/can/raw.c
> > index 00533f64d69d..cf7e7ae64cde 100644
> > --- a/net/can/raw.c
> > +++ b/net/can/raw.c
> > @@ -962,7 +962,11 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
> >       }
> >
> >       skb->dev = dev;
> > -     skb->priority = READ_ONCE(sk->sk_priority);
> > +     if (sockc.priority_cmsg_set)
> > +             skb->priority = sockc.priority_cmsg_value;
> > +     else
> > +             skb->priority = READ_ONCE(sk->sk_priority);
> > +
> >       skb->mark = READ_ONCE(sk->sk_mark);
> >       skb->tstamp = sockc.transmit_time;
> >
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index 9abc4fe25953..899bf850b52a 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -2863,6 +2863,18 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
> >       case SCM_RIGHTS:
> >       case SCM_CREDENTIALS:
> >               break;
> > +     case SO_PRIORITY:
> > +             if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
> > +                     return -EINVAL;
> > +
> > +             if ((*(u32 *)CMSG_DATA(cmsg) >= 0 && *(u32 *)CMSG_DATA(cmsg) <= 6) ||
> > +                 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
> > +                 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
> > +                     sockc->priority_cmsg_value = *(u32 *)CMSG_DATA(cmsg);
> > +                     sockc->priority_cmsg_set = 1;
> > +                     break;
> > +             }
>
> What is the magic constant 6 here?

The mechanism for setting the priority value via cmsg mirrors that of
setting the priority value through setsockopt. The control of the
priority value is managed by the sk_setsockopt function, which allows
setting the priority within the range of 0 to 6. However, if the user
has CAP_NET_ADMIN or CAP_NET_RAW capability, they are permitted to set
any priority value without restriction. The specified range of 0 to 6
was selected to align with the existing priority value check.
Willem de Bruijn Oct. 31, 2024, 1:01 p.m. UTC | #3
Anna Nyiri wrote:
> Willem de Bruijn <willemdebruijn.kernel@gmail.com> ezt írta (időpont:
> 2024. okt. 29., K, 16:15):
> >
> > Anna Emese Nyiri wrote:
> > > The Linux socket API currently supports setting SO_PRIORITY at the socket
> > > level, which applies a uniform priority to all packets sent through that
> > > socket. The only exception is IP_TOS, if that is specified as ancillary
> > > data, the packet does not inherit the socket's priority. Instead, the
> > > priority value is computed when handling the ancillary data (as implemented
> > > in commit <f02db315b8d888570cb0d4496cfbb7e4acb047cb>: "ipv4: IP_TOS
> > > and IP_TTL can be specified as ancillary data").
> >
> > Please use commit format <$SHA1:12> ("subject"). Checkpatch might also
> > flag this.
> >
> > > Currently, there is no option to set the priority directly from userspace
> > > on a per-packet basis. The following changes allow SO_PRIORITY to be set
> > > through control messages (CMSG), giving userspace applications more
> > > granular control over packet priorities.
> > >
> > > This patch enables setting skb->priority using CMSG. If SO_PRIORITY is
> > > specified as ancillary data, the packet is sent with the priority value
> > > set through sockc->priority_cmsg_value, overriding the socket-level
> > > values set via the traditional setsockopt() method.
> >
> > Please also describe how this interacts with priority set from IP_TOS or
> > IPV6_TCLASS.
> >
> > > This is analogous to
> > > existing support for SO_MARK (as implemented in commit
> > > <c6af0c227a22bb6bb8ff72f043e0fb6d99fd6515>, “ip: support SO_MARK
> > > cmsg”).
> > >
> > > Suggested-by: Ferenc Fejes <fejes@inf.elte.hu>
> > > Signed-off-by: Anna Emese Nyiri <annaemesenyiri@gmail.com>
> > > ---
> > >  include/net/inet_sock.h |  2 ++
> > >  include/net/sock.h      |  5 ++++-
> > >  net/can/raw.c           |  6 +++++-
> > >  net/core/sock.c         | 12 ++++++++++++
> > >  net/ipv4/ip_output.c    | 11 ++++++++++-
> > >  net/ipv4/raw.c          |  5 ++++-
> > >  net/ipv6/ip6_output.c   |  8 +++++++-
> > >  net/ipv6/raw.c          |  6 +++++-
> > >  net/packet/af_packet.c  |  6 +++++-
> > >  9 files changed, 54 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
> > > index f9ddd47dc4f8..9d4e4e2a8232 100644
> > > --- a/include/net/inet_sock.h
> > > +++ b/include/net/inet_sock.h
> > > @@ -175,6 +175,8 @@ struct inet_cork {
> > >       __u16                   gso_size;
> > >       u64                     transmit_time;
> > >       u32                     mark;
> > > +     __u8            priority_cmsg_set;
> > > +     u32                     priority_cmsg_value;
> >
> > Just priority, drop the cmsg value.
> >
> > Instead of an explicit "is set" bit, preferred is to initialize the
> > cookie field from the sock. See sockcm_init(), below, and also
> > ipcm_init_sk(). That also avoids the branches later in the datapath.
> >
> > >  };
> > >
> > >  struct inet_cork_full {
> > > diff --git a/include/net/sock.h b/include/net/sock.h
> > > index cce23ac4d514..e02170977165 100644
> > > --- a/include/net/sock.h
> > > +++ b/include/net/sock.h
> > > @@ -1794,13 +1794,16 @@ struct sockcm_cookie {
> > >       u64 transmit_time;
> > >       u32 mark;
> > >       u32 tsflags;
> > > +     u32 priority_cmsg_value;
> > > +     u8 priority_cmsg_set;
> > >  };
> > >
> > >  static inline void sockcm_init(struct sockcm_cookie *sockc,
> > >                              const struct sock *sk)
> > >  {
> > >       *sockc = (struct sockcm_cookie) {
> > > -             .tsflags = READ_ONCE(sk->sk_tsflags)
> > > +             .tsflags = READ_ONCE(sk->sk_tsflags),
> > > +             .priority_cmsg_set = 0
> > >       };
> > >  }
> > >
> > > diff --git a/net/can/raw.c b/net/can/raw.c
> > > index 00533f64d69d..cf7e7ae64cde 100644
> > > --- a/net/can/raw.c
> > > +++ b/net/can/raw.c
> > > @@ -962,7 +962,11 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
> > >       }
> > >
> > >       skb->dev = dev;
> > > -     skb->priority = READ_ONCE(sk->sk_priority);
> > > +     if (sockc.priority_cmsg_set)
> > > +             skb->priority = sockc.priority_cmsg_value;
> > > +     else
> > > +             skb->priority = READ_ONCE(sk->sk_priority);
> > > +
> > >       skb->mark = READ_ONCE(sk->sk_mark);
> > >       skb->tstamp = sockc.transmit_time;
> > >
> > > diff --git a/net/core/sock.c b/net/core/sock.c
> > > index 9abc4fe25953..899bf850b52a 100644
> > > --- a/net/core/sock.c
> > > +++ b/net/core/sock.c
> > > @@ -2863,6 +2863,18 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
> > >       case SCM_RIGHTS:
> > >       case SCM_CREDENTIALS:
> > >               break;
> > > +     case SO_PRIORITY:
> > > +             if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
> > > +                     return -EINVAL;
> > > +
> > > +             if ((*(u32 *)CMSG_DATA(cmsg) >= 0 && *(u32 *)CMSG_DATA(cmsg) <= 6) ||
> > > +                 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
> > > +                 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
> > > +                     sockc->priority_cmsg_value = *(u32 *)CMSG_DATA(cmsg);
> > > +                     sockc->priority_cmsg_set = 1;
> > > +                     break;
> > > +             }
> >
> > What is the magic constant 6 here?
> 
> The mechanism for setting the priority value via cmsg mirrors that of
> setting the priority value through setsockopt. The control of the
> priority value is managed by the sk_setsockopt function, which allows
> setting the priority within the range of 0 to 6. However, if the user
> has CAP_NET_ADMIN or CAP_NET_RAW capability, they are permitted to set
> any priority value without restriction. The specified range of 0 to 6
> was selected to align with existing priority value check.

Oh right. This is just copied from setsockopt SO_PRIORITY.
Having a non-annotated constant there is unfortunate too, but goes
back to before the introduction of git.

And that goes back to the priority bands configured with
rt_tos2priority, as setsockopt IP_TOS is not a privileged operation.

Ideally this would say TC_PRIO_BESTEFFORT and TC_PRIO_INTERACTIVE.

Since both the setsockopt and cmsg check are in net/core/sock.c,
can we deduplicate the logic and introduce a helper:

    /*
     * Check whether the caller may set packet priority @val on @sk.
     *
     * Values within [TC_PRIO_BESTEFFORT, TC_PRIO_INTERACTIVE] are allowed for
     * any caller; anything outside that range requires CAP_NET_RAW or
     * CAP_NET_ADMIN in the user namespace of the socket's network namespace.
     * Intended to be shared by the setsockopt and cmsg SO_PRIORITY paths.
     */
    static bool sk_set_prio_allowed(const struct sock *sk, int val)
    {
            return (val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
                   sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
                   sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN);
    }
kernel test robot Nov. 1, 2024, 3:14 p.m. UTC | #4
Hi Anna,

kernel test robot noticed the following build warnings:

[auto build test WARNING on mkl-can-next/testing]
[also build test WARNING on linus/master v6.12-rc5]
[cannot apply to net-next/main horms-ipvs/master next-20241101]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Anna-Emese-Nyiri/support-SO_PRIORITY-cmsg/20241029-224326
base:   https://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next.git testing
patch link:    https://lore.kernel.org/r/20241029144142.31382-1-annaemesenyiri%40gmail.com
patch subject: [PATCH net-next] support SO_PRIORITY cmsg
config: i386-randconfig-141-20241101 (https://download.01.org/0day-ci/archive/20241101/202411012324.a9SOqSqV-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411012324.a9SOqSqV-lkp@intel.com/

smatch warnings:
net/core/sock.c:2870 __sock_cmsg_send() warn: always true condition '(*(cmsg + 12) >= 0) => (0-u32max >= 0)'

vim +2870 net/core/sock.c

  2828	
  2829	int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
  2830			     struct sockcm_cookie *sockc)
  2831	{
  2832		u32 tsflags;
  2833	
  2834		switch (cmsg->cmsg_type) {
  2835		case SO_MARK:
  2836			if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
  2837			    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
  2838				return -EPERM;
  2839			if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
  2840				return -EINVAL;
  2841			sockc->mark = *(u32 *)CMSG_DATA(cmsg);
  2842			break;
  2843		case SO_TIMESTAMPING_OLD:
  2844		case SO_TIMESTAMPING_NEW:
  2845			if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
  2846				return -EINVAL;
  2847	
  2848			tsflags = *(u32 *)CMSG_DATA(cmsg);
  2849			if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
  2850				return -EINVAL;
  2851	
  2852			sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
  2853			sockc->tsflags |= tsflags;
  2854			break;
  2855		case SCM_TXTIME:
  2856			if (!sock_flag(sk, SOCK_TXTIME))
  2857				return -EINVAL;
  2858			if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
  2859				return -EINVAL;
  2860			sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
  2861			break;
  2862		/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
  2863		case SCM_RIGHTS:
  2864		case SCM_CREDENTIALS:
  2865			break;
  2866		case SO_PRIORITY:
  2867			if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
  2868				return -EINVAL;
  2869	
> 2870			if ((*(u32 *)CMSG_DATA(cmsg) >= 0 && *(u32 *)CMSG_DATA(cmsg) <= 6) ||
  2871			    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
  2872			    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
  2873				sockc->priority_cmsg_value = *(u32 *)CMSG_DATA(cmsg);
  2874				sockc->priority_cmsg_set = 1;
  2875				break;
  2876			}
  2877			return -EPERM;
  2878		default:
  2879			return -EINVAL;
  2880		}
  2881		return 0;
  2882	}
  2883	EXPORT_SYMBOL(__sock_cmsg_send);
  2884
diff mbox series

Patch

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index f9ddd47dc4f8..9d4e4e2a8232 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -175,6 +175,8 @@  struct inet_cork {
 	__u16			gso_size;
 	u64			transmit_time;
 	u32			mark;
+	__u8		priority_cmsg_set;
+	u32			priority_cmsg_value;
 };
 
 struct inet_cork_full {
diff --git a/include/net/sock.h b/include/net/sock.h
index cce23ac4d514..e02170977165 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1794,13 +1794,16 @@  struct sockcm_cookie {
 	u64 transmit_time;
 	u32 mark;
 	u32 tsflags;
+	u32 priority_cmsg_value;
+	u8 priority_cmsg_set;
 };
 
 static inline void sockcm_init(struct sockcm_cookie *sockc,
 			       const struct sock *sk)
 {
 	*sockc = (struct sockcm_cookie) {
-		.tsflags = READ_ONCE(sk->sk_tsflags)
+		.tsflags = READ_ONCE(sk->sk_tsflags),
+		.priority_cmsg_set = 0
 	};
 }
 
diff --git a/net/can/raw.c b/net/can/raw.c
index 00533f64d69d..cf7e7ae64cde 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -962,7 +962,11 @@  static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 	}
 
 	skb->dev = dev;
-	skb->priority = READ_ONCE(sk->sk_priority);
+	if (sockc.priority_cmsg_set)
+		skb->priority = sockc.priority_cmsg_value;
+	else
+		skb->priority = READ_ONCE(sk->sk_priority);
+
 	skb->mark = READ_ONCE(sk->sk_mark);
 	skb->tstamp = sockc.transmit_time;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 9abc4fe25953..899bf850b52a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2863,6 +2863,18 @@  int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
 	case SCM_RIGHTS:
 	case SCM_CREDENTIALS:
 		break;
+	case SO_PRIORITY:
+		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+			return -EINVAL;
+
+		if ((*(u32 *)CMSG_DATA(cmsg) >= 0 && *(u32 *)CMSG_DATA(cmsg) <= 6) ||
+		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
+		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+			sockc->priority_cmsg_value = *(u32 *)CMSG_DATA(cmsg);
+			sockc->priority_cmsg_set = 1;
+			break;
+		}
+		return -EPERM;
 	default:
 		return -EINVAL;
 	}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b90d0f78ac80..0e44ebd031f7 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1322,6 +1322,8 @@  static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 	cork->ttl = ipc->ttl;
 	cork->tos = ipc->tos;
 	cork->mark = ipc->sockc.mark;
+	cork->priority_cmsg_value = ipc->sockc.priority_cmsg_value;
+	cork->priority_cmsg_set = ipc->sockc.priority_cmsg_set;
 	cork->priority = ipc->priority;
 	cork->transmit_time = ipc->sockc.transmit_time;
 	cork->tx_flags = 0;
@@ -1455,8 +1457,15 @@  struct sk_buff *__ip_make_skb(struct sock *sk,
 		ip_options_build(skb, opt, cork->addr, rt);
 	}
 
-	skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority);
+	if (cork->tos != -1)
+		skb->priority = cork->priority;
+	else if (cork->priority_cmsg_set)
+		skb->priority = cork->priority_cmsg_value;
+	else
+		skb->priority = READ_ONCE(sk->sk_priority);
+
 	skb->mark = cork->mark;
+
 	if (sk_is_tcp(sk))
 		skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC);
 	else
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 474dfd263c8b..bbe481dc98a9 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -358,7 +358,10 @@  static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 	skb_reserve(skb, hlen);
 
 	skb->protocol = htons(ETH_P_IP);
-	skb->priority = READ_ONCE(sk->sk_priority);
+	if (sockc->priority_cmsg_set)
+		skb->priority = sockc->priority_cmsg_value;
+	else
+		skb->priority = READ_ONCE(sk->sk_priority);
 	skb->mark = sockc->mark;
 	skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid);
 	skb_dst_set(skb, &rt->dst);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index f26841f1490f..4c4f4b76ef90 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1401,6 +1401,8 @@  static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
 	cork->base.gso_size = ipc6->gso_size;
 	cork->base.tx_flags = 0;
 	cork->base.mark = ipc6->sockc.mark;
+	cork->base.priority_cmsg_set = ipc6->sockc.priority_cmsg_set;
+	cork->base.priority_cmsg_value = ipc6->sockc.priority_cmsg_value;
 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
 
 	cork->base.length = 0;
@@ -1931,7 +1933,11 @@  struct sk_buff *__ip6_make_skb(struct sock *sk,
 	hdr->saddr = fl6->saddr;
 	hdr->daddr = *final_dst;
 
-	skb->priority = READ_ONCE(sk->sk_priority);
+	if (cork->base.priority_cmsg_set)
+		skb->priority = cork->base.priority_cmsg_value;
+	else
+		skb->priority = READ_ONCE(sk->sk_priority);
+
 	skb->mark = cork->base.mark;
 	if (sk_is_tcp(sk))
 		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 608fa9d05b55..6944dc3ec4c9 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -619,7 +619,11 @@  static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
 	skb_reserve(skb, hlen);
 
 	skb->protocol = htons(ETH_P_IPV6);
-	skb->priority = READ_ONCE(sk->sk_priority);
+	if (sockc->priority_cmsg_set)
+		skb->priority = sockc->priority_cmsg_value;
+	else
+		skb->priority = READ_ONCE(sk->sk_priority);
+
 	skb->mark = sockc->mark;
 	skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid);
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 4a364cdd445e..8b7924f775a4 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3125,7 +3125,11 @@  static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 
 	skb->protocol = proto;
 	skb->dev = dev;
-	skb->priority = READ_ONCE(sk->sk_priority);
+	if (sockc.priority_cmsg_set)
+		skb->priority = sockc.priority_cmsg_value;
+	else
+		skb->priority = READ_ONCE(sk->sk_priority);
+
 	skb->mark = sockc.mark;
 	skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid);