diff mbox series

[RFCv2,1/9] tcp: authopt: Initial support and key management

Message ID 67c1471683200188b96a3f712dd2e8def7978462.1628544649.git.cdleonard@gmail.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series tcp: Initial support for RFC5925 auth option | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Guessed tree name to be net-next
netdev/subject_prefix warning Target tree name not specified in the subject
netdev/cc_maintainers success CCed 6 of 6 maintainers
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit fail Errors and warnings before: 2513 this patch: 2378
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning CHECK: Please use a blank line after function/struct/union/enum declarations WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns
netdev/build_allmodconfig_warn fail Errors and warnings before: 2594 this patch: 2430
netdev/header_inline success Link

Commit Message

Leonard Crestez Aug. 9, 2021, 9:35 p.m. UTC
This commit add support to add and remove keys but does not use them
further.

Similar to tcp md5 a single point to a struct tcp_authopt_info* struct
is added to struct tcp_sock in order to avoid increasing memory usage
for everybody. The data structures related to tcp_authopt are
initialized on setsockopt and only removed on socket close.

Signed-off-by: Leonard Crestez <cdleonard@gmail.com>
---
 include/linux/tcp.h       |   6 ++
 include/net/tcp.h         |   1 +
 include/net/tcp_authopt.h |  55 ++++++++++++
 include/uapi/linux/tcp.h  |  72 ++++++++++++++++
 net/ipv4/Kconfig          |  14 ++++
 net/ipv4/Makefile         |   1 +
 net/ipv4/tcp.c            |  27 ++++++
 net/ipv4/tcp_authopt.c    | 172 ++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c       |   2 +
 9 files changed, 350 insertions(+)
 create mode 100644 include/net/tcp_authopt.h
 create mode 100644 net/ipv4/tcp_authopt.c

Comments

Dmitry Safonov Aug. 10, 2021, 8:41 p.m. UTC | #1
Hi Leonard,

On Tue, 10 Aug 2021 at 02:50, Leonard Crestez <cdleonard@gmail.com> wrote:
[..]
> +/* Representation of a Master Key Tuple as per RFC5925 */
> +struct tcp_authopt_key_info {
> +       struct hlist_node node;
> +       /* Local identifier */
> +       u32 local_id;

There is no local_id in RFC5925, what's that?
An MKT is identified by (send_id, recv_id), together with
(src_addr/src_port, dst_addr/dst_port).
Why introducing something new to already complicated RFC?

> +       u32 flags;
> +       /* Wire identifiers */
> +       u8 send_id, recv_id;
> +       u8 alg_id;
> +       u8 keylen;
> +       u8 key[TCP_AUTHOPT_MAXKEYLEN];
> +       struct rcu_head rcu;

This is unaligned and will add padding.
I wonder if it's also worth saving some bytes by something like
struct tcp_ao_key *key;

With
struct tcp_ao_key {
      u8 keylen;
      u8 key[0];
};

Hmm?

> +       struct sockaddr_storage addr;
> +};
> +
> +/* Per-socket information regarding tcp_authopt */
> +struct tcp_authopt_info {
> +       /* List of tcp_authopt_key_info */
> +       struct hlist_head head;
> +       u32 flags;
> +       u32 src_isn;
> +       u32 dst_isn;
> +       struct rcu_head rcu;

Ditto, adds padding on 64-bit.

[..]
> +++ b/include/uapi/linux/tcp.h
> @@ -126,10 +126,12 @@ enum {
>  #define TCP_INQ                        36      /* Notify bytes available to read as a cmsg on read */
>
>  #define TCP_CM_INQ             TCP_INQ
>
>  #define TCP_TX_DELAY           37      /* delay outgoing packets by XX usec */
> +#define TCP_AUTHOPT                    38      /* TCP Authentication Option (RFC2385) */
> +#define TCP_AUTHOPT_KEY                39      /* TCP Authentication Option update key (RFC2385) */

RFC2385 is md5 one.
Also, should there be TCP_AUTHOPT_ADD_KEY, TCP_AUTHOPT_DELTE_KEY
instead of using flags inside setsocketopt()? (no hard feelings)

[..]
> +/**
> + * enum tcp_authopt_flag - flags for `tcp_authopt.flags`
> + */
> +enum tcp_authopt_flag {
> +       /**
> +        * @TCP_AUTHOPT_FLAG_REJECT_UNEXPECTED:
> +        *      Configure behavior of segments with TCP-AO coming from hosts for which no
> +        *      key is configured. The default recommended by RFC is to silently accept
> +        *      such connections.
> +        */
> +       TCP_AUTHOPT_FLAG_REJECT_UNEXPECTED = (1 << 2),
> +};
> +
> +/**
> + * struct tcp_authopt - Per-socket options related to TCP Authentication Option
> + */
> +struct tcp_authopt {
> +       /** @flags: Combination of &enum tcp_authopt_flag */
> +       __u32   flags;
> +};

I'm not sure what's the use of enum here, probably:
#define TCP_AUTHOPT_FLAG_REJECT_UNEXPECTED BIT(2)

[..]
> +struct tcp_authopt_key {
> +       /** @flags: Combination of &enum tcp_authopt_key_flag */
> +       __u32   flags;
> +       /** @local_id: Local identifier */
> +       __u32   local_id;
> +       /** @send_id: keyid value for send */
> +       __u8    send_id;
> +       /** @recv_id: keyid value for receive */
> +       __u8    recv_id;
> +       /** @alg: One of &enum tcp_authopt_alg */
> +       __u8    alg;
> +       /** @keylen: Length of the key buffer */
> +       __u8    keylen;
> +       /** @key: Secret key */
> +       __u8    key[TCP_AUTHOPT_MAXKEYLEN];
> +       /**
> +        * @addr: Key is only valid for this address
> +        *
> +        * Ignored unless TCP_AUTHOPT_KEY_ADDR_BIND flag is set
> +        */
> +       struct __kernel_sockaddr_storage addr;
> +};

It'll be an ABI if this is accepted. As it is - it can't support RFC5925 fully.
Extending syscall ABI is painful. I think, even the initial ABI *must* support
all possible features of the RFC.
In other words, there must be src_addr, src_port, dst_addr and dst_port.
I can see from docs you've written you don't want to support a mix of different
addr/port MKTs, so you can return -EINVAL or -ENOTSUPP for any value
but zero.
This will create an ABI that can be later extended to fully support RFC.

The same is about key: I don't think you need to define/use tcp_authopt_alg.
Just use algo name - that way TCP-AO will automatically be able to use
any algo supported by crypto engine.
See how xfrm does it, e.g.:
struct xfrm_algo_auth {
    char        alg_name[64];
    unsigned int    alg_key_len;    /* in bits */
    unsigned int    alg_trunc_len;  /* in bits */
    char        alg_key[0];
};

So you can let a user chose maclen instead of hard-coding it.
Much more extendable than what you propose.

[..]
> +#ifdef CONFIG_TCP_AUTHOPT
> +       case TCP_AUTHOPT: {
> +               struct tcp_authopt info;
> +
> +               if (get_user(len, optlen))
> +                       return -EFAULT;
> +
> +               lock_sock(sk);
> +               tcp_get_authopt_val(sk, &info);
> +               release_sock(sk);
> +
> +               len = min_t(unsigned int, len, sizeof(info));
> +               if (put_user(len, optlen))
> +                       return -EFAULT;
> +               if (copy_to_user(optval, &info, len))
> +                       return -EFAULT;
> +               return 0;
> +       }

I'm pretty sure it's not a good choice to write partly tcp_authopt.
And a user has no way to check what's the correct len on this kernel.
Instead of len = min_t(unsigned int, len, sizeof(info)), it should be
if (len != sizeof(info))
    return -EINVAL;

[..]
> +int tcp_set_authopt(struct sock *sk, sockptr_t optval, unsigned int optlen)
> +{
> +       struct tcp_authopt opt;
> +       struct tcp_authopt_info *info;
> +
> +       WARN_ON(!lockdep_sock_is_held(sk));

sock_owned_by_me(sk)

> +
> +       /* If userspace optlen is too short fill the rest with zeros */
> +       if (optlen > sizeof(opt))
> +               return -EINVAL;
> +       memset(&opt, 0, sizeof(opt));

it's 4 bytes, why not just (optlen != sizeof(opt))?

[..]
> +int tcp_get_authopt_val(struct sock *sk, struct tcp_authopt *opt)
> +{
> +       struct tcp_sock *tp = tcp_sk(sk);
> +       struct tcp_authopt_info *info;
> +
> +       WARN_ON(!lockdep_sock_is_held(sk));

sock_owned_by_me(sk)

[..]
> +int tcp_set_authopt_key(struct sock *sk, sockptr_t optval, unsigned int optlen)
> +{
> +       struct tcp_authopt_key opt;
> +       struct tcp_authopt_info *info;
> +       struct tcp_authopt_key_info *key_info;
> +
> +       /* If userspace optlen is too short fill the rest with zeros */
> +       if (optlen > sizeof(opt))
> +               return -EINVAL;
> +       memset(&opt, 0, sizeof(opt));
> +       if (copy_from_sockptr(&opt, optval, optlen))
> +               return -EFAULT;

Again, not a good practice to zero-extend struct. Enforce proper size
with -EINVAL.

[..]
> +       /* Initialize tcp_authopt_info if not already set */
> +       info = __tcp_authopt_info_get_or_create(sk);
> +       if (IS_ERR(info))
> +               return PTR_ERR(info);
> +
> +       /* check key family */
> +       if (opt.flags & TCP_AUTHOPT_KEY_ADDR_BIND) {
> +               if (sk->sk_family != opt.addr.ss_family)
> +                       return -EINVAL;
> +       }

Probably, can be in the reverse order, so that
__tcp_authopt_info_get_or_create()
won't allocate memory.

> +       /* If an old value exists for same local_id it is deleted */
> +       key_info = __tcp_authopt_key_info_lookup(sk, info, opt.local_id);
> +       if (key_info)
> +               tcp_authopt_key_del(sk, info, key_info);
> +       key_info = sock_kmalloc(sk, sizeof(*key_info), GFP_KERNEL | __GFP_ZERO);
> +       if (!key_info)
> +               return -ENOMEM;

1. You don't need sock_kmalloc() together with tcp_authopt_key_del().
    It just frees the memory and allocates it back straight away - no
sense in doing that.
2. I think RFC says you must not allow a user to change an existing key:
> MKT parameters are not changed. Instead, new MKTs can be installed, and a connection
> can change which MKT it uses.

IIUC, it means that one can't just change an existing MKT, but one can
remove and later
add MKT with the same (send_id, recv_id, src_addr/port, dst_addr/port).

So, a reasonable thing to do:
if (key_info)
    return -EEXIST.

Thanks,
             Dmitry
Leonard Crestez Aug. 11, 2021, 8:29 a.m. UTC | #2
On 8/10/21 11:41 PM, Dmitry Safonov wrote:
> Hi Leonard,
> 
> On Tue, 10 Aug 2021 at 02:50, Leonard Crestez <cdleonard@gmail.com> wrote:
> [..]
>> +/* Representation of a Master Key Tuple as per RFC5925 */
>> +struct tcp_authopt_key_info {
>> +       struct hlist_node node;
>> +       /* Local identifier */
>> +       u32 local_id;
> 
> There is no local_id in RFC5925, what's that?
> An MKT is identified by (send_id, recv_id), together with
> (src_addr/src_port, dst_addr/dst_port).
> Why introducing something new to already complicated RFC?

It was there to simplify user interface and initial implementation.

But it seems that BGP listeners already needs to support multiple 
keychains for different peers so identifying the key by (send_id, 
recv_id, binding) is easier for userspace to work with. Otherwise they 
need to create their own local_id mapping internally.

>> +       u32 flags;
>> +       /* Wire identifiers */
>> +       u8 send_id, recv_id;
>> +       u8 alg_id;
>> +       u8 keylen;
>> +       u8 key[TCP_AUTHOPT_MAXKEYLEN];
>> +       struct rcu_head rcu;
> 
> This is unaligned and will add padding.

Not clear padding matters. Moving rcu_head higher would avoid it, is 
that what you're suggesting.

> I wonder if it's also worth saving some bytes by something like
> struct tcp_ao_key *key;
> 
> With
> struct tcp_ao_key {
>        u8 keylen;
>        u8 key[0];
> };
> 
> Hmm?

This increases memory management complexity for very minor gains. Very 
few tcp_authopt_key will ever be created.

>> +       struct sockaddr_storage addr;
>> +};
>> +
>> +/* Per-socket information regarding tcp_authopt */
>> +struct tcp_authopt_info {
>> +       /* List of tcp_authopt_key_info */
>> +       struct hlist_head head;
>> +       u32 flags;
>> +       u32 src_isn;
>> +       u32 dst_isn;
>> +       struct rcu_head rcu;
> 
> Ditto, adds padding on 64-bit.
> 
> [..]
>> +++ b/include/uapi/linux/tcp.h
>> @@ -126,10 +126,12 @@ enum {
>>   #define TCP_INQ                        36      /* Notify bytes available to read as a cmsg on read */
>>
>>   #define TCP_CM_INQ             TCP_INQ
>>
>>   #define TCP_TX_DELAY           37      /* delay outgoing packets by XX usec */
>> +#define TCP_AUTHOPT                    38      /* TCP Authentication Option (RFC2385) */
>> +#define TCP_AUTHOPT_KEY                39      /* TCP Authentication Option update key (RFC2385) */
> 
> RFC2385 is md5 one.
> Also, should there be TCP_AUTHOPT_ADD_KEY, TCP_AUTHOPT_DELTE_KEY
> instead of using flags inside setsocketopt()? (no hard feelings)

Fixed RFC reference.

TCP_AUTHOPT_DELETE_KEY could be clearer, I just wanted to avoid bloating 
the sockopt space. But there's not any scarcity.

For reference tcp_md5 handles key deletion based on keylen == 0. This 
seems wrong to me, an empty key is in fact valid though not realistic.

If local_id is removed in favor of "full match on id and binding" then 
the delete sockopt would still need most of a full struct 
tcp_authopt_key anyway.

> [..]
>> +/**
>> + * enum tcp_authopt_flag - flags for `tcp_authopt.flags`
>> + */
>> +enum tcp_authopt_flag {
>> +       /**
>> +        * @TCP_AUTHOPT_FLAG_REJECT_UNEXPECTED:
>> +        *      Configure behavior of segments with TCP-AO coming from hosts for which no
>> +        *      key is configured. The default recommended by RFC is to silently accept
>> +        *      such connections.
>> +        */
>> +       TCP_AUTHOPT_FLAG_REJECT_UNEXPECTED = (1 << 2),
>> +};
>> +
>> +/**
>> + * struct tcp_authopt - Per-socket options related to TCP Authentication Option
>> + */
>> +struct tcp_authopt {
>> +       /** @flags: Combination of &enum tcp_authopt_flag */
>> +       __u32   flags;
>> +};
> 
> I'm not sure what's the use of enum here, probably: > #define TCP_AUTHOPT_FLAG_REJECT_UNEXPECTED BIT(2)

This is an enum because it looks nice in kernel-doc. I couldn't find a 
way to attach docs to a macro and include it somewhere else. BTW, the 
enum gains more members later.

As for BIT() it doesn't see to be allowed in uapi and there were recent 
changes removing such usage.

> [..]
>> +struct tcp_authopt_key {
>> +       /** @flags: Combination of &enum tcp_authopt_key_flag */
>> +       __u32   flags;
>> +       /** @local_id: Local identifier */
>> +       __u32   local_id;
>> +       /** @send_id: keyid value for send */
>> +       __u8    send_id;
>> +       /** @recv_id: keyid value for receive */
>> +       __u8    recv_id;
>> +       /** @alg: One of &enum tcp_authopt_alg */
>> +       __u8    alg;
>> +       /** @keylen: Length of the key buffer */
>> +       __u8    keylen;
>> +       /** @key: Secret key */
>> +       __u8    key[TCP_AUTHOPT_MAXKEYLEN];
>> +       /**
>> +        * @addr: Key is only valid for this address
>> +        *
>> +        * Ignored unless TCP_AUTHOPT_KEY_ADDR_BIND flag is set
>> +        */
>> +       struct __kernel_sockaddr_storage addr;
>> +};
> 
> It'll be an ABI if this is accepted. As it is - it can't support RFC5925 fully.
> Extending syscall ABI is painful. I think, even the initial ABI *must* support
> all possible features of the RFC.
> In other words, there must be src_addr, src_port, dst_addr and dst_port.
> I can see from docs you've written you don't want to support a mix of different
> addr/port MKTs, so you can return -EINVAL or -ENOTSUPP for any value
> but zero.
> This will create an ABI that can be later extended to fully support RFC.

RFC states that MKT connection identifiers can be specified using ranges 
and wildcards and the details are up to the implementation. Keys are 
*NOT* just bound to a classical TCP 4-tuple.

* src_addr and src_port is implicit in socket binding. Maybe in theory 
src_addr they could apply for a server socket bound to 0.0.0.0:PORT but 
userspace can just open more sockets.
* dst_port is not supported by MD5 and I can't think of any useful 
usecase. This is either well known (179 for BGP) or auto-allocated.
* tcp_md5 was recently enhanced allow a "prefixlen" for addr and 
"l3mdev" ifindex binding.

This last point shows that the binding features people require can't be 
easily predicted in advance so it's better to allow the rules to be 
extended.

> The same is about key: I don't think you need to define/use tcp_authopt_alg.
> Just use algo name - that way TCP-AO will automatically be able to use
> any algo supported by crypto engine.
> See how xfrm does it, e.g.:
> struct xfrm_algo_auth {
>      char        alg_name[64];
>      unsigned int    alg_key_len;    /* in bits */
>      unsigned int    alg_trunc_len;  /* in bits */
>      char        alg_key[0];
> };
> 
> So you can let a user chose maclen instead of hard-coding it.
> Much more extendable than what you propose.

This complicates ABI and implementation with features that are not 
needed. I'd much rather only expose an enum of real-world tcp-ao algorithms.

> [..]
>> +#ifdef CONFIG_TCP_AUTHOPT
>> +       case TCP_AUTHOPT: {
>> +               struct tcp_authopt info;
>> +
>> +               if (get_user(len, optlen))
>> +                       return -EFAULT;
>> +
>> +               lock_sock(sk);
>> +               tcp_get_authopt_val(sk, &info);
>> +               release_sock(sk);
>> +
>> +               len = min_t(unsigned int, len, sizeof(info));
>> +               if (put_user(len, optlen))
>> +                       return -EFAULT;
>> +               if (copy_to_user(optval, &info, len))
>> +                       return -EFAULT;
>> +               return 0;
>> +       }
> 
> I'm pretty sure it's not a good choice to write partly tcp_authopt.
> And a user has no way to check what's the correct len on this kernel.
> Instead of len = min_t(unsigned int, len, sizeof(info)), it should be
> if (len != sizeof(info))
>      return -EINVAL;

Purpose is to allow sockopts to grow as md5 has grown.

> [..]
>> +int tcp_set_authopt(struct sock *sk, sockptr_t optval, unsigned int optlen)
>> +{
>> +       struct tcp_authopt opt;
>> +       struct tcp_authopt_info *info;
>> +
>> +       WARN_ON(!lockdep_sock_is_held(sk));
> 
> sock_owned_by_me(sk)

Yes, this seems better. I think there are several socket locking 
mistakes in later commits. The patch adding support for detailed keyid 
control probably needs bh_lock_sock to avoid races between key selection 
and caching on send and key manipulation by userspace.

>> +       /* If userspace optlen is too short fill the rest with zeros */
>> +       if (optlen > sizeof(opt))
>> +               return -EINVAL;
>> +       memset(&opt, 0, sizeof(opt));
> 
> it's 4 bytes, why not just (optlen != sizeof(opt))?

It's extended in a later commit to add detailed keyid control.

> [..]
>> +int tcp_get_authopt_val(struct sock *sk, struct tcp_authopt *opt)
>> +{
>> +       struct tcp_sock *tp = tcp_sk(sk);
>> +       struct tcp_authopt_info *info;
>> +
>> +       WARN_ON(!lockdep_sock_is_held(sk));
> 
> sock_owned_by_me(sk)
> 
> [..]
>> +int tcp_set_authopt_key(struct sock *sk, sockptr_t optval, unsigned int optlen)
>> +{
>> +       struct tcp_authopt_key opt;
>> +       struct tcp_authopt_info *info;
>> +       struct tcp_authopt_key_info *key_info;
>> +
>> +       /* If userspace optlen is too short fill the rest with zeros */
>> +       if (optlen > sizeof(opt))
>> +               return -EINVAL;
>> +       memset(&opt, 0, sizeof(opt));
>> +       if (copy_from_sockptr(&opt, optval, optlen))
>> +               return -EFAULT;
> 
> Again, not a good practice to zero-extend struct. Enforce proper size
> with -EINVAL.
> 
> [..]
>> +       /* Initialize tcp_authopt_info if not already set */
>> +       info = __tcp_authopt_info_get_or_create(sk);
>> +       if (IS_ERR(info))
>> +               return PTR_ERR(info);
>> +
>> +       /* check key family */
>> +       if (opt.flags & TCP_AUTHOPT_KEY_ADDR_BIND) {
>> +               if (sk->sk_family != opt.addr.ss_family)
>> +                       return -EINVAL;
>> +       }
> 
> Probably, can be in the reverse order, so that
> __tcp_authopt_info_get_or_create()
> won't allocate memory.

OK. It only saves a tiny bit of memory on API misuse.

>> +       /* If an old value exists for same local_id it is deleted */
>> +       key_info = __tcp_authopt_key_info_lookup(sk, info, opt.local_id);
>> +       if (key_info)
>> +               tcp_authopt_key_del(sk, info, key_info);
>> +       key_info = sock_kmalloc(sk, sizeof(*key_info), GFP_KERNEL | __GFP_ZERO);
>> +       if (!key_info)
>> +               return -ENOMEM;
> 
> 1. You don't need sock_kmalloc() together with tcp_authopt_key_del().
>      It just frees the memory and allocates it back straight away - no
> sense in doing that.

The list is scanned in multiple places in later commits using nothing 
but an rcu_read_lock, this means that keys can't be updated in-place.

> 2. I think RFC says you must not allow a user to change an existing key:
>> MKT parameters are not changed. Instead, new MKTs can be installed, and a connection
>> can change which MKT it uses.
> 
> IIUC, it means that one can't just change an existing MKT, but one can
> remove and later
> add MKT with the same (send_id, recv_id, src_addr/port, dst_addr/port).
> 
> So, a reasonable thing to do:
> if (key_info)
>      return -EEXIST.

You're right, making the user delete keys explicitly is better.

--
Regards,
Leonard
David Ahern Aug. 11, 2021, 1:42 p.m. UTC | #3
On 8/11/21 2:29 AM, Leonard Crestez wrote:
> On 8/10/21 11:41 PM, Dmitry Safonov wrote:
>> Hi Leonard,
>>
>> On Tue, 10 Aug 2021 at 02:50, Leonard Crestez <cdleonard@gmail.com>
>> wrote:
>> [..]
>>> +/* Representation of a Master Key Tuple as per RFC5925 */
>>> +struct tcp_authopt_key_info {
>>> +       struct hlist_node node;
>>> +       /* Local identifier */
>>> +       u32 local_id;
>>
>> There is no local_id in RFC5925, what's that?
>> An MKT is identified by (send_id, recv_id), together with
>> (src_addr/src_port, dst_addr/dst_port).
>> Why introducing something new to already complicated RFC?
> 
> It was there to simplify user interface and initial implementation.
> 
> But it seems that BGP listeners already needs to support multiple
> keychains for different peers so identifying the key by (send_id,
> recv_id, binding) is easier for userspace to work with. Otherwise they
> need to create their own local_id mapping internally.
> 

any proposed simplification needs to be well explained and how it
relates to the RFC spec.
Dmitry Safonov Aug. 11, 2021, 2:31 p.m. UTC | #4
On 8/11/21 9:29 AM, Leonard Crestez wrote:
> On 8/10/21 11:41 PM, Dmitry Safonov wrote:
[..]
>>> +       u32 flags;
>>> +       /* Wire identifiers */
>>> +       u8 send_id, recv_id;
>>> +       u8 alg_id;
>>> +       u8 keylen;
>>> +       u8 key[TCP_AUTHOPT_MAXKEYLEN];
>>> +       struct rcu_head rcu;
>>
>> This is unaligned and will add padding.
> 
> Not clear padding matters. Moving rcu_head higher would avoid it, is
> that what you're suggesting.

Yes.

>> I wonder if it's also worth saving some bytes by something like
>> struct tcp_ao_key *key;
>>
>> With
>> struct tcp_ao_key {
>>        u8 keylen;
>>        u8 key[0];
>> };
>>
>> Hmm?
> 
> This increases memory management complexity for very minor gains. Very
> few tcp_authopt_key will ever be created.

The change doesn't seem to be big, like:
--- a/net/ipv4/tcp_authopt.c
+++ b/net/ipv4/tcp_authopt.c
@@ -422,8 +422,16 @@ int tcp_set_authopt_key(struct sock *sk, sockptr_t
optval, unsig>
        key_info = __tcp_authopt_key_info_lookup(sk, info, opt.local_id);
        if (key_info)
                tcp_authopt_key_del(sk, info, key_info);
+
+       key = sock_kmalloc(sk, sizeof(*key) + opt.keylen, GFP_KERNEL |
__GFP_ZERO);
+       if (!key) {
+               tcp_authopt_alg_release(alg);
+               return -ENOMEM;
+       }
+
        key_info = sock_kmalloc(sk, sizeof(*key_info), GFP_KERNEL |
__GFP_ZERO);
        if (!key_info) {
+               sock_kfree_s(sk, key, sizeof(*key) + opt.keylen);
                tcp_authopt_alg_release(alg);
                return -ENOMEM;
        }

I don't know, probably it'll be enough for every user to limit their
keys by length of 80, but if one day it won't be enough - this ABI will
be painful to extend.

[..]
>>> +#define TCP_AUTHOPT                    38      /* TCP Authentication
>>> Option (RFC2385) */
>>> +#define TCP_AUTHOPT_KEY                39      /* TCP Authentication
>>> Option update key (RFC2385) */
>>
>> RFC2385 is md5 one.
>> Also, should there be TCP_AUTHOPT_ADD_KEY, TCP_AUTHOPT_DELTE_KEY
>> instead of using flags inside setsocketopt()? (no hard feelings)
> 
> Fixed RFC reference.
> 
> TCP_AUTHOPT_DELETE_KEY could be clearer, I just wanted to avoid bloating
> the sockopt space. But there's not any scarcity.
> 
> For reference tcp_md5 handles key deletion based on keylen == 0. This
> seems wrong to me, an empty key is in fact valid though not realistic.
> 
> If local_id is removed in favor of "full match on id and binding" then
> the delete sockopt would still need most of a full struct
> tcp_authopt_key anyway.

Sounds like a plan.

[..]>> I'm not sure what's the use of enum here, probably: > #define
>> TCP_AUTHOPT_FLAG_REJECT_UNEXPECTED BIT(2)
> 
> This is an enum because it looks nice in kernel-doc. I couldn't find a
> way to attach docs to a macro and include it somewhere else.

Yeah, ok, seems like good justification.

> BTW, the enum gains more members later.
> 
> As for BIT() it doesn't see to be allowed in uapi and there were recent
> changes removing such usage.

Ok, I just saw it's still used in include/uapi, but not aware of the
removal.

> 
>> [..]
>>> +struct tcp_authopt_key {
>>> +       /** @flags: Combination of &enum tcp_authopt_key_flag */
>>> +       __u32   flags;
>>> +       /** @local_id: Local identifier */
>>> +       __u32   local_id;
>>> +       /** @send_id: keyid value for send */
>>> +       __u8    send_id;
>>> +       /** @recv_id: keyid value for receive */
>>> +       __u8    recv_id;
>>> +       /** @alg: One of &enum tcp_authopt_alg */
>>> +       __u8    alg;
>>> +       /** @keylen: Length of the key buffer */
>>> +       __u8    keylen;
>>> +       /** @key: Secret key */
>>> +       __u8    key[TCP_AUTHOPT_MAXKEYLEN];
>>> +       /**
>>> +        * @addr: Key is only valid for this address
>>> +        *
>>> +        * Ignored unless TCP_AUTHOPT_KEY_ADDR_BIND flag is set
>>> +        */
>>> +       struct __kernel_sockaddr_storage addr;
>>> +};
>>
>> It'll be an ABI if this is accepted. As it is - it can't support
>> RFC5925 fully.
>> Extending syscall ABI is painful. I think, even the initial ABI *must*
>> support
>> all possible features of the RFC.
>> In other words, there must be src_addr, src_port, dst_addr and dst_port.
>> I can see from docs you've written you don't want to support a mix of
>> different
>> addr/port MKTs, so you can return -EINVAL or -ENOTSUPP for any value
>> but zero.
>> This will create an ABI that can be later extended to fully support RFC.
> 
> RFC states that MKT connection identifiers can be specified using ranges
> and wildcards and the details are up to the implementation. Keys are
> *NOT* just bound to a classical TCP 4-tuple.
> 
> * src_addr and src_port is implicit in socket binding. Maybe in theory
> src_addr they could apply for a server socket bound to 0.0.0.0:PORT but
> userspace can just open more sockets.
> * dst_port is not supported by MD5 and I can't think of any useful
> usecase. This is either well known (179 for BGP) or auto-allocated.
> * tcp_md5 was recently enhanced allow a "prefixlen" for addr and
> "l3mdev" ifindex binding.
> 
> This last point shows that the binding features people require can't be
> easily predicted in advance so it's better to allow the rules to be
> extended.

Yeah, I see both changes you mention went on easy way as they reused
existing paddings in the ABI structures.
Ok, if you don't want to reserve src_addr/src_port/dst_addr/dst_port,
than how about reserving some space for those instead?

>> The same is about key: I don't think you need to define/use
>> tcp_authopt_alg.
>> Just use algo name - that way TCP-AO will automatically be able to use
>> any algo supported by crypto engine.
>> See how xfrm does it, e.g.:
>> struct xfrm_algo_auth {
>>      char        alg_name[64];
>>      unsigned int    alg_key_len;    /* in bits */
>>      unsigned int    alg_trunc_len;  /* in bits */
>>      char        alg_key[0];
>> };
>>
>> So you can let a user chose maclen instead of hard-coding it.
>> Much more extendable than what you propose.
> 
> This complicates ABI and implementation with features that are not
> needed. I'd much rather only expose an enum of real-world tcp-ao
> algorithms.

I see it exactly the opposite way: a new enum unnecessary complicates
ABI, instead of passing alg_name[] to crypto engine. No need to add any
support in tcp-ao as the algorithms are already provided by kernel.
That is how it transparently works for ipsec, why not for tcp-ao?

> 
>> [..]
>>> +#ifdef CONFIG_TCP_AUTHOPT
>>> +       case TCP_AUTHOPT: {
>>> +               struct tcp_authopt info;
>>> +
>>> +               if (get_user(len, optlen))
>>> +                       return -EFAULT;
>>> +
>>> +               lock_sock(sk);
>>> +               tcp_get_authopt_val(sk, &info);
>>> +               release_sock(sk);
>>> +
>>> +               len = min_t(unsigned int, len, sizeof(info));
>>> +               if (put_user(len, optlen))
>>> +                       return -EFAULT;
>>> +               if (copy_to_user(optval, &info, len))
>>> +                       return -EFAULT;
>>> +               return 0;
>>> +       }
>>
>> I'm pretty sure it's not a good choice to write partly tcp_authopt.
>> And a user has no way to check what's the correct len on this kernel.
>> Instead of len = min_t(unsigned int, len, sizeof(info)), it should be
>> if (len != sizeof(info))
>>      return -EINVAL;
> 
> Purpose is to allow sockopts to grow as md5 has grown.

md5 has not grown. See above.

Another issue with your approach

+       /* If userspace optlen is too short fill the rest with zeros */
+       if (optlen > sizeof(opt))
+               return -EINVAL;
+       memset(&opt, 0, sizeof(opt));
+       if (copy_from_sockptr(&opt, optval, optlen))
+               return -EFAULT;

is that userspace compiled with updated/grew structure will fail on
older kernel. So, no extension without breaking something is possible.
Which also reminds me that currently you don't validate (opt.flags) for
unknown by kernel flags.

Extending syscalls is impossible without breaking userspace if ABI is
not designed with extensibility in mind. That was quite a big problem,
and still is. Please, read this carefully:
https://lwn.net/Articles/830666/

That is why I'm suggesting you all those changes that will be harder to
fix when/if your patches get accepted.
As an example how it should work see in copy_clone_args_from_user().

[..]

Thanks,
         Dmitry
David Ahern Aug. 11, 2021, 5:15 p.m. UTC | #5
On 8/11/21 8:31 AM, Dmitry Safonov wrote:
> On 8/11/21 9:29 AM, Leonard Crestez wrote:
>> On 8/10/21 11:41 PM, Dmitry Safonov wrote:
> [..]
>>>> +       u32 flags;
>>>> +       /* Wire identifiers */
>>>> +       u8 send_id, recv_id;
>>>> +       u8 alg_id;
>>>> +       u8 keylen;
>>>> +       u8 key[TCP_AUTHOPT_MAXKEYLEN];
>>>> +       struct rcu_head rcu;
>>>
>>> This is unaligned and will add padding.
>>
>> Not clear padding matters. Moving rcu_head higher would avoid it, is
>> that what you're suggesting.
> 
> Yes.
> 
>>> I wonder if it's also worth saving some bytes by something like
>>> struct tcp_ao_key *key;
>>>
>>> With
>>> struct tcp_ao_key {
>>>        u8 keylen;
>>>        u8 key[0];
>>> };
>>>
>>> Hmm?
>>
>> This increases memory management complexity for very minor gains. Very
>> few tcp_authopt_key will ever be created.
> 
> The change doesn't seem to be big, like:
> --- a/net/ipv4/tcp_authopt.c
> +++ b/net/ipv4/tcp_authopt.c
> @@ -422,8 +422,16 @@ int tcp_set_authopt_key(struct sock *sk, sockptr_t
> optval, unsig>
>         key_info = __tcp_authopt_key_info_lookup(sk, info, opt.local_id);
>         if (key_info)
>                 tcp_authopt_key_del(sk, info, key_info);
> +
> +       key = sock_kmalloc(sk, sizeof(*key) + opt.keylen, GFP_KERNEL |
> __GFP_ZERO);
> +       if (!key) {
> +               tcp_authopt_alg_release(alg);
> +               return -ENOMEM;
> +       }
> +
>         key_info = sock_kmalloc(sk, sizeof(*key_info), GFP_KERNEL |
> __GFP_ZERO);
>         if (!key_info) {
> +               sock_kfree_s(sk, key, sizeof(*key) + opt.keylen);
>                 tcp_authopt_alg_release(alg);
>                 return -ENOMEM;
>         }
> 
> I don't know, probably it'll be enough for every user to limit their
> keys by length of 80, but if one day it won't be enough - this ABI will
> be painful to extend.
> 
> [..]
>>>> +#define TCP_AUTHOPT                    38      /* TCP Authentication
>>>> Option (RFC2385) */
>>>> +#define TCP_AUTHOPT_KEY                39      /* TCP Authentication
>>>> Option update key (RFC2385) */
>>>
>>> RFC2385 is md5 one.
>>> Also, should there be TCP_AUTHOPT_ADD_KEY, TCP_AUTHOPT_DELTE_KEY
>>> instead of using flags inside setsocketopt()? (no hard feelings)
>>
>> Fixed RFC reference.
>>
>> TCP_AUTHOPT_DELETE_KEY could be clearer, I just wanted to avoid bloating
>> the sockopt space. But there's not any scarcity.
>>
>> For reference tcp_md5 handles key deletion based on keylen == 0. This
>> seems wrong to me, an empty key is in fact valid though not realistic.
>>
>> If local_id is removed in favor of "full match on id and binding" then
>> the delete sockopt would still need most of a full struct
>> tcp_authopt_key anyway.
> 
> Sounds like a plan.
> 
> [..]>> I'm not sure what's the use of enum here, probably: > #define
>>> TCP_AUTHOPT_FLAG_REJECT_UNEXPECTED BIT(2)
>>
>> This is an enum because it looks nice in kernel-doc. I couldn't find a
>> way to attach docs to a macro and include it somewhere else.
> 
> Yeah, ok, seems like good justification.
> 
>> BTW, the enum gains more members later.
>>
>> As for BIT() it doesn't see to be allowed in uapi and there were recent
>> changes removing such usage.
> 
> Ok, I just saw it's still used in include/uapi, but not aware of the
> removal.
> 
>>
>>> [..]
>>>> +struct tcp_authopt_key {
>>>> +       /** @flags: Combination of &enum tcp_authopt_key_flag */
>>>> +       __u32   flags;
>>>> +       /** @local_id: Local identifier */
>>>> +       __u32   local_id;
>>>> +       /** @send_id: keyid value for send */
>>>> +       __u8    send_id;
>>>> +       /** @recv_id: keyid value for receive */
>>>> +       __u8    recv_id;
>>>> +       /** @alg: One of &enum tcp_authopt_alg */
>>>> +       __u8    alg;
>>>> +       /** @keylen: Length of the key buffer */
>>>> +       __u8    keylen;
>>>> +       /** @key: Secret key */
>>>> +       __u8    key[TCP_AUTHOPT_MAXKEYLEN];
>>>> +       /**
>>>> +        * @addr: Key is only valid for this address
>>>> +        *
>>>> +        * Ignored unless TCP_AUTHOPT_KEY_ADDR_BIND flag is set
>>>> +        */
>>>> +       struct __kernel_sockaddr_storage addr;
>>>> +};
>>>
>>> It'll be an ABI if this is accepted. As it is - it can't support
>>> RFC5925 fully.
>>> Extending syscall ABI is painful. I think, even the initial ABI *must*
>>> support
>>> all possible features of the RFC.
>>> In other words, there must be src_addr, src_port, dst_addr and dst_port.
>>> I can see from docs you've written you don't want to support a mix of
>>> different
>>> addr/port MKTs, so you can return -EINVAL or -ENOTSUPP for any value
>>> but zero.
>>> This will create an ABI that can be later extended to fully support RFC.
>>
>> RFC states that MKT connection identifiers can be specified using ranges
>> and wildcards and the details are up to the implementation. Keys are
>> *NOT* just bound to a classical TCP 4-tuple.
>>
>> * src_addr and src_port is implicit in socket binding. Maybe in theory
>> src_addr they could apply for a server socket bound to 0.0.0.0:PORT but
>> userspace can just open more sockets.
>> * dst_port is not supported by MD5 and I can't think of any useful
>> usecase. This is either well known (179 for BGP) or auto-allocated.
>> * tcp_md5 was recently enhanced allow a "prefixlen" for addr and
>> "l3mdev" ifindex binding.
>>
>> This last point shows that the binding features people require can't be
>> easily predicted in advance so it's better to allow the rules to be
>> extended.
> 
> Yeah, I see both changes you mention went on easy way as they reused
> existing paddings in the ABI structures.
> Ok, if you don't want to reserve src_addr/src_port/dst_addr/dst_port,
> than how about reserving some space for those instead?
> 
>>> The same is about key: I don't think you need to define/use
>>> tcp_authopt_alg.
>>> Just use algo name - that way TCP-AO will automatically be able to use
>>> any algo supported by crypto engine.
>>> See how xfrm does it, e.g.:
>>> struct xfrm_algo_auth {
>>>      char        alg_name[64];
>>>      unsigned int    alg_key_len;    /* in bits */
>>>      unsigned int    alg_trunc_len;  /* in bits */
>>>      char        alg_key[0];
>>> };
>>>
>>> So you can let a user chose maclen instead of hard-coding it.
>>> Much more extendable than what you propose.
>>
>> This complicates ABI and implementation with features that are not
>> needed. I'd much rather only expose an enum of real-world tcp-ao
>> algorithms.
> 
> I see it exactly the opposite way: a new enum unnecessary complicates
> ABI, instead of passing alg_name[] to crypto engine. No need to add any
> support in tcp-ao as the algorithms are already provided by kernel.
> That is how it transparently works for ipsec, why not for tcp-ao?
> 
>>
>>> [..]
>>>> +#ifdef CONFIG_TCP_AUTHOPT
>>>> +       case TCP_AUTHOPT: {
>>>> +               struct tcp_authopt info;
>>>> +
>>>> +               if (get_user(len, optlen))
>>>> +                       return -EFAULT;
>>>> +
>>>> +               lock_sock(sk);
>>>> +               tcp_get_authopt_val(sk, &info);
>>>> +               release_sock(sk);
>>>> +
>>>> +               len = min_t(unsigned int, len, sizeof(info));
>>>> +               if (put_user(len, optlen))
>>>> +                       return -EFAULT;
>>>> +               if (copy_to_user(optval, &info, len))
>>>> +                       return -EFAULT;
>>>> +               return 0;
>>>> +       }
>>>
>>> I'm pretty sure it's not a good choice to write partly tcp_authopt.
>>> And a user has no way to check what's the correct len on this kernel.
>>> Instead of len = min_t(unsigned int, len, sizeof(info)), it should be
>>> if (len != sizeof(info))
>>>      return -EINVAL;
>>
>> Purpose is to allow sockopts to grow as md5 has grown.
> 
> md5 has not grown. See above.

MD5 uapi has - e.g., 8917a777be3ba and  6b102db50cdde. We want similar
capabilities for growth with this API.

> 
> Another issue with your approach
> 
> +       /* If userspace optlen is too short fill the rest with zeros */
> +       if (optlen > sizeof(opt))
> +               return -EINVAL;
> +       memset(&opt, 0, sizeof(opt));
> +       if (copy_from_sockptr(&opt, optval, optlen))
> +               return -EFAULT;
> 
> is that userspace compiled with updated/grew structure will fail on
> older kernel. So, no extension without breaking something is possible.
> Which also reminds me that currently you don't validate (opt.flags) for
> unknown by kernel flags.
> 
> Extending syscalls is impossible without breaking userspace if ABI is
> not designed with extensibility in mind. That was quite a big problem,
> and still is. Please, read this carefully:
> https://lwn.net/Articles/830666/
> 
> That is why I'm suggesting you all those changes that will be harder to
> fix when/if your patches get accepted.
> As an example how it should work see in copy_clone_args_from_user().
> 

Look at how TCP_ZEROCOPY_RECEIVE has grown over releases as an example
of how to properly handle this.
Leonard Crestez Aug. 11, 2021, 7:08 p.m. UTC | #6
On 11.08.2021 17:31, Dmitry Safonov wrote:
> On 8/11/21 9:29 AM, Leonard Crestez wrote:
>> On 8/10/21 11:41 PM, Dmitry Safonov wrote:
>>> I wonder if it's also worth saving some bytes by something like
>>> struct tcp_ao_key *key;
>>>
>>> With
>>> struct tcp_ao_key {
>>>         u8 keylen;
>>>         u8 key[0];
>>> };
>>>
>>> Hmm?
>>
>> This increases memory management complexity for very minor gains. Very
>> few tcp_authopt_key will ever be created.
> 
> The change doesn't seem to be big, like:
> --- a/net/ipv4/tcp_authopt.c
> +++ b/net/ipv4/tcp_authopt.c
> @@ -422,8 +422,16 @@ int tcp_set_authopt_key(struct sock *sk, sockptr_t
> optval, unsig>
>          key_info = __tcp_authopt_key_info_lookup(sk, info, opt.local_id);
>          if (key_info)
>                  tcp_authopt_key_del(sk, info, key_info);
> +
> +       key = sock_kmalloc(sk, sizeof(*key) + opt.keylen, GFP_KERNEL |
> __GFP_ZERO);
> +       if (!key) {
> +               tcp_authopt_alg_release(alg);
> +               return -ENOMEM;
> +       }
> +
>          key_info = sock_kmalloc(sk, sizeof(*key_info), GFP_KERNEL |
> __GFP_ZERO);
>          if (!key_info) {
> +               sock_kfree_s(sk, key, sizeof(*key) + opt.keylen);
>                  tcp_authopt_alg_release(alg);
>                  return -ENOMEM;
>          }
> 
> I don't know, probably it'll be enough for every user to limit their
> keys by length of 80, but if one day it won't be enough - this ABI will
> be painful to extend.

struct tcp_authopt_key also needs to be modified and a separate 
copy_from_user is required. It's not very complicated but not very 
useful either.

>>>> +struct tcp_authopt_key {
>>>> +       /** @flags: Combination of &enum tcp_authopt_key_flag */
>>>> +       __u32   flags;
>>>> +       /** @local_id: Local identifier */
>>>> +       __u32   local_id;
>>>> +       /** @send_id: keyid value for send */
>>>> +       __u8    send_id;
>>>> +       /** @recv_id: keyid value for receive */
>>>> +       __u8    recv_id;
>>>> +       /** @alg: One of &enum tcp_authopt_alg */
>>>> +       __u8    alg;
>>>> +       /** @keylen: Length of the key buffer */
>>>> +       __u8    keylen;
>>>> +       /** @key: Secret key */
>>>> +       __u8    key[TCP_AUTHOPT_MAXKEYLEN];
>>>> +       /**
>>>> +        * @addr: Key is only valid for this address
>>>> +        *
>>>> +        * Ignored unless TCP_AUTHOPT_KEY_ADDR_BIND flag is set
>>>> +        */
>>>> +       struct __kernel_sockaddr_storage addr;
>>>> +};
>>>
>>> It'll be an ABI if this is accepted. As it is - it can't support
>>> RFC5925 fully.
>>> Extending syscall ABI is painful. I think, even the initial ABI *must*
>>> support
>>> all possible features of the RFC.
>>> In other words, there must be src_addr, src_port, dst_addr and dst_port.
>>> I can see from docs you've written you don't want to support a mix of
>>> different
>>> addr/port MKTs, so you can return -EINVAL or -ENOTSUPP for any value
>>> but zero.
>>> This will create an ABI that can be later extended to fully support RFC.
>>
>> RFC states that MKT connection identifiers can be specified using ranges
>> and wildcards and the details are up to the implementation. Keys are
>> *NOT* just bound to a classical TCP 4-tuple.
>>
>> * src_addr and src_port is implicit in socket binding. Maybe in theory
>> src_addr they could apply for a server socket bound to 0.0.0.0:PORT but
>> userspace can just open more sockets.
>> * dst_port is not supported by MD5 and I can't think of any useful
>> usecase. This is either well known (179 for BGP) or auto-allocated.
>> * tcp_md5 was recently enhanced allow a "prefixlen" for addr and
>> "l3mdev" ifindex binding.
>>
>> This last point shows that the binding features people require can't be
>> easily predicted in advance so it's better to allow the rules to be
>> extended.
> 
> Yeah, I see both changes you mention went on easy way as they reused
> existing paddings in the ABI structures.
> Ok, if you don't want to reserve src_addr/src_port/dst_addr/dst_port,
> than how about reserving some space for those instead?

My idea was that each additional binding featurs can be added as a new 
bit flag in tcp_authopt_key_flag and the structure extended. Older 
applications won't pass the flag and kernel will silently accept the 
shorter optval and you get compatibility.

As far as I can tell MD5 supports binding in 3 ways:

1) By dst ip address
2) By dst ip address and prefixlen
3) By ifindex for vrfs

Current version of tcp_authopt only supports (1) but I think adding the 
other methods isn't actually difficult at all.

I'd rather not guess at future features by adding unused fields.

>>> The same is about key: I don't think you need to define/use
>>> tcp_authopt_alg.
>>> Just use algo name - that way TCP-AO will automatically be able to use
>>> any algo supported by crypto engine.
>>> See how xfrm does it, e.g.:
>>> struct xfrm_algo_auth {
>>>       char        alg_name[64];
>>>       unsigned int    alg_key_len;    /* in bits */
>>>       unsigned int    alg_trunc_len;  /* in bits */
>>>       char        alg_key[0];
>>> };
>>>
>>> So you can let a user chose maclen instead of hard-coding it.
>>> Much more extendable than what you propose.
>>
>> This complicates ABI and implementation with features that are not
>> needed. I'd much rather only expose an enum of real-world tcp-ao
>> algorithms.
> 
> I see it exactly the opposite way: a new enum unnecessary complicates
> ABI, instead of passing alg_name[] to crypto engine. No need to add any
> support in tcp-ao as the algorithms are already provided by kernel.
> That is how it transparently works for ipsec, why not for tcp-ao?

The TCP Authentication Option standard has been out there for many many 
years now and alternative algorithms are not widely used. I think cisco 
has a third algorithm? What you're asking for is a large extension of 
the IETF standard.

If you look at the rest of this series I had a lot of trouble with 
crypto tfm allocation context so I had to create per-cpu pool, similar 
to tcp-md5. Should I potentially create a pool for each alg known to 
crypto-api?

Letting use control algorithms and traffickey and mac lengths creates 
new potential avenues for privilege escalation that need to be checked.

As much as possible I would like to avoid exposing the linux crypto api 
through TCP sockopts.

I was also thinking of having a non-namespaced sysctl to disable 
tcp_authopt by default for security reasons. Unless you're running a 
router you should never let userspace touch these options.

>>> [..]
>>>> +#ifdef CONFIG_TCP_AUTHOPT
>>>> +       case TCP_AUTHOPT: {
>>>> +               struct tcp_authopt info;
>>>> +
>>>> +               if (get_user(len, optlen))
>>>> +                       return -EFAULT;
>>>> +
>>>> +               lock_sock(sk);
>>>> +               tcp_get_authopt_val(sk, &info);
>>>> +               release_sock(sk);
>>>> +
>>>> +               len = min_t(unsigned int, len, sizeof(info));
>>>> +               if (put_user(len, optlen))
>>>> +                       return -EFAULT;
>>>> +               if (copy_to_user(optval, &info, len))
>>>> +                       return -EFAULT;
>>>> +               return 0;
>>>> +       }
>>>
>>> I'm pretty sure it's not a good choice to write partly tcp_authopt.
>>> And a user has no way to check what's the correct len on this kernel.
>>> Instead of len = min_t(unsigned int, len, sizeof(info)), it should be
>>> if (len != sizeof(info))
>>>       return -EINVAL;
>>
>> Purpose is to allow sockopts to grow as md5 has grown.
> 
> md5 has not grown. See above.
> 
> Another issue with your approach
> 
> +       /* If userspace optlen is too short fill the rest with zeros */
> +       if (optlen > sizeof(opt))
> +               return -EINVAL;
> +       memset(&opt, 0, sizeof(opt));
> +       if (copy_from_sockptr(&opt, optval, optlen))
> +               return -EFAULT;
> 
> is that userspace compiled with updated/grew structure will fail on
> older kernel. So, no extension without breaking something is possible.

Userspace that needs new features and also compatibility with older 
kernels could check something like uname.

I think this is already a problem with md5: passing 
TCP_MD5SIG_FLAG_PREFIX on an old kernel simply won't take effect and 
that's fine.

The bigger concern is to ensure that old binaries work without changes.

> Which also reminds me that currently you don't validate (opt.flags) for
> unknown by kernel flags.

Not sure what you mean, it is explicitly only known flags that are 
copied from userspace. I can make setsockopt return an error on unknown 
flags, this will make new apps fail more explicitly on old kernels.

> Extending syscalls is impossible without breaking userspace if ABI is
> not designed with extensibility in mind. That was quite a big problem,
> and still is. Please, read this carefully:
> https://lwn.net/Articles/830666/
> 
> That is why I'm suggesting you all those changes that will be harder to
> fix when/if your patches get accepted.

Both of the sockopt structs have a "flags" field and structure expansion 
will be accompanied by new flags. Is this not sufficient for compatibility?
Leonard Crestez Aug. 11, 2021, 7:11 p.m. UTC | #7
On 11.08.2021 16:42, David Ahern wrote:
> On 8/11/21 2:29 AM, Leonard Crestez wrote:
>> On 8/10/21 11:41 PM, Dmitry Safonov wrote:
>>> Hi Leonard,
>>>
>>> On Tue, 10 Aug 2021 at 02:50, Leonard Crestez <cdleonard@gmail.com>
>>> wrote:
>>> [..]
>>>> +/* Representation of a Master Key Tuple as per RFC5925 */
>>>> +struct tcp_authopt_key_info {
>>>> +       struct hlist_node node;
>>>> +       /* Local identifier */
>>>> +       u32 local_id;
>>>
>>> There is no local_id in RFC5925, what's that?
>>> An MKT is identified by (send_id, recv_id), together with
>>> (src_addr/src_port, dst_addr/dst_port).
>>> Why introducing something new to already complicated RFC?
>>
>> It was there to simplify user interface and initial implementation.
>>
>> But it seems that BGP listeners already needs to support multiple
>> keychains for different peers so identifying the key by (send_id,
>> recv_id, binding) is easier for userspace to work with. Otherwise they
>> need to create their own local_id mapping internally.
>>
> 
> any proposed simplification needs to be well explained and how it
> relates to the RFC spec.

The local_id only exists between userspace and kernel so it's not really 
covered by the RFC.

There are objections to this and it seems to be unhelpful for userspace 
zo I will replace it with match by binding.

BTW: another somewhat dubious simplification is that I offloaded the RFC 
requirement to never add overlapping keys to userspace. So if userspace 
adds keys with same recvid that match the same TCP 4-tuple then 
connections will just start failing.

It's arguably fine to allow userspace misconfiguration to cause failures.

--
Regards,
Leonard
Dmitry Safonov Aug. 11, 2021, 8:12 p.m. UTC | #8
Hi David,

On 8/11/21 6:15 PM, David Ahern wrote:
> On 8/11/21 8:31 AM, Dmitry Safonov wrote:
>> On 8/11/21 9:29 AM, Leonard Crestez wrote:
>>> On 8/10/21 11:41 PM, Dmitry Safonov wrote:
[..]
>>>> I'm pretty sure it's not a good choice to write partly tcp_authopt.
>>>> And a user has no way to check what's the correct len on this kernel.
>>>> Instead of len = min_t(unsigned int, len, sizeof(info)), it should be
>>>> if (len != sizeof(info))
>>>>      return -EINVAL;
>>>
>>> Purpose is to allow sockopts to grow as md5 has grown.
>>
>> md5 has not grown. See above.
> 
> MD5 uapi has - e.g., 8917a777be3ba and  6b102db50cdde. We want similar
> capabilities for growth with this API.

So, you mean adding a new setsockopt when the struct has to be extended?
Like TCP_AUTHOPT_EXT?

It can work, but sounds like adding a new syscall every time the old one
can't be extended. I can see that with current limitations on TCP-AO RFC
the ABI in these patches will have to be extended.

The second commit started using new cmd.tcpm_flags, where unknown flags
are still at this moment silently ignored by the kernel. So 6b102db50cdd
could have introduced a regression in userspace. By luck and by reason
that md5 isn't probably frequently used it didn't.
Not nice at all example for newer APIs.

>> Another issue with your approach
>>
>> +       /* If userspace optlen is too short fill the rest with zeros */
>> +       if (optlen > sizeof(opt))
>> +               return -EINVAL;
>> +       memset(&opt, 0, sizeof(opt));
>> +       if (copy_from_sockptr(&opt, optval, optlen))
>> +               return -EFAULT;
>>
>> is that userspace compiled with updated/grew structure will fail on
>> older kernel. So, no extension without breaking something is possible.
>> Which also reminds me that currently you don't validate (opt.flags) for
>> unknown by kernel flags.
>>
>> Extending syscalls is impossible without breaking userspace if ABI is
>> not designed with extensibility in mind. That was quite a big problem,
>> and still is. Please, read this carefully:
>> https://lwn.net/Articles/830666/
>>
>> That is why I'm suggesting you all those changes that will be harder to
>> fix when/if your patches get accepted.
>> As an example how it should work see in copy_clone_args_from_user().
>>
> 
> Look at how TCP_ZEROCOPY_RECEIVE has grown over releases as an example
> of how to properly handle this.

Exactly.

: switch (len) {
:		case offsetofend(...)
:		case offsetofend(...)

And than also:
:		if (unlikely(len > sizeof(zc))) {
:			err = check_zeroed_user(optval + sizeof(zc),
:						len - sizeof(zc));

Does it sound similar to what I've written in my ABI review?
And what the LWN article has in it.
Please, look again at the patch I replied to.

Thanks,
         Dmitry
David Ahern Aug. 11, 2021, 8:23 p.m. UTC | #9
On 8/11/21 2:12 PM, Dmitry Safonov wrote:
> Hi David,
> 
> On 8/11/21 6:15 PM, David Ahern wrote:
>> On 8/11/21 8:31 AM, Dmitry Safonov wrote:
>>> On 8/11/21 9:29 AM, Leonard Crestez wrote:
>>>> On 8/10/21 11:41 PM, Dmitry Safonov wrote:
> [..]
>>>>> I'm pretty sure it's not a good choice to write partly tcp_authopt.
>>>>> And a user has no way to check what's the correct len on this kernel.
>>>>> Instead of len = min_t(unsigned int, len, sizeof(info)), it should be
>>>>> if (len != sizeof(info))
>>>>>      return -EINVAL;
>>>>
>>>> Purpose is to allow sockopts to grow as md5 has grown.
>>>
>>> md5 has not grown. See above.
>>
>> MD5 uapi has - e.g., 8917a777be3ba and  6b102db50cdde. We want similar
>> capabilities for growth with this API.
> 
> So, you mean adding a new setsockopt when the struct has to be extended?
> Like TCP_AUTHOPT_EXT?

uh, no. That was needed because of failures with the original
implementation wrt checking that all unused bits are 0. If checking is
not done from day 1, that field can never be used in the future.

My point here was only that MD5 uapi was extended.

My second point is more relevant to Leonard as a very recent example of
how to build an extendable struct.


>>
>> Look at how TCP_ZEROCOPY_RECEIVE has grown over releases as an example
>> of how to properly handle this.
> 
> Exactly.
> 
> : switch (len) {
> :		case offsetofend(...)
> :		case offsetofend(...)
> 
> And than also:
> :		if (unlikely(len > sizeof(zc))) {
> :			err = check_zeroed_user(optval + sizeof(zc),
> :						len - sizeof(zc));
> 
> Does it sound similar to what I've written in my ABI review?
> And what the LWN article has in it.
> Please, look again at the patch I replied to.
> 
> Thanks,
>          Dmitry
>
Dmitry Safonov Aug. 11, 2021, 8:26 p.m. UTC | #10
On 8/11/21 8:11 PM, Leonard Crestez wrote:
> On 11.08.2021 16:42, David Ahern wrote:
[..]
>>
>> any proposed simplification needs to be well explained and how it
>> relates to the RFC spec.
> 
> The local_id only exists between userspace and kernel so it's not really
> covered by the RFC.
> 
> There are objections to this and it seems to be unhelpful for userspace
> zo I will replace it with match by binding.
> 
> BTW: another somewhat dubious simplification is that I offloaded the RFC
> requirement to never add overlapping keys to userspace. So if userspace
> adds keys with same recvid that match the same TCP 4-tuple then
> connections will just start failing.
> 
> It's arguably fine to allow userspace misconfiguration to cause failures.

I think it's fine. But worth documenting. Also, keep in mind that
someone in userspace with his funny ideas might start relying on such
behavior in future.

Thanks,
        Dmitry
David Ahern Aug. 11, 2021, 8:26 p.m. UTC | #11
On 8/11/21 1:11 PM, Leonard Crestez wrote:
> On 11.08.2021 16:42, David Ahern wrote:
>> On 8/11/21 2:29 AM, Leonard Crestez wrote:
>>> On 8/10/21 11:41 PM, Dmitry Safonov wrote:
>>>> Hi Leonard,
>>>>
>>>> On Tue, 10 Aug 2021 at 02:50, Leonard Crestez <cdleonard@gmail.com>
>>>> wrote:
>>>> [..]
>>>>> +/* Representation of a Master Key Tuple as per RFC5925 */
>>>>> +struct tcp_authopt_key_info {
>>>>> +       struct hlist_node node;
>>>>> +       /* Local identifier */
>>>>> +       u32 local_id;
>>>>
>>>> There is no local_id in RFC5925, what's that?
>>>> An MKT is identified by (send_id, recv_id), together with
>>>> (src_addr/src_port, dst_addr/dst_port).
>>>> Why introducing something new to already complicated RFC?
>>>
>>> It was there to simplify user interface and initial implementation.
>>>
>>> But it seems that BGP listeners already needs to support multiple
>>> keychains for different peers so identifying the key by (send_id,
>>> recv_id, binding) is easier for userspace to work with. Otherwise they
>>> need to create their own local_id mapping internally.
>>>
>>
>> any proposed simplification needs to be well explained and how it
>> relates to the RFC spec.
> 
> The local_id only exists between userspace and kernel so it's not really
> covered by the RFC.

My point is that you need to document the uapi (however it ends up) and
how it is used to achieve the intent of the RFC.

> 
> There are objections to this and it seems to be unhelpful for userspace
> zo I will replace it with match by binding.
> 
> BTW: another somewhat dubious simplification is that I offloaded the RFC
> requirement to never add overlapping keys to userspace. So if userspace
> adds keys with same recvid that match the same TCP 4-tuple then
> connections will just start failing.
> 
> It's arguably fine to allow userspace misconfiguration to cause failures.
> 
> -- 
> Regards,
> Leonard
Leonard Crestez Aug. 12, 2021, 7:46 p.m. UTC | #12
On 8/11/21 11:29 AM, Leonard Crestez wrote:
> On 8/10/21 11:41 PM, Dmitry Safonov wrote:
>> On Tue, 10 Aug 2021 at 02:50, Leonard Crestez <cdleonard@gmail.com> 
>>> +       /* If an old value exists for same local_id it is deleted */
>>> +       key_info = __tcp_authopt_key_info_lookup(sk, info, 
>>> opt.local_id);
>>> +       if (key_info)
>>> +               tcp_authopt_key_del(sk, info, key_info);
>>> +       key_info = sock_kmalloc(sk, sizeof(*key_info), GFP_KERNEL | 
>>> __GFP_ZERO);
>>> +       if (!key_info)
>>> +               return -ENOMEM;
>>
>> 1. You don't need sock_kmalloc() together with tcp_authopt_key_del().
>>      It just frees the memory and allocates it back straight away - no
>> sense in doing that.
> 
> The list is scanned in multiple places in later commits using nothing 
> but an rcu_read_lock, this means that keys can't be updated in-place.
> 
>> 2. I think RFC says you must not allow a user to change an existing key:
>>> MKT parameters are not changed. Instead, new MKTs can be installed, 
>>> and a connection
>>> can change which MKT it uses.
>>
>> IIUC, it means that one can't just change an existing MKT, but one can
>> remove and later
>> add MKT with the same (send_id, recv_id, src_addr/port, dst_addr/port).
>>
>> So, a reasonable thing to do:
>> if (key_info)
>>      return -EEXIST.
> 
> You're right, making the user delete keys explicitly is better.

On a second thought this might be required to mark keys as "send-only" 
and "recv-only" atomically.

Separately from RFC5925 some vendors implement a "keychain" model based 
on RFC8177 where each key has a distinct "accept-lifetime" and a 
"send-lifetime". This could be implemented by adding flags 
"expired_for_send" and "expired_for_recv" but requires the ability to 
set an expiration mark without the key ever being deleted.

--
Regards,
Leonard
diff mbox series

Patch

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 48d8a363319e..cfddfc720b00 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -140,10 +140,12 @@  struct tcp_request_sock {
 static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
 {
 	return (struct tcp_request_sock *)req;
 }
 
+struct tcp_authopt_info;
+
 struct tcp_sock {
 	/* inet_connection_sock has to be the first member of tcp_sock */
 	struct inet_connection_sock	inet_conn;
 	u16	tcp_header_len;	/* Bytes of tcp header to send		*/
 	u16	gso_segs;	/* Max number of segs per GSO packet	*/
@@ -403,10 +405,14 @@  struct tcp_sock {
 
 /* TCP MD5 Signature Option information */
 	struct tcp_md5sig_info	__rcu *md5sig_info;
 #endif
 
+#ifdef CONFIG_TCP_AUTHOPT
+	struct tcp_authopt_info	__rcu *authopt_info;
+#endif
+
 /* TCP fastopen related information */
 	struct tcp_fastopen_request *fastopen_req;
 	/* fastopen_rsk points to request_sock that resulted in this big
 	 * socket. Used to retransmit SYNACKs etc.
 	 */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3166dc15d7d6..bb76554e8fe5 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -182,10 +182,11 @@  void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOPT_WINDOW		3	/* Window scaling */
 #define TCPOPT_SACK_PERM        4       /* SACK Permitted */
 #define TCPOPT_SACK             5       /* SACK Block */
 #define TCPOPT_TIMESTAMP	8	/* Better RTT estimations/PAWS */
 #define TCPOPT_MD5SIG		19	/* MD5 Signature (RFC2385) */
+#define TCPOPT_AUTHOPT		29	/* Auth Option (RFC5925) */
 #define TCPOPT_MPTCP		30	/* Multipath TCP (RFC6824) */
 #define TCPOPT_FASTOPEN		34	/* Fast open (RFC7413) */
 #define TCPOPT_EXP		254	/* Experimental */
 /* Magic number to be after the option value for sharing TCP
  * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
diff --git a/include/net/tcp_authopt.h b/include/net/tcp_authopt.h
new file mode 100644
index 000000000000..458d108bb7a8
--- /dev/null
+++ b/include/net/tcp_authopt.h
@@ -0,0 +1,55 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_TCP_AUTHOPT_H
+#define _LINUX_TCP_AUTHOPT_H
+
+#include <uapi/linux/tcp.h>
+
+/* Representation of a Master Key Tuple as per RFC5925 */
+struct tcp_authopt_key_info {
+	struct hlist_node node;
+	/* Local identifier */
+	u32 local_id;
+	u32 flags;
+	/* Wire identifiers */
+	u8 send_id, recv_id;
+	u8 alg_id;
+	u8 keylen;
+	u8 key[TCP_AUTHOPT_MAXKEYLEN];
+	struct rcu_head rcu;
+	struct sockaddr_storage addr;
+};
+
+/* Per-socket information regarding tcp_authopt */
+struct tcp_authopt_info {
+	/* List of tcp_authopt_key_info */
+	struct hlist_head head;
+	u32 flags;
+	u32 src_isn;
+	u32 dst_isn;
+	struct rcu_head rcu;
+};
+
+#ifdef CONFIG_TCP_AUTHOPT
+void tcp_authopt_clear(struct sock *sk);
+int tcp_set_authopt(struct sock *sk, sockptr_t optval, unsigned int optlen);
+int tcp_get_authopt_val(struct sock *sk, struct tcp_authopt *key);
+int tcp_set_authopt_key(struct sock *sk, sockptr_t optval, unsigned int optlen);
+#else
+static inline int tcp_set_authopt(struct sock *sk, sockptr_t optval, unsigned int optlen)
+{
+	return -ENOPROTOOPT;
+}
+static inline int tcp_get_authopt_val(struct sock *sk, struct tcp_authopt *key)
+{
+	return -ENOPROTOOPT;
+}
+static inline void tcp_authopt_clear(struct sock *sk)
+{
+}
+static inline int tcp_set_authopt_key(struct sock *sk, sockptr_t optval, unsigned int optlen)
+{
+	return -ENOPROTOOPT;
+}
+#endif
+
+#endif /* _LINUX_TCP_AUTHOPT_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 8fc09e8638b3..bc47664156eb 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -126,10 +126,12 @@  enum {
 #define TCP_INQ			36	/* Notify bytes available to read as a cmsg on read */
 
 #define TCP_CM_INQ		TCP_INQ
 
 #define TCP_TX_DELAY		37	/* delay outgoing packets by XX usec */
+#define TCP_AUTHOPT			38	/* TCP Authentication Option (RFC2385) */
+#define TCP_AUTHOPT_KEY		39	/* TCP Authentication Option update key (RFC2385) */
 
 
 #define TCP_REPAIR_ON		1
 #define TCP_REPAIR_OFF		0
 #define TCP_REPAIR_OFF_NO_WP	-1	/* Turn off without window probes */
@@ -340,10 +342,80 @@  struct tcp_diag_md5sig {
 	__u16	tcpm_keylen;
 	__be32	tcpm_addr[4];
 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];
 };
 
+/**
+ * enum tcp_authopt_flag - flags for `tcp_authopt.flags`
+ */
+enum tcp_authopt_flag {
+	/**
+	 * @TCP_AUTHOPT_FLAG_REJECT_UNEXPECTED:
+	 *	Configure behavior of segments with TCP-AO coming from hosts for which no
+	 *	key is configured. The default recommended by RFC is to silently accept
+	 *	such connections.
+	 */
+	TCP_AUTHOPT_FLAG_REJECT_UNEXPECTED = (1 << 2),
+};
+
+/**
+ * struct tcp_authopt - Per-socket options related to TCP Authentication Option
+ */
+struct tcp_authopt {
+	/** @flags: Combination of &enum tcp_authopt_flag */
+	__u32	flags;
+};
+
+/**
+ * enum tcp_authopt_key_flag - flags for `tcp_authopt.flags`
+ *
+ * @TCP_AUTHOPT_KEY_DEL: Delete the key by local_id and ignore all other fields.
+ * @TCP_AUTHOPT_KEY_EXCLUDE_OPTS: Exclude TCP options from signature.
+ * @TCP_AUTHOPT_KEY_ADDR_BIND: Key only valid for `tcp_authopt.addr`
+ */
+enum tcp_authopt_key_flag {
+	TCP_AUTHOPT_KEY_DEL = (1 << 0),
+	TCP_AUTHOPT_KEY_EXCLUDE_OPTS = (1 << 1),
+	TCP_AUTHOPT_KEY_ADDR_BIND = (1 << 2),
+};
+
+/* for TCP_AUTHOPT_KEY socket option */
+#define TCP_AUTHOPT_MAXKEYLEN	80
+
+enum tcp_authopt_alg {
+	TCP_AUTHOPT_ALG_HMAC_SHA_1_96 = 1,
+	TCP_AUTHOPT_ALG_AES_128_CMAC_96 = 2,
+};
+
+/**
+ * struct tcp_authopt_key - TCP Authentication KEY
+ *
+ * Each key is identified by a non-zero local_id which is managed by the application.
+ */
+struct tcp_authopt_key {
+	/** @flags: Combination of &enum tcp_authopt_key_flag */
+	__u32	flags;
+	/** @local_id: Local identifier */
+	__u32	local_id;
+	/** @send_id: keyid value for send */
+	__u8	send_id;
+	/** @recv_id: keyid value for receive */
+	__u8	recv_id;
+	/** @alg: One of &enum tcp_authopt_alg */
+	__u8	alg;
+	/** @keylen: Length of the key buffer */
+	__u8	keylen;
+	/** @key: Secret key */
+	__u8	key[TCP_AUTHOPT_MAXKEYLEN];
+	/**
+	 * @addr: Key is only valid for this address
+	 *
+	 * Ignored unless TCP_AUTHOPT_KEY_ADDR_BIND flag is set
+	 */
+	struct __kernel_sockaddr_storage addr;
+};
+
 /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
 
 #define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1
 struct tcp_zerocopy_receive {
 	__u64 address;		/* in: address of mapping */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 87983e70f03f..6459f4ea6f1d 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -740,5 +740,19 @@  config TCP_MD5SIG
 	  RFC2385 specifies a method of giving MD5 protection to TCP sessions.
 	  Its main (only?) use is to protect BGP sessions between core routers
 	  on the Internet.
 
 	  If unsure, say N.
+
+config TCP_AUTHOPT
+	bool "TCP: Authentication Option support (RFC5925)"
+	select CRYPTO
+	select CRYPTO_SHA1
+	select CRYPTO_HMAC
+	select CRYPTO_AES
+	select CRYPTO_CMAC
+	help
+	  RFC5925 specifies a new method of giving protection to TCP sessions.
+	  Its intended use is to protect BGP sessions between core routers
+	  on the Internet. It obsoletes TCP MD5 (RFC2385) but is incompatible.
+
+	  If unsure, say N.
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index bbdd9c44f14e..d336f32ce177 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -59,10 +59,11 @@  obj-$(CONFIG_TCP_CONG_NV) += tcp_nv.o
 obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
 obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
 obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_TCP_AUTHOPT) += tcp_authopt.o
 obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
 obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f931def6302e..fd90e80afa2c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -271,10 +271,11 @@ 
 
 #include <net/icmp.h>
 #include <net/inet_common.h>
 #include <net/tcp.h>
 #include <net/mptcp.h>
+#include <net/tcp_authopt.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
 #include <net/sock.h>
 
 #include <linux/uaccess.h>
@@ -3573,10 +3574,16 @@  static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 	case TCP_MD5SIG:
 	case TCP_MD5SIG_EXT:
 		err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
 		break;
 #endif
+	case TCP_AUTHOPT:
+		err = tcp_set_authopt(sk, optval, optlen);
+		break;
+	case TCP_AUTHOPT_KEY:
+		err = tcp_set_authopt_key(sk, optval, optlen);
+		break;
 	case TCP_USER_TIMEOUT:
 		/* Cap the max time in ms TCP will retry or probe the window
 		 * before giving up and aborting (ETIMEDOUT) a connection.
 		 */
 		if (val < 0)
@@ -4219,10 +4226,30 @@  static int do_tcp_getsockopt(struct sock *sk, int level,
 		if (!err && copy_to_user(optval, &zc, len))
 			err = -EFAULT;
 		return err;
 	}
 #endif
+#ifdef CONFIG_TCP_AUTHOPT
+	case TCP_AUTHOPT: {
+		struct tcp_authopt info;
+
+		if (get_user(len, optlen))
+			return -EFAULT;
+
+		lock_sock(sk);
+		tcp_get_authopt_val(sk, &info);
+		release_sock(sk);
+
+		len = min_t(unsigned int, len, sizeof(info));
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, &info, len))
+			return -EFAULT;
+		return 0;
+	}
+#endif
+
 	default:
 		return -ENOPROTOOPT;
 	}
 
 	if (put_user(len, optlen))
diff --git a/net/ipv4/tcp_authopt.c b/net/ipv4/tcp_authopt.c
new file mode 100644
index 000000000000..5fa7bce8891b
--- /dev/null
+++ b/net/ipv4/tcp_authopt.c
@@ -0,0 +1,172 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/kernel.h>
+#include <net/tcp.h>
+#include <net/tcp_authopt.h>
+#include <crypto/hash.h>
+#include <trace/events/tcp.h>
+
+struct tcp_authopt_key_info *__tcp_authopt_key_info_lookup(const struct sock *sk,
+							   struct tcp_authopt_info *info,
+							   int key_id)
+{
+	struct tcp_authopt_key_info *key;
+
+	hlist_for_each_entry_rcu(key, &info->head, node, lockdep_sock_is_held(sk))
+		if (key->local_id == key_id)
+			return key;
+
+	return NULL;
+}
+
+static struct tcp_authopt_info *__tcp_authopt_info_get_or_create(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_authopt_info *info;
+
+	info = rcu_dereference_check(tp->authopt_info, lockdep_sock_is_held(sk));
+	if (info)
+		return info;
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL | __GFP_ZERO);
+	if (!info)
+		return ERR_PTR(-ENOMEM);
+
+	sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+	INIT_HLIST_HEAD(&info->head);
+	rcu_assign_pointer(tp->authopt_info, info);
+
+	return info;
+}
+
+int tcp_set_authopt(struct sock *sk, sockptr_t optval, unsigned int optlen)
+{
+	struct tcp_authopt opt;
+	struct tcp_authopt_info *info;
+
+	WARN_ON(!lockdep_sock_is_held(sk));
+
+	/* If userspace optlen is too short fill the rest with zeros */
+	if (optlen > sizeof(opt))
+		return -EINVAL;
+	memset(&opt, 0, sizeof(opt));
+	if (copy_from_sockptr(&opt, optval, optlen))
+		return -EFAULT;
+
+	info = __tcp_authopt_info_get_or_create(sk);
+	if (IS_ERR(info))
+		return PTR_ERR(info);
+
+	info->flags = opt.flags & TCP_AUTHOPT_FLAG_REJECT_UNEXPECTED);
+
+	return 0;
+}
+
+int tcp_get_authopt_val(struct sock *sk, struct tcp_authopt *opt)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_authopt_info *info;
+
+	WARN_ON(!lockdep_sock_is_held(sk));
+	memset(opt, 0, sizeof(*opt));
+	info = rcu_dereference_check(tp->authopt_info, lockdep_sock_is_held(sk));
+	if (!info)
+		return -EINVAL;
+	opt->flags = info->flags & TCP_AUTHOPT_FLAG_REJECT_UNEXPECTED;
+
+	return 0;
+}
+
+static void tcp_authopt_key_del(struct sock *sk,
+				struct tcp_authopt_info *info,
+				struct tcp_authopt_key_info *key)
+{
+	hlist_del_rcu(&key->node);
+	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
+	kfree_rcu(key, rcu);
+}
+
+/* free info and keys but don't touch tp->authopt_info */
+void __tcp_authopt_info_free(struct sock *sk, struct tcp_authopt_info *info)
+{
+	struct hlist_node *n;
+	struct tcp_authopt_key_info *key;
+
+	hlist_for_each_entry_safe(key, n, &info->head, node)
+		tcp_authopt_key_del(sk, info, key);
+	kfree_rcu(info, rcu);
+}
+
+/* free everything and clear tcp_sock.authopt_info to NULL */
+void tcp_authopt_clear(struct sock *sk)
+{
+	struct tcp_authopt_info *info;
+
+	info = rcu_dereference_protected(tcp_sk(sk)->authopt_info, lockdep_sock_is_held(sk));
+	if (info) {
+		__tcp_authopt_info_free(sk, info);
+		tcp_sk(sk)->authopt_info = NULL;
+	}
+}
+
+int tcp_set_authopt_key(struct sock *sk, sockptr_t optval, unsigned int optlen)
+{
+	struct tcp_authopt_key opt;
+	struct tcp_authopt_info *info;
+	struct tcp_authopt_key_info *key_info;
+
+	/* If userspace optlen is too short fill the rest with zeros */
+	if (optlen > sizeof(opt))
+		return -EINVAL;
+	memset(&opt, 0, sizeof(opt));
+	if (copy_from_sockptr(&opt, optval, optlen))
+		return -EFAULT;
+
+	if (opt.keylen > TCP_AUTHOPT_MAXKEYLEN)
+		return -EINVAL;
+
+	if (opt.local_id == 0)
+		return -EINVAL;
+
+	/* Delete is a special case: we ignore all fields other than local_id */
+	if (opt.flags & TCP_AUTHOPT_KEY_DEL) {
+		info = rcu_dereference_check(tcp_sk(sk)->authopt_info, lockdep_sock_is_held(sk));
+		if (!info)
+			return -ENOENT;
+		key_info = __tcp_authopt_key_info_lookup(sk, info, opt.local_id);
+		if (!key_info)
+			return -ENOENT;
+		tcp_authopt_key_del(sk, info, key_info);
+		return 0;
+	}
+
+	/* Initialize tcp_authopt_info if not already set */
+	info = __tcp_authopt_info_get_or_create(sk);
+	if (IS_ERR(info))
+		return PTR_ERR(info);
+
+	/* check key family */
+	if (opt.flags & TCP_AUTHOPT_KEY_ADDR_BIND) {
+		if (sk->sk_family != opt.addr.ss_family)
+			return -EINVAL;
+	}
+
+	/* If an old value exists for same local_id it is deleted */
+	key_info = __tcp_authopt_key_info_lookup(sk, info, opt.local_id);
+	if (key_info)
+		tcp_authopt_key_del(sk, info, key_info);
+	key_info = sock_kmalloc(sk, sizeof(*key_info), GFP_KERNEL | __GFP_ZERO);
+	if (!key_info)
+		return -ENOMEM;
+	key_info->local_id = opt.local_id;
+	key_info->flags = opt.flags & (TCP_AUTHOPT_KEY_EXCLUDE_OPTS | TCP_AUTHOPT_KEY_ADDR_BIND);
+	key_info->send_id = opt.send_id;
+	key_info->recv_id = opt.recv_id;
+	key_info->alg_id = opt.alg;
+	key_info->keylen = opt.keylen;
+	memcpy(key_info->key, opt.key, opt.keylen);
+	memcpy(&key_info->addr, &opt.addr, sizeof(key_info->addr));
+	hlist_add_head_rcu(&key_info->node, &info->head);
+
+	return 0;
+}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2e62e0d6373a..1348615c7576 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -60,10 +60,11 @@ 
 
 #include <net/net_namespace.h>
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
 #include <net/tcp.h>
+#include <net/tcp_authopt.h>
 #include <net/transp_v6.h>
 #include <net/ipv6.h>
 #include <net/inet_common.h>
 #include <net/timewait_sock.h>
 #include <net/xfrm.h>
@@ -2256,10 +2257,11 @@  void tcp_v4_destroy_sock(struct sock *sk)
 		tcp_clear_md5_list(sk);
 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
 		tp->md5sig_info = NULL;
 	}
 #endif
+	tcp_authopt_clear(sk);
 
 	/* Clean up a referenced TCP bind bucket. */
 	if (inet_csk(sk)->icsk_bind_hash)
 		inet_put_port(sk);