@@ -12,6 +12,7 @@
#include <linux/bits.h>
#include <linux/mctp.h>
#include <net/net_namespace.h>
+#include <net/sock.h>
/* MCTP packet definitions */
struct mctp_hdr {
@@ -46,6 +47,64 @@ static inline struct mctp_hdr *mctp_hdr(struct sk_buff *skb)
return (struct mctp_hdr *)skb_network_header(skb);
}
+/* socket implementation */
+struct mctp_sock {
+ struct sock sk;
+
+ /* bind() params */
+ int bind_net;
+ mctp_eid_t bind_addr;
+ __u8 bind_type;
+
+ /* list of mctp_sk_key, for incoming tag lookup. updates protected
+ * by sk->net->keys_lock
+ */
+ struct hlist_head keys;
+};
+
+/* Key for matching incoming packets to sockets or reassembly contexts.
+ * Packets are matched on (src,dest,tag).
+ *
+ * Lifetime requirements:
+ *
+ * - keys are free()ed via RCU
+ *
+ * - a mctp_sk_key contains a reference to a struct sock; this is valid
+ * for the life of the key. On sock destruction (through unhash), the key is
+ * removed from lists (see below), and will not be observable after a RCU
+ * grace period.
+ *
+ * any RX occurring within that grace period may still queue to the socket,
+ * but will hit the SOCK_DEAD case before the socket is freed.
+ *
+ * - these mctp_sk_keys appear on two lists:
+ * 1) the struct mctp_sock->keys list
+ * 2) the struct netns_mctp->keys list
+ *
+ * updates to either list are performed under the netns_mctp->keys
+ * lock.
+ *
+ * - there is a single destruction path for a mctp_sk_key - through socket
+ * unhash (see mctp_sk_unhash). This performs the list removal under
+ * keys_lock.
+ */
+struct mctp_sk_key {
+ mctp_eid_t peer_addr;
+ mctp_eid_t local_addr;
+ __u8 tag; /* incoming tag match; invert TO for local */
+
+ /* we hold a ref to sk when set */
+ struct sock *sk;
+
+ /* routing lookup list */
+ struct hlist_node hlist;
+
+ /* per-socket list */
+ struct hlist_node sklist;
+
+ struct rcu_head rcu;
+};
+
struct mctp_skb_cb {
unsigned int magic;
unsigned int net;
@@ -12,6 +12,19 @@ struct netns_mctp {
/* Only updated under RTNL, entries freed via RCU */
struct list_head routes;
+ /* Bound sockets: list of sockets bound by type.
+ * This list is updated from non-atomic contexts (under bind_lock),
+ * and read (under rcu) in packet rx
+ */
+ struct mutex bind_lock;
+ struct hlist_head binds;
+
+ /* tag allocations. This list is read and updated from atomic contexts,
+ * but elements are free()ed after a RCU grace-period
+ */
+ spinlock_t keys_lock;
+ struct hlist_head keys;
+
/* neighbour table */
struct mutex neigh_lock;
struct list_head neighbours;
@@ -18,10 +18,6 @@
/* socket implementation */
-struct mctp_sock {
- struct sock sk;
-};
-
static int mctp_release(struct socket *sock)
{
struct sock *sk = sock->sk;
@@ -36,18 +32,160 @@ static int mctp_release(struct socket *sock)
static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
{
- return 0;
+ struct sock *sk = sock->sk;
+ struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
+ struct sockaddr_mctp *smctp;
+ int rc;
+
+ if (addrlen < sizeof(*smctp))
+ return -EINVAL;
+
+ if (addr->sa_family != AF_MCTP)
+ return -EAFNOSUPPORT;
+
+ if (!capable(CAP_NET_BIND_SERVICE))
+ return -EACCES;
+
+ /* it's a valid sockaddr for MCTP, cast and do protocol checks */
+ smctp = (struct sockaddr_mctp *)addr;
+
+ lock_sock(sk);
+
+ /* TODO: allow rebind */
+ if (sk_hashed(sk)) {
+ rc = -EADDRINUSE;
+ goto out_release;
+ }
+ msk->bind_net = smctp->smctp_network;
+ msk->bind_addr = smctp->smctp_addr.s_addr;
+ msk->bind_type = smctp->smctp_type & 0x7f; /* ignore the IC bit */
+
+ rc = sk->sk_prot->hash(sk);
+
+out_release:
+ release_sock(sk);
+
+ return rc;
}
static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
- return 0;
+ DECLARE_SOCKADDR(struct sockaddr_mctp *, addr, msg->msg_name);
+ const int hlen = MCTP_HEADER_MAXLEN + sizeof(struct mctp_hdr);
+ int rc, addrlen = msg->msg_namelen;
+ struct sock *sk = sock->sk;
+ struct mctp_skb_cb *cb;
+ struct mctp_route *rt;
+ struct sk_buff *skb;
+
+ if (addr) {
+ if (addrlen < sizeof(struct sockaddr_mctp))
+ return -EINVAL;
+ if (addr->smctp_family != AF_MCTP)
+ return -EINVAL;
+ if (addr->smctp_tag & ~(MCTP_TAG_MASK | MCTP_TAG_OWNER))
+ return -EINVAL;
+
+ } else {
+ /* TODO: connect()ed sockets */
+ return -EDESTADDRREQ;
+ }
+
+ if (!capable(CAP_NET_RAW))
+ return -EACCES;
+
+ rt = mctp_route_lookup(sock_net(sk), addr->smctp_network,
+ addr->smctp_addr.s_addr);
+ if (!rt)
+ return -EHOSTUNREACH;
+
+ skb = sock_alloc_send_skb(sk, hlen + 1 + len,
+ msg->msg_flags & MSG_DONTWAIT, &rc);
+ if (!skb)
+ return rc;
+
+ skb_reserve(skb, hlen);
+
+ /* set type as fist byte in payload */
+ *(u8 *)skb_put(skb, 1) = addr->smctp_type;
+
+ rc = memcpy_from_msg((void *)skb_put(skb, len), msg, len);
+ if (rc < 0) {
+ kfree_skb(skb);
+ return rc;
+ }
+
+ /* set up cb */
+ cb = __mctp_cb(skb);
+ cb->net = addr->smctp_network;
+
+ rc = mctp_local_output(sk, rt, skb, addr->smctp_addr.s_addr,
+ addr->smctp_tag);
+
+ return rc ? : len;
}
static int mctp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
int flags)
{
- return 0;
+ DECLARE_SOCKADDR(struct sockaddr_mctp *, addr, msg->msg_name);
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ size_t msglen;
+ u8 type;
+ int rc;
+
+ if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK))
+ return -EOPNOTSUPP;
+
+ skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &rc);
+ if (!skb)
+ return rc;
+
+ if (!skb->len) {
+ rc = 0;
+ goto out_free;
+ }
+
+ /* extract message type, remove from data */
+ type = *((u8 *)skb->data);
+ msglen = skb->len - 1;
+
+ if (len < msglen)
+ msg->msg_flags |= MSG_TRUNC;
+ else
+ len = msglen;
+
+ rc = skb_copy_datagram_msg(skb, 1, msg, len);
+ if (rc < 0)
+ goto out_free;
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ if (addr) {
+ struct mctp_skb_cb *cb = mctp_cb(skb);
+ /* TODO: expand mctp_skb_cb for header fields? */
+ struct mctp_hdr *hdr = mctp_hdr(skb);
+
+ hdr = mctp_hdr(skb);
+ addr = msg->msg_name;
+ addr->smctp_family = AF_MCTP;
+ addr->smctp_network = cb->net;
+ addr->smctp_addr.s_addr = hdr->src;
+ addr->smctp_type = type;
+ addr->smctp_tag = hdr->flags_seq_tag &
+ (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO);
+ msg->msg_namelen = sizeof(*addr);
+ }
+
+ rc = len;
+
+ if (flags & MSG_TRUNC)
+ rc = msglen;
+
+out_free:
+ skb_free_datagram(sk, skb);
+ return rc;
}
static int mctp_setsockopt(struct socket *sock, int level, int optname,
@@ -83,16 +221,63 @@ static const struct proto_ops mctp_dgram_ops = {
.sendpage = sock_no_sendpage,
};
+static int mctp_sk_init(struct sock *sk)
+{
+ struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
+
+ INIT_HLIST_HEAD(&msk->keys);
+ return 0;
+}
+
static void mctp_sk_close(struct sock *sk, long timeout)
{
sk_common_release(sk);
}
+static int mctp_sk_hash(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+
+ mutex_lock(&net->mctp.bind_lock);
+ sk_add_node_rcu(sk, &net->mctp.binds);
+ mutex_unlock(&net->mctp.bind_lock);
+
+ return 0;
+}
+
+static void mctp_sk_unhash(struct sock *sk)
+{
+ struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
+ struct net *net = sock_net(sk);
+ struct mctp_sk_key *key;
+ struct hlist_node *tmp;
+ unsigned long flags;
+
+ /* remove from any type-based binds */
+ mutex_lock(&net->mctp.bind_lock);
+ sk_del_node_init_rcu(sk);
+ mutex_unlock(&net->mctp.bind_lock);
+
+ /* remove tag allocations */
+ spin_lock_irqsave(&net->mctp.keys_lock, flags);
+ hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) {
+ hlist_del_rcu(&key->sklist);
+ hlist_del_rcu(&key->hlist);
+ kfree_rcu(key, rcu);
+ }
+ spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
+
+ synchronize_rcu();
+}
+
static struct proto mctp_proto = {
.name = "MCTP",
.owner = THIS_MODULE,
.obj_size = sizeof(struct mctp_sock),
+ .init = mctp_sk_init,
.close = mctp_sk_close,
+ .hash = mctp_sk_hash,
+ .unhash = mctp_sk_unhash,
};
static int mctp_pf_create(struct net *net, struct socket *sock,
@@ -147,6 +332,10 @@ static __init int mctp_init(void)
{
int rc;
+ /* ensure our uapi tag definitions match the header format */
+ BUILD_BUG_ON(MCTP_TAG_OWNER != MCTP_HDR_FLAG_TO);
+ BUILD_BUG_ON(MCTP_TAG_MASK != MCTP_HDR_TAG_MASK);
+
pr_info("mctp: management component transport protocol core\n");
rc = sock_register(&mctp_pf);
@@ -30,10 +30,139 @@ static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb)
return 0;
}
+static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb)
+{
+ struct mctp_skb_cb *cb = mctp_cb(skb);
+ struct mctp_hdr *mh;
+ struct sock *sk;
+ u8 type;
+
+ WARN_ON(!rcu_read_lock_held());
+
+ /* TODO: look up in skb->cb? */
+ mh = mctp_hdr(skb);
+
+ if (!skb_headlen(skb))
+ return NULL;
+
+ type = (*(u8 *)skb->data) & 0x7f;
+
+ sk_for_each_rcu(sk, &net->mctp.binds) {
+ struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
+
+ if (msk->bind_net != MCTP_NET_ANY && msk->bind_net != cb->net)
+ continue;
+
+ if (msk->bind_type != type)
+ continue;
+
+ if (msk->bind_addr != MCTP_ADDR_ANY &&
+ msk->bind_addr != mh->dest)
+ continue;
+
+ return msk;
+ }
+
+ return NULL;
+}
+
+static bool mctp_key_match(struct mctp_sk_key *key, mctp_eid_t local,
+ mctp_eid_t peer, u8 tag)
+{
+ if (key->local_addr != local)
+ return false;
+
+ if (key->peer_addr != peer)
+ return false;
+
+ if (key->tag != tag)
+ return false;
+
+ return true;
+}
+
+static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb,
+ mctp_eid_t peer)
+{
+ struct mctp_sk_key *key, *ret;
+ struct mctp_hdr *mh;
+ u8 tag;
+
+ WARN_ON(!rcu_read_lock_held());
+
+ mh = mctp_hdr(skb);
+ tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO);
+
+ ret = NULL;
+
+ hlist_for_each_entry_rcu(key, &net->mctp.keys, hlist) {
+ if (mctp_key_match(key, mh->dest, peer, tag)) {
+ ret = key;
+ break;
+ }
+ }
+
+ return ret;
+}
+
static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
{
- /* -> to local stack */
- /* TODO: socket lookup, reassemble */
+ struct net *net = dev_net(skb->dev);
+ struct mctp_sk_key *key;
+ struct mctp_sock *msk;
+ struct mctp_hdr *mh;
+
+ msk = NULL;
+
+ /* we may be receiving a locally-routed packet; drop source sk
+ * accounting
+ */
+ skb_orphan(skb);
+
+ /* ensure we have enough data for a header and a type */
+ if (skb->len < sizeof(struct mctp_hdr) + 1)
+ goto drop;
+
+ /* grab header, advance data ptr */
+ mh = mctp_hdr(skb);
+ skb_pull(skb, sizeof(struct mctp_hdr));
+
+ if (mh->ver != 1)
+ goto drop;
+
+ /* TODO: reassembly */
+ if ((mh->flags_seq_tag & (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM))
+ != (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM))
+ goto drop;
+
+ rcu_read_lock();
+ /* 1. lookup socket matching (src,dest,tag) */
+ key = mctp_lookup_key(net, skb, mh->src);
+
+ /* 2. lookup socket macthing (BCAST,dest,tag) */
+ if (!key)
+ key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY);
+
+ /* 3. SOM? -> lookup bound socket, conditionally (!EOM) create
+ * mapping for future (1)/(2).
+ */
+ if (key)
+ msk = container_of(key->sk, struct mctp_sock, sk);
+ else if (!msk && (mh->flags_seq_tag & MCTP_HDR_FLAG_SOM))
+ msk = mctp_lookup_bind(net, skb);
+
+ if (!msk)
+ goto unlock_drop;
+
+ sock_queue_rcv_skb(&msk->sk, skb);
+
+ rcu_read_unlock();
+
+ return 0;
+
+unlock_drop:
+ rcu_read_unlock();
+drop:
kfree_skb(skb);
return 0;
}
@@ -91,6 +220,80 @@ static struct mctp_route *mctp_route_alloc(void)
return rt;
}
+/* tag management */
+static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key,
+ struct mctp_sock *msk)
+{
+ struct netns_mctp *mns = &net->mctp;
+
+ lockdep_assert_held(&mns->keys_lock);
+
+ key->sk = &msk->sk;
+
+ /* we hold the net->key_lock here, allowing updates to both
+ * then net and sk
+ */
+ hlist_add_head_rcu(&key->hlist, &mns->keys);
+ hlist_add_head_rcu(&key->sklist, &msk->keys);
+}
+
+/* Allocate a locally-owned tag value for (saddr, daddr), and reserve
+ * it for the socket msk
+ */
+static int mctp_alloc_local_tag(struct mctp_sock *msk,
+ mctp_eid_t saddr, mctp_eid_t daddr, u8 *tagp)
+{
+ struct net *net = sock_net(&msk->sk);
+ struct netns_mctp *mns = &net->mctp;
+ struct mctp_sk_key *key, *tmp;
+ unsigned long flags;
+ int rc = -EAGAIN;
+ u8 tagbits;
+
+ /* be optimistic, alloc now */
+ key = kzalloc(sizeof(*key), GFP_KERNEL);
+ if (!key)
+ return -ENOMEM;
+ key->local_addr = saddr;
+ key->peer_addr = daddr;
+
+ /* 8 possible tag values */
+ tagbits = 0xff;
+
+ spin_lock_irqsave(&mns->keys_lock, flags);
+
+ /* Walk through the existing keys, looking for potential conflicting
+ * tags. If we find a conflict, clear that bit from tagbits
+ */
+ hlist_for_each_entry(tmp, &mns->keys, hlist) {
+ /* if we don't own the tag, it can't conflict */
+ if (tmp->tag & MCTP_HDR_FLAG_TO)
+ continue;
+
+ if ((tmp->peer_addr == daddr ||
+ tmp->peer_addr == MCTP_ADDR_ANY) &&
+ tmp->local_addr == saddr)
+ tagbits &= ~(1 << tmp->tag);
+
+ if (!tagbits)
+ break;
+ }
+
+ if (tagbits) {
+ key->tag = __ffs(tagbits);
+ mctp_reserve_tag(net, key, msk);
+ *tagp = key->tag;
+ rc = 0;
+ }
+
+ spin_unlock_irqrestore(&mns->keys_lock, flags);
+
+ if (!tagbits)
+ kfree(key);
+
+ return rc;
+}
+
/* routing lookups */
static bool mctp_rt_match_eid(struct mctp_route *rt,
unsigned int net, mctp_eid_t eid)
@@ -140,11 +343,13 @@ int mctp_do_route(struct mctp_route *rt, struct sk_buff *skb)
int mctp_local_output(struct sock *sk, struct mctp_route *rt,
struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag)
{
+ struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
struct mctp_skb_cb *cb = mctp_cb(skb);
struct mctp_hdr *hdr;
unsigned long flags;
mctp_eid_t saddr;
int rc;
+ u8 tag;
if (WARN_ON(!rt->dev))
return -EINVAL;
@@ -162,6 +367,15 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
if (rc)
return rc;
+ if (req_tag & MCTP_HDR_FLAG_TO) {
+ rc = mctp_alloc_local_tag(msk, saddr, daddr, &tag);
+ if (rc)
+ return rc;
+ tag |= MCTP_HDR_FLAG_TO;
+ } else {
+ tag = req_tag;
+ }
+
/* TODO: we have the route MTU here; packetise */
skb_reset_transport_header(skb);
@@ -171,8 +385,10 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
hdr->ver = 1;
hdr->dest = daddr;
hdr->src = saddr;
- hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM; /* TODO */
+ hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM | /* TODO */
+ tag;
+ skb->dev = rt->dev->dev;
skb->protocol = htons(ETH_P_MCTP);
skb->priority = 0;
@@ -529,6 +745,10 @@ static int __net_init mctp_routes_net_init(struct net *net)
struct netns_mctp *ns = &net->mctp;
INIT_LIST_HEAD(&ns->routes);
+ INIT_HLIST_HEAD(&ns->binds);
+ mutex_init(&ns->bind_lock);
+ INIT_HLIST_HEAD(&ns->keys);
+ spin_lock_init(&ns->keys_lock);
return 0;
}
Start filling-out the socket syscalls: bind, sendmsg & recvmsg. This requires an input route implementation, so we add to mctp_route_input, allowing lookups on binds & message tags. This just handles single-packet messages at present, we will add fragmentation in a future change. Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au> --- v2: - require CAP_NET_BIND_SERVICE for bind(), CAP_NET_RAW for TX. - be strict about sendmsg() tag bits v3: - fix comment typos v4: - fix mctp_sock->keys rcu annotations --- include/net/mctp.h | 59 ++++++++++ include/net/netns/mctp.h | 13 +++ net/mctp/af_mctp.c | 203 +++++++++++++++++++++++++++++++++-- net/mctp/route.c | 226 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 491 insertions(+), 10 deletions(-)