@@ -107,7 +107,7 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
if (dst_dev_addr)
memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
- dev_addr->src_dev = dev;
+ dev_addr->bound_dev_if = dev->ifindex;
return 0;
}
EXPORT_SYMBOL(rdma_copy_addr);
@@ -117,6 +117,16 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
struct net_device *dev;
int ret = -EADDRNOTAVAIL;
+ /* Someone has specified we must bind to a specific interface.. */
+ if (dev_addr->bound_dev_if) {
+ dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+ if (!dev)
+ return -ENODEV;
+ ret = rdma_copy_addr(dev_addr, dev, NULL);
+ dev_put(dev);
+ return ret;
+ }
+
switch (addr->sa_family) {
case AF_INET:
dev = ip_dev_find(&init_net,
@@ -176,48 +186,9 @@ static void queue_req(struct addr_req *req)
mutex_unlock(&lock);
}
-static void addr_send_arp(struct sockaddr *dst_in)
-{
- struct rtable *rt;
- struct flowi fl;
-
- memset(&fl, 0, sizeof fl);
-
- switch (dst_in->sa_family) {
- case AF_INET:
- fl.nl_u.ip4_u.daddr =
- ((struct sockaddr_in *) dst_in)->sin_addr.s_addr;
-
- if (ip_route_output_key(&init_net, &rt, &fl))
- return;
-
- neigh_event_send(rt->u.dst.neighbour, NULL);
- ip_rt_put(rt);
- break;
-
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
- case AF_INET6:
- {
- struct dst_entry *dst;
-
- fl.nl_u.ip6_u.daddr =
- ((struct sockaddr_in6 *) dst_in)->sin6_addr;
-
- dst = ip6_route_output(&init_net, NULL, &fl);
- if (!dst)
- return;
-
- neigh_event_send(dst->neighbour, NULL);
- dst_release(dst);
- break;
- }
-#endif
- }
-}
-
-static int addr4_resolve_remote(struct sockaddr_in *src_in,
- struct sockaddr_in *dst_in,
- struct rdma_dev_addr *addr)
+static int addr4_resolve(struct sockaddr_in *src_in,
+ struct sockaddr_in *dst_in,
+ struct rdma_dev_addr *addr)
{
__be32 src_ip = src_in->sin_addr.s_addr;
__be32 dst_ip = dst_in->sin_addr.s_addr;
@@ -229,10 +200,22 @@ static int addr4_resolve_remote(struct sockaddr_in *src_in,
memset(&fl, 0, sizeof fl);
fl.nl_u.ip4_u.daddr = dst_ip;
fl.nl_u.ip4_u.saddr = src_ip;
+ fl.oif = addr->bound_dev_if;
ret = ip_route_output_key(&init_net, &rt, &fl);
if (ret)
goto out;
+ src_in->sin_family = AF_INET;
+ src_in->sin_addr.s_addr = rt->rt_src;
+
+ if (rt->idev->dev == init_net.loopback_dev) {
+ ret = rdma_translate_ip((struct sockaddr *)dst_in, addr);
+ if (!ret)
+ memcpy(addr->dst_dev_addr, addr->src_dev_addr,
+ MAX_ADDR_LEN);
+ goto put;
+ }
+
/* If the device does ARP internally, return 'done' */
if (rt->idev->dev->flags & IFF_NOARP) {
rdma_copy_addr(addr, rt->idev->dev, NULL);
@@ -240,24 +223,16 @@ static int addr4_resolve_remote(struct sockaddr_in *src_in,
}
neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->idev->dev);
- if (!neigh) {
+ if (!neigh || !(neigh->nud_state & NUD_VALID)) {
+ neigh_event_send(rt->u.dst.neighbour, NULL);
ret = -ENODATA;
+ if (neigh)
+ neigh_release(neigh);
goto put;
}
-
- if (!(neigh->nud_state & NUD_VALID)) {
- ret = -ENODATA;
- goto release;
- }
-
- if (!src_ip) {
- src_in->sin_family = dst_in->sin_family;
- src_in->sin_addr.s_addr = rt->rt_src;
- }
+ neigh_release(neigh);
ret = rdma_copy_addr(addr, neigh->dev, neigh->ha);
-release:
- neigh_release(neigh);
put:
ip_rt_put(rt);
out:
@@ -265,36 +240,64 @@ out:
}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static int addr6_resolve_remote(struct sockaddr_in6 *src_in,
- struct sockaddr_in6 *dst_in,
- struct rdma_dev_addr *addr)
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+ struct sockaddr_in6 *dst_in,
+ struct rdma_dev_addr *addr)
{
struct flowi fl;
struct neighbour *neigh;
struct dst_entry *dst;
- int ret = -ENODATA;
+ int ret;
memset(&fl, 0, sizeof fl);
- fl.nl_u.ip6_u.daddr = dst_in->sin6_addr;
- fl.nl_u.ip6_u.saddr = src_in->sin6_addr;
+ ipv6_addr_copy(&fl.fl6_dst, &dst_in->sin6_addr);
+ ipv6_addr_copy(&fl.fl6_src, &src_in->sin6_addr);
+ fl.oif = addr->bound_dev_if;
dst = ip6_route_output(&init_net, NULL, &fl);
- if (!dst)
- return ret;
+ if ((ret = dst->error))
+ goto put;
+
+ if (ipv6_addr_any(&fl.fl6_src)) {
+ ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev,
+ &fl.fl6_dst, 0,
+ &fl.fl6_src);
+ if (!ret)
+ goto put;
+ src_in->sin6_family = AF_INET6;
+ ipv6_addr_copy(&src_in->sin6_addr, &fl.fl6_src);
+ }
+
+ if (ip6_dst_idev(dst)->dev == init_net.loopback_dev) {
+ ret = rdma_translate_ip((struct sockaddr *)dst_in, addr);
+ if (!ret)
+ memcpy(addr->dst_dev_addr, addr->src_dev_addr,
+ MAX_ADDR_LEN);
+ goto put;
+ }
- if (dst->dev->flags & IFF_NOARP) {
- ret = rdma_copy_addr(addr, dst->dev, NULL);
- } else {
- neigh = dst->neighbour;
- if (neigh && (neigh->nud_state & NUD_VALID))
- ret = rdma_copy_addr(addr, neigh->dev, neigh->ha);
+ /* If the device does ARP internally, return 'done' */
+ if (ip6_dst_idev(dst)->dev->flags & IFF_NOARP) {
+ ret = rdma_copy_addr(addr, ip6_dst_idev(dst)->dev, NULL);
+ goto put;
}
+ /* XXX why is this different from the addr4_resolve case?
+ Is the neigh_lookup redundant? */
+ neigh = dst->neighbour;
+ if (!neigh || !(neigh->nud_state & NUD_VALID)) {
+ neigh_event_send(dst->neighbour, NULL);
+ ret = -ENODATA;
+ goto put;
+ }
+
+ ret = rdma_copy_addr(addr, neigh->dev, neigh->ha);
+put:
dst_release(dst);
return ret;
}
#else
-static int addr6_resolve_remote(struct sockaddr_in6 *src_in,
+static int addr6_resolve(struct sockaddr_in6 *src_in,
struct sockaddr_in6 *dst_in,
struct rdma_dev_addr *addr)
{
@@ -302,15 +305,15 @@ static int addr6_resolve_remote(struct sockaddr_in6 *src_in,
}
#endif
-static int addr_resolve_remote(struct sockaddr *src_in,
- struct sockaddr *dst_in,
- struct rdma_dev_addr *addr)
+static int addr_resolve(struct sockaddr *src_in,
+ struct sockaddr *dst_in,
+ struct rdma_dev_addr *addr)
{
if (src_in->sa_family == AF_INET) {
- return addr4_resolve_remote((struct sockaddr_in *) src_in,
+ return addr4_resolve((struct sockaddr_in *) src_in,
(struct sockaddr_in *) dst_in, addr);
} else
- return addr6_resolve_remote((struct sockaddr_in6 *) src_in,
+ return addr6_resolve((struct sockaddr_in6 *) src_in,
(struct sockaddr_in6 *) dst_in, addr);
}
@@ -327,8 +330,7 @@ static void process_req(struct work_struct *work)
if (req->status == -ENODATA) {
src_in = (struct sockaddr *) &req->src_addr;
dst_in = (struct sockaddr *) &req->dst_addr;
- req->status = addr_resolve_remote(src_in, dst_in,
- req->addr);
+ req->status = addr_resolve(src_in, dst_in, req->addr);
if (req->status && time_after_eq(jiffies, req->timeout))
req->status = -ETIMEDOUT;
else if (req->status == -ENODATA)
@@ -352,82 +354,6 @@ static void process_req(struct work_struct *work)
}
}
-static int addr_resolve_local(struct sockaddr *src_in,
- struct sockaddr *dst_in,
- struct rdma_dev_addr *addr)
-{
- struct net_device *dev;
- int ret;
-
- switch (dst_in->sa_family) {
- case AF_INET:
- {
- __be32 src_ip = ((struct sockaddr_in *) src_in)->sin_addr.s_addr;
- __be32 dst_ip = ((struct sockaddr_in *) dst_in)->sin_addr.s_addr;
-
- dev = ip_dev_find(&init_net, dst_ip);
- if (!dev)
- return -EADDRNOTAVAIL;
-
- if (ipv4_is_zeronet(src_ip)) {
- src_in->sa_family = dst_in->sa_family;
- ((struct sockaddr_in *) src_in)->sin_addr.s_addr = dst_ip;
- ret = rdma_copy_addr(addr, dev, dev->dev_addr);
- } else if (ipv4_is_loopback(src_ip)) {
- ret = rdma_translate_ip(dst_in, addr);
- if (!ret)
- memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
- } else {
- ret = rdma_translate_ip(src_in, addr);
- if (!ret)
- memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
- }
- dev_put(dev);
- break;
- }
-
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
- case AF_INET6:
- {
- struct in6_addr *a;
-
- for_each_netdev(&init_net, dev)
- if (ipv6_chk_addr(&init_net,
- &((struct sockaddr_in6 *) dst_in)->sin6_addr,
- dev, 1))
- break;
-
- if (!dev)
- return -EADDRNOTAVAIL;
-
- a = &((struct sockaddr_in6 *) src_in)->sin6_addr;
-
- if (ipv6_addr_any(a)) {
- src_in->sa_family = dst_in->sa_family;
- ((struct sockaddr_in6 *) src_in)->sin6_addr =
- ((struct sockaddr_in6 *) dst_in)->sin6_addr;
- ret = rdma_copy_addr(addr, dev, dev->dev_addr);
- } else if (ipv6_addr_loopback(a)) {
- ret = rdma_translate_ip(dst_in, addr);
- if (!ret)
- memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
- } else {
- ret = rdma_translate_ip(src_in, addr);
- if (!ret)
- memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
- }
- break;
- }
-#endif
-
- default:
- ret = -EADDRNOTAVAIL;
- break;
- }
-
- return ret;
-}
-
int rdma_resolve_ip(struct rdma_addr_client *client,
struct sockaddr *src_addr, struct sockaddr *dst_addr,
struct rdma_dev_addr *addr, int timeout_ms,
@@ -455,10 +381,11 @@ int rdma_resolve_ip(struct rdma_addr_client *client,
src_in = (struct sockaddr *) &req->src_addr;
dst_in = (struct sockaddr *) &req->dst_addr;
- req->status = addr_resolve_local(src_in, dst_in, addr);
- if (req->status == -EADDRNOTAVAIL)
- req->status = addr_resolve_remote(src_in, dst_in, addr);
+ req->status = addr_resolve(src_in, dst_in, addr);
+ /* XXX this scares me, ENODATA is not special, there is no guarentee
+ that any of the functions in the IP stack called by addr_resolve
+ don't return ENODATA in some cases.. */
switch (req->status) {
case 0:
req->timeout = jiffies;
@@ -467,7 +394,6 @@ int rdma_resolve_ip(struct rdma_addr_client *client,
case -ENODATA:
req->timeout = msecs_to_jiffies(timeout_ms) + jiffies;
queue_req(req);
- addr_send_arp(dst_in);
break;
default:
ret = req->status;
@@ -1875,13 +1875,25 @@ err:
return ret;
}
-static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
- struct sockaddr *dst_addr)
+/* IPv6 link local addresses must specify a consistent set of devices
+ to bind to. */
+static int check_ip6_ll(struct rdma_dev_addr *dev_addr,
+ const struct sockaddr *addr)
{
- if (src_addr && src_addr->sa_family)
- return rdma_bind_addr(id, src_addr);
- else
- return cma_bind_any(id, dst_addr->sa_family);
+ const struct sockaddr_in6 *addr6 = (const struct sockaddr_in6 *)addr;
+
+ if (addr->sa_family != AF_INET6)
+ return 0;
+
+ if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) {
+ if (dev_addr->bound_dev_if &&
+ dev_addr->bound_dev_if != addr6->sin6_scope_id)
+ return -EINVAL;
+ dev_addr->bound_dev_if = addr6->sin6_scope_id;
+ if (!dev_addr->bound_dev_if)
+ return -EINVAL;
+ }
+ return 0;
}
int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
@@ -1891,8 +1903,16 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
+
+ if (src_addr) {
+ if ((ret = check_ip6_ll(&id->route.addr.dev_addr, src_addr)))
+ return ret;
+ }
+ if ((ret = check_ip6_ll(&id->route.addr.dev_addr, dst_addr)))
+ return ret;
+
if (id_priv->state == CMA_IDLE) {
- ret = cma_bind_addr(id, src_addr, dst_addr);
+ ret = cma_bind_any(id, dst_addr->sa_family);
if (ret)
return ret;
}
@@ -1900,12 +1920,18 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_ADDR_QUERY))
return -EINVAL;
+ /* NOTE: If the user has called rdma_bind_addr() and then passes
+ a src_addr value here the interpretation is that they wished
+ to use whatever address they gave to bind to choose a device,
+ and this address as the source IP for the connection. */
+
atomic_inc(&id_priv->refcount);
memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr));
if (cma_any_addr(dst_addr))
ret = cma_resolve_loopback(id_priv);
else
- ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr,
+ ret = rdma_resolve_ip(&addr_client,
+ src_addr ? src_addr : (struct sockaddr *) &id->route.addr.src_addr,
dst_addr, &id->route.addr.dev_addr,
timeout_ms, addr_handler, id_priv);
if (ret)
@@ -2077,6 +2103,8 @@ static int cma_get_port(struct rdma_id_private *id_priv)
return ret;
}
+/* rdma_bind_addr does a combination of what socket bind() and
+ SO_BINDTODEVICE would do. */
int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
{
struct rdma_id_private *id_priv;
@@ -2085,10 +2113,14 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
return -EAFNOSUPPORT;
+ if ((ret = check_ip6_ll(&id->route.addr.dev_addr, addr)))
+ return ret;
+
id_priv = container_of(id, struct rdma_id_private, id);
if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND))
return -EINVAL;
+ /* Do socket SO_BINDTODEVICE like behavior */
if (!cma_any_addr(addr)) {
ret = rdma_translate_ip(addr, &id->route.addr.dev_addr);
if (ret)
@@ -2101,6 +2133,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
goto err1;
}
+ /* Do socket bind() like behavior */
memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr));
ret = cma_get_port(id_priv);
if (ret)
@@ -2815,7 +2848,7 @@ static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id
dev_addr = &id_priv->id.route.addr.dev_addr;
- if ((dev_addr->src_dev == ndev) &&
+ if ((dev_addr->bound_dev_if == ndev->ifindex) &&
memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) {
printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n",
ndev->name, &id_priv->id);
@@ -61,7 +61,7 @@ struct rdma_dev_addr {
unsigned char dst_dev_addr[MAX_ADDR_LEN];
unsigned char broadcast[MAX_ADDR_LEN];
enum rdma_node_type dev_type;
- struct net_device *src_dev;
+ int bound_dev_if;
};
/**