From patchwork Wed Oct 28 05:42:53 2009 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Jason Gunthorpe X-Patchwork-Id: 56224 Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by demeter.kernel.org (8.14.2/8.14.2) with ESMTP id n9S5gvIp031475 for ; Wed, 28 Oct 2009 05:43:04 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753087AbZJ1Fm7 (ORCPT ); Wed, 28 Oct 2009 01:42:59 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1754300AbZJ1Fm7 (ORCPT ); Wed, 28 Oct 2009 01:42:59 -0400 Received: from 139-142-54-143.atc.vaillant.ca ([139.142.54.143]:33497 "EHLO quartz.edm.orcorp.ca" rhost-flags-OK-FAIL-OK-FAIL) by vger.kernel.org with ESMTP id S1753087AbZJ1Fm5 (ORCPT ); Wed, 28 Oct 2009 01:42:57 -0400 Received: from bertha1.edm.orcorp.ca ([10.0.0.45]) by quartz.edm.orcorp.ca with esmtps (TLS-1.0:RSA_AES_256_CBC_SHA1:32) (Exim 4.68) (envelope-from ) id 1N31J8-0001DL-06; Tue, 27 Oct 2009 23:42:54 -0600 Received: from jgg by bertha1.edm.orcorp.ca with local (Exim 4.67) (envelope-from ) id 1N31J7-0005yM-IQ; Tue, 27 Oct 2009 23:42:53 -0600 Date: Tue, 27 Oct 2009 23:42:53 -0600 From: Jason Gunthorpe To: "David J. Wilder" , leo.tominna@oracle.com Cc: Sean Hefty , linux-rdma Subject: [PATCH RDMA] Fixup IPv6 support and IPv4 routing corner cases for RDMA CM Message-ID: <20091028054253.GA22882@obsidianresearch.com> MIME-Version: 1.0 Content-Disposition: inline User-Agent: Mutt/1.5.15+20070412 (2007-04-11) Sender: linux-rdma-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-rdma@vger.kernel.org diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index bd07803..1b6d419 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -107,7 +107,7 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); if (dst_dev_addr) memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN); - dev_addr->src_dev = dev; + dev_addr->bound_dev_if = dev->ifindex; return 0; } EXPORT_SYMBOL(rdma_copy_addr); @@ -117,6 +117,16 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) struct net_device *dev; int ret = -EADDRNOTAVAIL; + /* Someone has specified we must bind to a specific interface.. */ + if (dev_addr->bound_dev_if) { + dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + if (!dev) + return -ENODEV; + ret = rdma_copy_addr(dev_addr, dev, NULL); + dev_put(dev); + return ret; + } + switch (addr->sa_family) { case AF_INET: dev = ip_dev_find(&init_net, @@ -176,48 +186,9 @@ static void queue_req(struct addr_req *req) mutex_unlock(&lock); } -static void addr_send_arp(struct sockaddr *dst_in) -{ - struct rtable *rt; - struct flowi fl; - - memset(&fl, 0, sizeof fl); - - switch (dst_in->sa_family) { - case AF_INET: - fl.nl_u.ip4_u.daddr = - ((struct sockaddr_in *) dst_in)->sin_addr.s_addr; - - if (ip_route_output_key(&init_net, &rt, &fl)) - return; - - neigh_event_send(rt->u.dst.neighbour, NULL); - ip_rt_put(rt); - break; - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - case AF_INET6: - { - struct dst_entry *dst; - - fl.nl_u.ip6_u.daddr = - ((struct sockaddr_in6 *) dst_in)->sin6_addr; - - dst = ip6_route_output(&init_net, NULL, &fl); - if (!dst) - return; - - neigh_event_send(dst->neighbour, NULL); - dst_release(dst); - break; - } -#endif - } -} - -static int addr4_resolve_remote(struct sockaddr_in *src_in, - struct sockaddr_in *dst_in, - struct rdma_dev_addr *addr) +static int addr4_resolve(struct sockaddr_in *src_in, + struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr) { __be32 src_ip = src_in->sin_addr.s_addr; __be32 dst_ip = dst_in->sin_addr.s_addr; @@ -229,10 +200,22 @@ static int addr4_resolve_remote(struct sockaddr_in *src_in, memset(&fl, 0, sizeof fl); fl.nl_u.ip4_u.daddr = dst_ip; fl.nl_u.ip4_u.saddr = src_ip; + fl.oif = addr->bound_dev_if; ret = ip_route_output_key(&init_net, &rt, &fl); if (ret) goto out; + src_in->sin_family = AF_INET; + src_in->sin_addr.s_addr = rt->rt_src; + + if (rt->idev->dev == init_net.loopback_dev) { + ret = rdma_translate_ip((struct sockaddr *)dst_in, addr); + if (!ret) + memcpy(addr->dst_dev_addr, addr->src_dev_addr, + MAX_ADDR_LEN); + goto put; + } + /* If the device does ARP internally, return 'done' */ if (rt->idev->dev->flags & IFF_NOARP) { rdma_copy_addr(addr, rt->idev->dev, NULL); @@ -240,24 +223,16 @@ static int addr4_resolve_remote(struct sockaddr_in *src_in, } neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->idev->dev); - if (!neigh) { + if (!neigh || !(neigh->nud_state & NUD_VALID)) { + neigh_event_send(rt->u.dst.neighbour, NULL); ret = -ENODATA; + if (neigh) + neigh_release(neigh); goto put; } - - if (!(neigh->nud_state & NUD_VALID)) { - ret = -ENODATA; - goto release; - } - - if (!src_ip) { - src_in->sin_family = dst_in->sin_family; - src_in->sin_addr.s_addr = rt->rt_src; - } + neigh_release(neigh); ret = rdma_copy_addr(addr, neigh->dev, neigh->ha); -release: - neigh_release(neigh); put: ip_rt_put(rt); out: @@ -265,36 +240,64 @@ out: } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static int addr6_resolve_remote(struct sockaddr_in6 *src_in, - struct sockaddr_in6 *dst_in, - struct rdma_dev_addr *addr) +static int addr6_resolve(struct sockaddr_in6 *src_in, + struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr) { struct flowi fl; struct neighbour *neigh; struct dst_entry *dst; - int ret = -ENODATA; + int ret; memset(&fl, 0, sizeof fl); - fl.nl_u.ip6_u.daddr = dst_in->sin6_addr; - fl.nl_u.ip6_u.saddr = src_in->sin6_addr; + ipv6_addr_copy(&fl.fl6_dst, &dst_in->sin6_addr); + ipv6_addr_copy(&fl.fl6_src, &src_in->sin6_addr); + fl.oif = addr->bound_dev_if; dst = ip6_route_output(&init_net, NULL, &fl); - if (!dst) - return ret; + if ((ret = dst->error)) + goto put; + + if (ipv6_addr_any(&fl.fl6_src)) { + ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev, + &fl.fl6_dst, 0, + &fl.fl6_src); + if (!ret) + goto put; + src_in->sin6_family = AF_INET6; + ipv6_addr_copy(&src_in->sin6_addr, &fl.fl6_src); + } + + if (ip6_dst_idev(dst)->dev == init_net.loopback_dev) { + ret = rdma_translate_ip((struct sockaddr *)dst_in, addr); + if (!ret) + memcpy(addr->dst_dev_addr, addr->src_dev_addr, + MAX_ADDR_LEN); + goto put; + } - if (dst->dev->flags & IFF_NOARP) { - ret = rdma_copy_addr(addr, dst->dev, NULL); - } else { - neigh = dst->neighbour; - if (neigh && (neigh->nud_state & NUD_VALID)) - ret = rdma_copy_addr(addr, neigh->dev, neigh->ha); + /* If the device does ARP internally, return 'done' */ + if (ip6_dst_idev(dst)->dev->flags & IFF_NOARP) { + ret = rdma_copy_addr(addr, ip6_dst_idev(dst)->dev, NULL); + goto put; } + /* XXX why is this different from the addr4_resolve case? + Is the neigh_lookup redundant? */ + neigh = dst->neighbour; + if (!neigh || !(neigh->nud_state & NUD_VALID)) { + neigh_event_send(dst->neighbour, NULL); + ret = -ENODATA; + goto put; + } + + ret = rdma_copy_addr(addr, neigh->dev, neigh->ha); +put: dst_release(dst); return ret; } #else -static int addr6_resolve_remote(struct sockaddr_in6 *src_in, +static int addr6_resolve(struct sockaddr_in6 *src_in, struct sockaddr_in6 *dst_in, struct rdma_dev_addr *addr) { @@ -302,15 +305,15 @@ static int addr6_resolve_remote(struct sockaddr_in6 *src_in, } #endif -static int addr_resolve_remote(struct sockaddr *src_in, - struct sockaddr *dst_in, - struct rdma_dev_addr *addr) +static int addr_resolve(struct sockaddr *src_in, + struct sockaddr *dst_in, + struct rdma_dev_addr *addr) { if (src_in->sa_family == AF_INET) { - return addr4_resolve_remote((struct sockaddr_in *) src_in, + return addr4_resolve((struct sockaddr_in *) src_in, (struct sockaddr_in *) dst_in, addr); } else - return addr6_resolve_remote((struct sockaddr_in6 *) src_in, + return addr6_resolve((struct sockaddr_in6 *) src_in, (struct sockaddr_in6 *) dst_in, addr); } @@ -327,8 +330,7 @@ static void process_req(struct work_struct *work) if (req->status == -ENODATA) { src_in = (struct sockaddr *) &req->src_addr; dst_in = (struct sockaddr *) &req->dst_addr; - req->status = addr_resolve_remote(src_in, dst_in, - req->addr); + req->status = addr_resolve(src_in, dst_in, req->addr); if (req->status && time_after_eq(jiffies, req->timeout)) req->status = -ETIMEDOUT; else if (req->status == -ENODATA) @@ -352,82 +354,6 @@ static void process_req(struct work_struct *work) } } -static int addr_resolve_local(struct sockaddr *src_in, - struct sockaddr *dst_in, - struct rdma_dev_addr *addr) -{ - struct net_device *dev; - int ret; - - switch (dst_in->sa_family) { - case AF_INET: - { - __be32 src_ip = ((struct sockaddr_in *) src_in)->sin_addr.s_addr; - __be32 dst_ip = ((struct sockaddr_in *) dst_in)->sin_addr.s_addr; - - dev = ip_dev_find(&init_net, dst_ip); - if (!dev) - return -EADDRNOTAVAIL; - - if (ipv4_is_zeronet(src_ip)) { - src_in->sa_family = dst_in->sa_family; - ((struct sockaddr_in *) src_in)->sin_addr.s_addr = dst_ip; - ret = rdma_copy_addr(addr, dev, dev->dev_addr); - } else if (ipv4_is_loopback(src_ip)) { - ret = rdma_translate_ip(dst_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); - } else { - ret = rdma_translate_ip(src_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); - } - dev_put(dev); - break; - } - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - case AF_INET6: - { - struct in6_addr *a; - - for_each_netdev(&init_net, dev) - if (ipv6_chk_addr(&init_net, - &((struct sockaddr_in6 *) dst_in)->sin6_addr, - dev, 1)) - break; - - if (!dev) - return -EADDRNOTAVAIL; - - a = &((struct sockaddr_in6 *) src_in)->sin6_addr; - - if (ipv6_addr_any(a)) { - src_in->sa_family = dst_in->sa_family; - ((struct sockaddr_in6 *) src_in)->sin6_addr = - ((struct sockaddr_in6 *) dst_in)->sin6_addr; - ret = rdma_copy_addr(addr, dev, dev->dev_addr); - } else if (ipv6_addr_loopback(a)) { - ret = rdma_translate_ip(dst_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); - } else { - ret = rdma_translate_ip(src_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); - } - break; - } -#endif - - default: - ret = -EADDRNOTAVAIL; - break; - } - - return ret; -} - int rdma_resolve_ip(struct rdma_addr_client *client, struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, @@ -455,10 +381,11 @@ int rdma_resolve_ip(struct rdma_addr_client *client, src_in = (struct sockaddr *) &req->src_addr; dst_in = (struct sockaddr *) &req->dst_addr; - req->status = addr_resolve_local(src_in, dst_in, addr); - if (req->status == -EADDRNOTAVAIL) - req->status = addr_resolve_remote(src_in, dst_in, addr); + req->status = addr_resolve(src_in, dst_in, addr); + /* XXX this scares me, ENODATA is not special, there is no guarentee + that any of the functions in the IP stack called by addr_resolve + don't return ENODATA in some cases.. */ switch (req->status) { case 0: req->timeout = jiffies; @@ -467,7 +394,6 @@ int rdma_resolve_ip(struct rdma_addr_client *client, case -ENODATA: req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; queue_req(req); - addr_send_arp(dst_in); break; default: ret = req->status; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 0753178..a0fa241 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1875,13 +1875,25 @@ err: return ret; } -static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - struct sockaddr *dst_addr) +/* IPv6 link local addresses must specify a consistent set of devices + to bind to. */ +static int check_ip6_ll(struct rdma_dev_addr *dev_addr, + const struct sockaddr *addr) { - if (src_addr && src_addr->sa_family) - return rdma_bind_addr(id, src_addr); - else - return cma_bind_any(id, dst_addr->sa_family); + const struct sockaddr_in6 *addr6 = (const struct sockaddr_in6 *)addr; + + if (addr->sa_family != AF_INET6) + return 0; + + if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) { + if (dev_addr->bound_dev_if && + dev_addr->bound_dev_if != addr6->sin6_scope_id) + return -EINVAL; + dev_addr->bound_dev_if = addr6->sin6_scope_id; + if (!dev_addr->bound_dev_if) + return -EINVAL; + } + return 0; } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, @@ -1891,8 +1903,16 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, int ret; id_priv = container_of(id, struct rdma_id_private, id); + + if (src_addr) { + if ((ret = check_ip6_ll(&id->route.addr.dev_addr, src_addr))) + return ret; + } + if ((ret = check_ip6_ll(&id->route.addr.dev_addr, dst_addr))) + return ret; + if (id_priv->state == CMA_IDLE) { - ret = cma_bind_addr(id, src_addr, dst_addr); + ret = cma_bind_any(id, dst_addr->sa_family); if (ret) return ret; } @@ -1900,12 +1920,18 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_ADDR_QUERY)) return -EINVAL; + /* NOTE: If the user has called rdma_bind_addr() and then passes + a src_addr value here the interpretation is that they wished + to use whatever address they gave to bind to choose a device, + and this address as the source IP for the connection. */ + atomic_inc(&id_priv->refcount); memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr)); if (cma_any_addr(dst_addr)) ret = cma_resolve_loopback(id_priv); else - ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr, + ret = rdma_resolve_ip(&addr_client, + src_addr ? src_addr : (struct sockaddr *) &id->route.addr.src_addr, dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, id_priv); if (ret) @@ -2077,6 +2103,8 @@ static int cma_get_port(struct rdma_id_private *id_priv) return ret; } +/* rdma_bind_addr does a combination of what socket bind() and + SO_BINDTODEVICE would do. */ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; @@ -2085,10 +2113,14 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) return -EAFNOSUPPORT; + if ((ret = check_ip6_ll(&id->route.addr.dev_addr, addr))) + return ret; + id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND)) return -EINVAL; + /* Do socket SO_BINDTODEVICE like behavior */ if (!cma_any_addr(addr)) { ret = rdma_translate_ip(addr, &id->route.addr.dev_addr); if (ret) @@ -2101,6 +2133,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) goto err1; } + /* Do socket bind() like behavior */ memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr)); ret = cma_get_port(id_priv); if (ret) @@ -2815,7 +2848,7 @@ static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id dev_addr = &id_priv->id.route.addr.dev_addr; - if ((dev_addr->src_dev == ndev) && + if ((dev_addr->bound_dev_if == ndev->ifindex) && memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", ndev->name, &id_priv->id); diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 483057b..27f17cc 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -61,7 +61,7 @@ struct rdma_dev_addr { unsigned char dst_dev_addr[MAX_ADDR_LEN]; unsigned char broadcast[MAX_ADDR_LEN]; enum rdma_node_type dev_type; - struct net_device *src_dev; + int bound_dev_if; }; /**