===================================================================
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2009-2011 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2009-2011 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/if_vlan.h>
+#include <net/sch_generic.h>
+#include <linux/netfilter.h>
+#include <rdma/ib_addr.h>
+
+#include "rxe.h"
+#include "rxe_net.h"
+
+MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves");
+MODULE_DESCRIPTION("RDMA transport over Converged Enhanced Ethernet");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static int rxe_eth_proto_id = ETH_P_RXE;
+module_param_named(eth_proto_id, rxe_eth_proto_id, int, 0444);
+MODULE_PARM_DESC(eth_proto_id, "Ethernet protocol ID (default/correct=0x8915)");
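+/*
+ * Example (illustrative values): the ethertype can be overridden at
+ * load time for testing, e.g.
+ *
+ *   modprobe rxe_net eth_proto_id=0x88b5
+ *
+ * 0x88b5 is one of the IEEE 802 "local experimental" ethertypes. Both
+ * ends of a link must use the same value or received frames will not
+ * match the packet handler registered below. The value is latched by
+ * dev_add_pack() at module init, so the parameter is read-only at
+ * runtime.
+ */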
+
+static int rxe_xmit_shortcut;
+module_param_named(xmit_shortcut, rxe_xmit_shortcut, int, 0644);
+MODULE_PARM_DESC(xmit_shortcut, "Shortcut transmit (EXPERIMENTAL)");
+
+static int rxe_loopback_mad_grh_fix = 1;
+module_param_named(loopback_mad_grh_fix, rxe_loopback_mad_grh_fix, int, 0644);
+MODULE_PARM_DESC(loopback_mad_grh_fix, "Allow MADs to self without GRH");
+
+/*
+ * note: this table is a replacement for a protocol-specific pointer
+ * in struct net_device, which exists for other ethertypes.
+ * This lets us avoid patching that data structure.
+ * Eventually we want a pointer of our own, when we're famous.
+ */
+struct rxe_net_info net_info[RXE_MAX_IF_INDEX];
+DEFINE_SPINLOCK(net_info_lock);
+
+static int rxe_net_rcv(struct sk_buff *skb,
+ struct net_device *ndev,
+ struct packet_type *ptype,
+ struct net_device *orig_dev);
+
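+/*
+ * Derive a modified EUI-64 from the device MAC address (RFC 4291,
+ * appendix A): flip the universal/local bit of the first octet and
+ * insert 0xff, 0xfe between the OUI and the NIC-specific bytes. For
+ * example, the (hypothetical) MAC 00:25:90:ab:cd:ef yields the EUI-64
+ * 02:25:90:ff:fe:ab:cd:ef.
+ */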
+static __be64 rxe_mac_to_eui64(struct net_device *ndev)
+{
+ unsigned char *mac_addr = ndev->dev_addr;
+ __be64 eui64;
+ unsigned char *dst = (unsigned char *)&eui64;
+
+ dst[0] = mac_addr[0] ^ 2;
+ dst[1] = mac_addr[1];
+ dst[2] = mac_addr[2];
+ dst[3] = 0xff;
+ dst[4] = 0xfe;
+ dst[5] = mac_addr[3];
+ dst[6] = mac_addr[4];
+ dst[7] = mac_addr[5];
+
+ return eui64;
+}
+
+/* callback when rxe gets released */
+static void release(struct rxe_dev *rxe)
+{
+ module_put(THIS_MODULE);
+}
+
+static __be64 node_guid(struct rxe_dev *rxe)
+{
+ return rxe_mac_to_eui64(rxe->ndev);
+}
+
+static __be64 port_guid(struct rxe_dev *rxe, unsigned int port_num)
+{
+ return rxe_mac_to_eui64(rxe->ndev);
+}
+
+static struct device *dma_device(struct rxe_dev *rxe)
+{
+ struct net_device *ndev;
+
+ ndev = rxe->ndev;
+
+ if (ndev->priv_flags & IFF_802_1Q_VLAN)
+ ndev = vlan_dev_real_dev(ndev);
+
+ return ndev->dev.parent;
+}
+
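+/*
+ * Multicast GIDs map to Ethernet multicast addresses exactly as IPv6
+ * addresses do (RFC 2464): ipv6_eth_mc_map() yields 33:33 followed by
+ * the low four bytes of the GID, which is then added to or removed
+ * from the netdev's multicast filter.
+ */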
+static int mcast_add(struct rxe_dev *rxe, union ib_gid *mgid)
+{
+ int err;
+ unsigned char ll_addr[ETH_ALEN];
+
+ ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
+ err = dev_mc_add(rxe->ndev, ll_addr);
+
+ return err;
+}
+
+static int mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid)
+{
+ int err;
+ unsigned char ll_addr[ETH_ALEN];
+
+ ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
+ err = dev_mc_del(rxe->ndev, ll_addr);
+
+ return err;
+}
+
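+/*
+ * Mirror the tx queue selection that dev_queue_xmit() would make and
+ * report whether that queue's qdisc has been deactivated (e.g. while
+ * the device is going down), so callers can back off instead of
+ * queueing to a dead qdisc.
+ */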
+static inline int queue_deactivated(struct sk_buff *skb)
+{
+ const struct net_device_ops *ops = skb->dev->netdev_ops;
+ u16 queue_index = 0;
+ struct netdev_queue *txq;
+
+ if (ops->ndo_select_queue)
+ queue_index = ops->ndo_select_queue(skb->dev, skb);
+ else if (skb->dev->real_num_tx_queues > 1)
+ queue_index = skb_tx_hash(skb->dev, skb);
+
+ txq = netdev_get_tx_queue(skb->dev, queue_index);
+	return test_bit(__QDISC_STATE_DEACTIVATED, &txq->qdisc->state);
+}
+
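+/*
+ * Outbound packets traverse an rxe-private netfilter hook
+ * (NFPROTO_RXE/NF_RXE_OUT, assumed to be defined in the rxe headers)
+ * so they can be observed or filtered before transmission. When
+ * xmit_shortcut is set, send_finish() calls the driver's
+ * ndo_start_xmit() directly, bypassing the qdisc layer and its
+ * locking; hence the parameter is marked EXPERIMENTAL.
+ */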
+static int send_finish(struct sk_buff *skb)
+{
+ if (!rxe_xmit_shortcut)
+ return dev_queue_xmit(skb);
+ else
+ return skb->dev->netdev_ops->ndo_start_xmit(skb, skb->dev);
+}
+
+static int send(struct rxe_dev *rxe, struct sk_buff *skb)
+{
+ if (queue_deactivated(skb))
+ return RXE_QUEUE_STOPPED;
+
+ if (netif_queue_stopped(skb->dev))
+ return RXE_QUEUE_STOPPED;
+
+ return NF_HOOK(NFPROTO_RXE, NF_RXE_OUT, skb, rxe->ndev, NULL,
+ send_finish);
+}
+
+static int loopback_finish(struct sk_buff *skb)
+{
+ struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+ struct packet_type *ptype = NULL;
+ struct net_device *orig_dev = pkt->rxe->ndev;
+
+ return rxe_net_rcv(skb, pkt->rxe->ndev, ptype, orig_dev);
+}
+
+static int loopback(struct rxe_dev *rxe, struct sk_buff *skb)
+{
+ return NF_HOOK(NFPROTO_RXE, NF_RXE_OUT, skb, rxe->ndev, NULL,
+ loopback_finish);
+}
+
+static inline int addr_same(struct rxe_dev *rxe, struct rxe_av *av)
+{
+	int port_num = 1;
+
+	return rxe->port[port_num - 1].guid_tbl[0] ==
+		av->attr.grh.dgid.global.interface_id;
+}
+
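+/*
+ * Build an skb for a single rxe packet: reserve room for the
+ * link-layer header, then append GRH plus payload and point pkt->hdr
+ * at the GRH. If the destination GID's interface id matches our own
+ * port GUID, the packet is additionally flagged for the loopback path.
+ */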
+static struct sk_buff *init_packet(struct rxe_dev *rxe, struct rxe_av *av,
+ int paylen)
+{
+ struct sk_buff *skb;
+ struct rxe_pkt_info *pkt;
+
+ skb = alloc_skb(paylen + RXE_GRH_BYTES + LL_RESERVED_SPACE(rxe->ndev),
+ GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, LL_RESERVED_SPACE(rxe->ndev));
+ skb_reset_network_header(skb);
+
+ skb->dev = rxe->ndev;
+ skb->protocol = htons(rxe_eth_proto_id);
+
+ pkt = SKB_TO_PKT(skb);
+ pkt->rxe = rxe;
+ pkt->port_num = 1;
+ pkt->hdr = skb_put(skb, RXE_GRH_BYTES + paylen);
+ pkt->mask = RXE_GRH_MASK;
+
+ dev_hard_header(skb, rxe->ndev, rxe_eth_proto_id,
+ av->ll_addr, rxe->ndev->dev_addr, skb->len);
+
+ if (addr_same(rxe, av))
+ pkt->mask |= RXE_LOOPBACK_MASK;
+
+ return skb;
+}
+
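+/*
+ * Fill in the link-layer address for an address vector from its GID:
+ * link-local unicast GIDs embed the destination MAC
+ * (rdma_get_ll_mac()), multicast GIDs map per RFC 2464
+ * (rdma_get_mcast_mac()), and routable GIDs are rejected because
+ * rxe_net does no L3 resolution.
+ */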
+static int init_av(struct rxe_dev *rxe, struct ib_ah_attr *attr,
+ struct rxe_av *av)
+{
+ struct in6_addr *in6 = (struct in6_addr *)attr->grh.dgid.raw;
+
+ /* grh required for rxe_net */
+ if ((attr->ah_flags & IB_AH_GRH) == 0) {
+ if (rxe_loopback_mad_grh_fix) {
+			/* temporary fix so that we can handle MADs to
+			 * self, which arrive without a GRH, by adding a
+			 * GRH that points back to ourselves */
+ attr->ah_flags |= IB_AH_GRH;
+ attr->grh.dgid.global.subnet_prefix
+ = rxe->port[0].subnet_prefix;
+ attr->grh.dgid.global.interface_id
+ = rxe->port[0].guid_tbl[0];
+ av->attr = *attr;
+ } else {
+ pr_info("rxe_net: attempting to init av without grh\n");
+ return -EINVAL;
+ }
+ }
+
+ if (rdma_link_local_addr(in6))
+ rdma_get_ll_mac(in6, av->ll_addr);
+ else if (rdma_is_multicast_addr(in6))
+ rdma_get_mcast_mac(in6, av->ll_addr);
+ else {
+ int i;
+ char addr[64];
+
+ for (i = 0; i < 16; i++)
+ sprintf(addr+2*i, "%02x", attr->grh.dgid.raw[i]);
+
+ pr_info("rxe_net: non local subnet address not supported %s\n",
+ addr);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * this is required by rxe_cfg to match rxe devices in
+ * /sys/class/infiniband up with their underlying ethernet devices
+ */
+static char *parent_name(struct rxe_dev *rxe, unsigned int port_num)
+{
+ return rxe->ndev->name;
+}
+
+static enum rdma_link_layer link_layer(struct rxe_dev *rxe,
+ unsigned int port_num)
+{
+ return IB_LINK_LAYER_ETHERNET;
+}
+
+static struct rxe_ifc_ops ifc_ops = {
+ .release = release,
+ .node_guid = node_guid,
+ .port_guid = port_guid,
+ .dma_device = dma_device,
+ .mcast_add = mcast_add,
+ .mcast_delete = mcast_delete,
+ .send = send,
+ .loopback = loopback,
+ .init_packet = init_packet,
+ .init_av = init_av,
+ .parent_name = parent_name,
+ .link_layer = link_layer,
+};
+
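+/*
+ * ifc_ops is the callback table through which the ib_rxe core drives
+ * this network glue; it is handed to the core as rxe->ifc_ops in
+ * rxe_net_add() below.
+ */
+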
+/* Caller must hold net_info_lock */
+void rxe_net_add(struct net_device *ndev)
+{
+ int err;
+ struct rxe_dev *rxe;
+	unsigned int port_num;
+
+ __module_get(THIS_MODULE);
+
+ rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe));
+ if (!rxe) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ /* for now we always assign port = 1 */
+ port_num = 1;
+
+ rxe->ifc_ops = &ifc_ops;
+
+ rxe->ndev = ndev;
+
+ err = rxe_add(rxe, ndev->mtu);
+ if (err)
+ goto err2;
+
+ pr_info("rxe_net: added %s to %s\n",
+ rxe->ib_dev.name, ndev->name);
+
+ net_info[ndev->ifindex].rxe = rxe;
+ net_info[ndev->ifindex].port = port_num;
+ net_info[ndev->ifindex].ndev = ndev;
+ return;
+
+err2:
+ ib_dealloc_device(&rxe->ib_dev);
+err1:
+ module_put(THIS_MODULE);
+}
+
+/* Caller must hold net_info_lock */
+void rxe_net_up(struct net_device *ndev)
+{
+ struct rxe_dev *rxe;
+ struct rxe_port *port;
+ u8 port_num;
+
+ if (ndev->ifindex >= RXE_MAX_IF_INDEX)
+ goto out;
+
+ net_info[ndev->ifindex].status = IB_PORT_ACTIVE;
+
+ rxe = net_to_rxe(ndev);
+ if (!rxe)
+ goto out;
+
+ port_num = net_to_port(ndev);
+ port = &rxe->port[port_num-1];
+ port->attr.state = IB_PORT_ACTIVE;
+ port->attr.phys_state = IB_PHYS_STATE_LINK_UP;
+
+ pr_info("rxe_net: set %s active for %s\n",
+ rxe->ib_dev.name, ndev->name);
+out:
+ return;
+}
+
+/* Caller must hold net_info_lock */
+void rxe_net_down(struct net_device *ndev)
+{
+ struct rxe_dev *rxe;
+ struct rxe_port *port;
+ u8 port_num;
+
+ if (ndev->ifindex >= RXE_MAX_IF_INDEX)
+ goto out;
+
+ net_info[ndev->ifindex].status = IB_PORT_DOWN;
+
+ rxe = net_to_rxe(ndev);
+ if (!rxe)
+ goto out;
+
+ port_num = net_to_port(ndev);
+ port = &rxe->port[port_num-1];
+ port->attr.state = IB_PORT_DOWN;
+	port->attr.phys_state = 3;	/* IB PortPhysicalState: Disabled */
+
+ pr_info("rxe_net: set %s down for %s\n",
+ rxe->ib_dev.name, ndev->name);
+out:
+ return;
+}
+
+static int can_support_rxe(struct net_device *ndev)
+{
+ int rc = 0;
+
+ if (ndev->ifindex >= RXE_MAX_IF_INDEX) {
+ pr_debug("%s index %d: too large for rxe ndev table\n",
+ ndev->name, ndev->ifindex);
+ goto out;
+ }
+
+	/* Let's say we support all ethX devices */
+ if (strncmp(ndev->name, "eth", 3) == 0)
+ rc = 1;
+
+out:
+ return rc;
+}
+
+static int rxe_notify(struct notifier_block *not_blk,
+ unsigned long event,
+ void *arg)
+{
+ struct net_device *ndev = arg;
+ struct rxe_dev *rxe;
+
+ if (!can_support_rxe(ndev))
+ goto out;
+
+ spin_lock_bh(&net_info_lock);
+ switch (event) {
+ case NETDEV_REGISTER:
+ /* Keep a record of this NIC. */
+ net_info[ndev->ifindex].status = IB_PORT_DOWN;
+ net_info[ndev->ifindex].rxe = NULL;
+ net_info[ndev->ifindex].port = 1;
+ net_info[ndev->ifindex].ndev = ndev;
+ break;
+
+ case NETDEV_UNREGISTER:
+ if (net_info[ndev->ifindex].rxe) {
+ rxe = net_info[ndev->ifindex].rxe;
+ net_info[ndev->ifindex].rxe = NULL;
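+			/* rxe_remove() can sleep (it tears down the IB
+			 * device), so drop the lock around the call; the
+			 * table entry has already been cleared */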
+ spin_unlock_bh(&net_info_lock);
+ rxe_remove(rxe);
+ spin_lock_bh(&net_info_lock);
+ }
+ net_info[ndev->ifindex].status = 0;
+ net_info[ndev->ifindex].port = 0;
+ net_info[ndev->ifindex].ndev = NULL;
+ break;
+
+ case NETDEV_UP:
+ rxe_net_up(ndev);
+ break;
+
+ case NETDEV_DOWN:
+ rxe_net_down(ndev);
+ break;
+
+ case NETDEV_CHANGEMTU:
+ rxe = net_to_rxe(ndev);
+ if (rxe) {
+ pr_info("rxe_net: %s changed mtu to %d\n",
+ ndev->name, ndev->mtu);
+ rxe_set_mtu(rxe, ndev->mtu, net_to_port(ndev));
+ }
+ break;
+
+ case NETDEV_REBOOT:
+ case NETDEV_CHANGE:
+ case NETDEV_GOING_DOWN:
+ case NETDEV_CHANGEADDR:
+ case NETDEV_CHANGENAME:
+ case NETDEV_FEAT_CHANGE:
+ default:
+ pr_info("rxe_net: ignoring netdev event = %ld for %s\n",
+ event, ndev->name);
+ break;
+ }
+ spin_unlock_bh(&net_info_lock);
+
+out:
+ return NOTIFY_OK;
+}
+
+static int rxe_net_rcv(struct sk_buff *skb,
+ struct net_device *ndev,
+ struct packet_type *ptype,
+ struct net_device *orig_dev)
+{
+ struct rxe_dev *rxe = net_to_rxe(ndev);
+	struct rxe_pkt_info *pkt;
+ int rc = 0;
+
+ if (!rxe)
+ goto drop;
+
+	/* TODO: we can receive packets in fragments. For now we
+	 * linearize, which is costly because it may copy a lot of
+	 * data. We should handle that case better. */
+ if (skb_linearize(skb))
+ goto drop;
+
+#if 0
+	/* error injector (compiled out): periodically drop a packet */
+ {
+ static int x = 8;
+ static int counter;
+ counter++;
+
+ if (counter == x) {
+ x = 13-x; /* 8 or 5 */
+ counter = 0;
+ pr_debug("dropping one packet\n");
+ goto drop;
+ }
+ }
+#endif
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (!skb) {
+		/* skb_share_check freed the original skb; rc stays 0 */
+		goto out;
+	}
+
+	/* set required fields in pkt; skb may have been replaced above,
+	 * so only now is it safe to take the pkt pointer from its cb */
+	pkt = SKB_TO_PKT(skb);
+	pkt->rxe = rxe;
+	pkt->port_num = net_to_port(ndev);
+	pkt->hdr = skb_network_header(skb);
+	pkt->mask = RXE_GRH_MASK;
+
+ rc = NF_HOOK(NFPROTO_RXE, NF_RXE_IN, skb, ndev, NULL, rxe_rcv);
+out:
+ return rc;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
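+/*
+ * .type is filled in at module init from the eth_proto_id parameter,
+ * before dev_add_pack() registers this handler.
+ */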
+static struct packet_type rxe_packet_type = {
+ .func = rxe_net_rcv,
+};
+
+static struct notifier_block rxe_net_notifier = {
+ .notifier_call = rxe_notify,
+};
+
+static int __init rxe_net_init(void)
+{
+ int err;
+
+ if (rxe_eth_proto_id != ETH_P_RXE)
+ pr_info("rxe_net: protoid set to 0x%x\n",
+ rxe_eth_proto_id);
+
+ rxe_packet_type.type = cpu_to_be16(rxe_eth_proto_id);
+ dev_add_pack(&rxe_packet_type);
+
+	err = register_netdevice_notifier(&rxe_net_notifier);
+	if (err) {
+		dev_remove_pack(&rxe_packet_type);
+		return err;
+	}
+
+	pr_info("rxe_net: loaded\n");
+
+	return 0;
+}
+
+static void __exit rxe_net_exit(void)
+{
+ unregister_netdevice_notifier(&rxe_net_notifier);
+ dev_remove_pack(&rxe_packet_type);
+
+ pr_info("rxe_net: unloaded\n");
+}
+
+module_init(rxe_net_init);
+module_exit(rxe_net_exit);
Implements a kernel module that provides the interface between ib_rxe
and the netdev stack.

Signed-off-by: Bob Pearson <rpearson@systemfabricworks.com>
---
 drivers/infiniband/hw/rxe/rxe_net.c | 580 ++++++++++++++++++++++++++++++++++++
 1 file changed, 580 insertions(+)