Message ID | 1431975616-23529-4-git-send-email-kaike.wan@intel.com (mailing list archive) |
---|---|
State | Superseded |
On 5/18/2015 10:00 PM, kaike.wan@intel.com wrote: Would be good to have consistent setup for patch titles, e.g start with capital letter as you did for patches 1 and 2 ("route SA ..." --> "Route SA..."). > --- a/drivers/infiniband/core/sa_query.c > +++ b/drivers/infiniband/core/sa_query.c > @@ -45,12 +45,22 @@ > #include <uapi/linux/if_ether.h> > #include <rdma/ib_pack.h> > #include <rdma/ib_cache.h> > +#include <rdma/rdma_netlink.h> > +#include <net/netlink.h> > +#include <rdma/rdma_netlink.h> > #include "sa.h" > > MODULE_AUTHOR("Roland Dreier"); > MODULE_DESCRIPTION("InfiniBand subnet administration query support"); > MODULE_LICENSE("Dual BSD/GPL"); > > +#define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100 > +#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000 > +static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT; > + > +module_param_named(local_svc_timeout_ms, sa_local_svc_timeout_ms, int, 0644); > +MODULE_PARM_DESC(local_svc_timeout_ms, "Local service timeout in millisecond"); what's actually the role of the module param? why it belongs here? is that really unavoidable to have it? > > +struct ib_nl_request_info { > + struct list_head list; > + u32 seq; > + unsigned long timeout; > + struct ib_sa_query *query; > +}; > + > +struct rdma_nl_resp_msg { > + struct nlmsghdr nl_hdr; > + struct ib_sa_mad sa_mad; > +}; You use ib_nl prefix for request and rdma_nl prefix for response... make it consistent. Also, request and response are too generic, maybe use naming which is a bit more informative on what you are doing here? > + > +static LIST_HEAD(ib_nl_request_list); > +static DEFINE_SPINLOCK(ib_nl_request_lock); > +static atomic_t ib_nl_sa_request_seq; > +static struct workqueue_struct *ib_nl_wq; > +static struct delayed_work ib_nl_timed_work; > + > static void ib_sa_add_one(struct ib_device *device); > static void ib_sa_remove_one(struct ib_device *device); > > @@ -381,6 +425,244 @@ static const struct ib_field guidinfo_rec_table[] = { > .size_bits = 512 }, > }; > > +static int ib_nl_send_mad(void *mad, int len, u32 seq) > +{ > + struct sk_buff *skb = NULL; > + struct nlmsghdr *nlh; > + void *data; > + int ret = 0; > + > + skb = nlmsg_new(len, GFP_KERNEL); > + if (!skb) { > + pr_err("alloc failed ret=%d\n", ret); > + return -ENOMEM; > + } > + > + data = ibnl_put_msg(skb, &nlh, seq, len, RDMA_NL_MAD, > + RDMA_NL_MAD_REQUEST, GFP_KERNEL); > + if (!data) { > + kfree_skb(skb); > + return -EMSGSIZE; > + } > + memcpy(data, mad, len); > + > + ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_MAD, GFP_KERNEL); > + if (!ret) { > + ret = len; > + } else { > + if (ret != -ESRCH) > + pr_err("ibnl_multicast failed l=%d, r=%d\n", len, ret); > + ret = 0; > + } > + return ret; > +} > + > +static struct ib_nl_request_info * > +ib_nl_alloc_request(struct ib_sa_query *query) > +{ > + struct ib_nl_request_info *rinfo; > + > + rinfo = kzalloc(sizeof(*rinfo), GFP_ATOMIC); > + if (rinfo == NULL) > + return ERR_PTR(-ENOMEM); > + > + INIT_LIST_HEAD(&rinfo->list); > + rinfo->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq); > + rinfo->query = query; > + > + return rinfo; > +} > + > +static int ib_nl_send_request(struct ib_nl_request_info *rinfo) > +{ > + struct ib_mad_send_buf *send_buf; > + unsigned long flags; > + unsigned long delay; > + int ret; > + > + send_buf = rinfo->query->mad_buf; > + > + delay = msecs_to_jiffies(sa_local_svc_timeout_ms); > + spin_lock_irqsave(&ib_nl_request_lock, flags); > + ret = ib_nl_send_mad(send_buf->mad, > + (send_buf->data_len + send_buf->hdr_len), > + rinfo->seq); > + > + if (ret != 
(send_buf->data_len + send_buf->hdr_len)) { > + kfree(rinfo); > + ret = -EIO; > + goto request_out; > + } else { > + ret = 0; > + } > + > + rinfo->timeout = delay + jiffies; > + list_add_tail(&rinfo->list, &ib_nl_request_list); > + /* Start the timeout if this is the only request */ > + if (ib_nl_request_list.next == &rinfo->list) > + queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay); > + > +request_out: > + spin_unlock_irqrestore(&ib_nl_request_lock, flags); > + > + return ret; > +} > + > +static int ib_nl_make_request(struct ib_sa_query *query) > +{ > + struct ib_nl_request_info *rinfo; > + > + rinfo = ib_nl_alloc_request(query); > + if (IS_ERR(rinfo)) > + return -ENOMEM; > + > + return ib_nl_send_request(rinfo); > +} > + > +static int ib_nl_cancel_request(struct ib_sa_query *query) > +{ > + unsigned long flags; > + struct ib_nl_request_info *rinfo; > + int found = 0; > + > + spin_lock_irqsave(&ib_nl_request_lock, flags); > + list_for_each_entry(rinfo, &ib_nl_request_list, list) { > + /* Let the timeout to take care of the callback */ > + if (query == rinfo->query) { > + IB_SA_CANCEL_QUERY(query); > + rinfo->timeout = jiffies; > + list_move(&rinfo->list, &ib_nl_request_list); > + found = 1; > + mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1); > + break; > + } > + } > + spin_unlock_irqrestore(&ib_nl_request_lock, flags); > + > + return found; > +} > + > + > +static int ib_nl_handle_mad_resp(struct sk_buff *skb, > + struct netlink_callback *cb); > +static struct ibnl_client_cbs ib_sa_cb_table[] = { > + [RDMA_NL_MAD_REQUEST] = { > + .dump = ib_nl_handle_mad_resp, > + .module = THIS_MODULE }, > +}; > + > +static void send_handler(struct ib_mad_agent *agent, > + struct ib_mad_send_wc *mad_send_wc); > + > +static void ib_nl_process_good_rsp(struct ib_sa_query *query, > + struct ib_sa_mad *rsp) > +{ > + struct ib_mad_send_wc mad_send_wc; > + > + if (query->callback) > + query->callback(query, 0, rsp); > + > + mad_send_wc.send_buf = query->mad_buf; > + mad_send_wc.status = IB_WC_SUCCESS; > + send_handler(query->mad_buf->mad_agent, &mad_send_wc); > +} > + > +static void ib_nl_request_timeout(struct work_struct *work) > +{ > + unsigned long flags; > + struct ib_nl_request_info *rinfo; > + struct ib_sa_query *query; > + unsigned long delay; > + struct ib_mad_send_wc mad_send_wc; > + int ret; > + > + spin_lock_irqsave(&ib_nl_request_lock, flags); > + while (!list_empty(&ib_nl_request_list)) { > + rinfo = list_entry(ib_nl_request_list.next, > + struct ib_nl_request_info, list); > + > + if (time_after(rinfo->timeout, jiffies)) { > + delay = rinfo->timeout - jiffies; > + if ((long)delay <= 0) > + delay = 1; > + queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay); > + break; > + } > + > + list_del(&rinfo->list); > + query = rinfo->query; > + IB_SA_DISABLE_LOCAL_SVC(query); > + /* Hold the lock to protect against query cancellation */ > + if (IB_SA_QUERY_CANCELLED(query)) > + ret = -1; > + else > + ret = ib_post_send_mad(query->mad_buf, NULL); > + if (ret) { > + mad_send_wc.send_buf = query->mad_buf; > + mad_send_wc.status = IB_WC_WR_FLUSH_ERR; > + spin_unlock_irqrestore(&ib_nl_request_lock, flags); > + send_handler(query->port->agent, &mad_send_wc); > + spin_lock_irqsave(&ib_nl_request_lock, flags); > + } > + kfree(rinfo); > + } > + spin_unlock_irqrestore(&ib_nl_request_lock, flags); > +} > + > +static int ib_nl_handle_mad_resp(struct sk_buff *skb, > + struct netlink_callback *cb) > +{ > + struct rdma_nl_resp_msg *nl_msg = (struct rdma_nl_resp_msg *)cb->nlh; > + unsigned long flags; > + struct 
ib_nl_request_info *rinfo; > + struct ib_sa_query *query; > + struct ib_mad_send_buf *send_buf; > + struct ib_mad_send_wc mad_send_wc; > + int found = 0; > + int ret; > + > + spin_lock_irqsave(&ib_nl_request_lock, flags); > + list_for_each_entry(rinfo, &ib_nl_request_list, list) { > + /* > + * If the query is cancelled, let the timeout routine > + * take care of it. > + */ > + if (nl_msg->nl_hdr.nlmsg_seq == rinfo->seq) { > + found = !IB_SA_QUERY_CANCELLED(rinfo->query); > + if (found) > + list_del(&rinfo->list); > + break; > + } > + } > + > + if (!found) { > + spin_unlock_irqrestore(&ib_nl_request_lock, flags); > + goto resp_out; > + } > + > + query = rinfo->query; > + send_buf = query->mad_buf; > + > + if (nl_msg->sa_mad.mad_hdr.status != 0) { > + /* if the result is a failure, send out the packet via IB */ > + IB_SA_DISABLE_LOCAL_SVC(query); > + ret = ib_post_send_mad(query->mad_buf, NULL); > + spin_unlock_irqrestore(&ib_nl_request_lock, flags); > + if (ret) { > + mad_send_wc.send_buf = send_buf; > + mad_send_wc.status = IB_WC_GENERAL_ERR; > + send_handler(query->port->agent, &mad_send_wc); > + } > + } else { > + spin_unlock_irqrestore(&ib_nl_request_lock, flags); > + ib_nl_process_good_rsp(query, &nl_msg->sa_mad); > + } > + > + kfree(rinfo); > +resp_out: > + return skb->len; > +} > + > static void free_sm_ah(struct kref *kref) > { > struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); > @@ -502,7 +784,13 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query) > mad_buf = query->mad_buf; > spin_unlock_irqrestore(&idr_lock, flags); > > - ib_cancel_mad(agent, mad_buf); > + /* > + * If the query is still on the netlink request list, schedule > + * it to be cancelled by the timeout routine. Otherwise, it has been > + * sent to the MAD layer and has to be cancelled from there. 
> + */ > + if (!ib_nl_cancel_request(query)) > + ib_cancel_mad(agent, mad_buf); > } > EXPORT_SYMBOL(ib_sa_cancel_query); > > @@ -638,6 +926,14 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) > query->mad_buf->context[0] = query; > query->id = id; > > + if (IB_SA_LOCAL_SVC_ENABLED(query)) { > + if (!ibnl_chk_listeners(RDMA_NL_GROUP_MAD)) { > + if (!ib_nl_make_request(query)) > + return id; > + } > + IB_SA_DISABLE_LOCAL_SVC(query); > + } > + > ret = ib_post_send_mad(query->mad_buf, NULL); > if (ret) { > spin_lock_irqsave(&idr_lock, flags); > @@ -739,7 +1035,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, > port = &sa_dev->port[port_num - sa_dev->start_port]; > agent = port->agent; > > - query = kmalloc(sizeof *query, gfp_mask); > + query = kzalloc(sizeof(*query), gfp_mask); > if (!query) > return -ENOMEM; > > @@ -766,6 +1062,8 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, > > *sa_query = &query->sa_query; > > + IB_SA_ENABLE_LOCAL_SVC(&query->sa_query); > + > ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); > if (ret < 0) > goto err2; > @@ -861,7 +1159,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client, > method != IB_SA_METHOD_DELETE) > return -EINVAL; > > - query = kmalloc(sizeof *query, gfp_mask); > + query = kzalloc(sizeof(*query), gfp_mask); > if (!query) > return -ENOMEM; > > @@ -953,7 +1251,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client, > port = &sa_dev->port[port_num - sa_dev->start_port]; > agent = port->agent; > > - query = kmalloc(sizeof *query, gfp_mask); > + query = kzalloc(sizeof(*query), gfp_mask); > if (!query) > return -ENOMEM; > > @@ -1050,7 +1348,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, > port = &sa_dev->port[port_num - sa_dev->start_port]; > agent = port->agent; > > - query = kmalloc(sizeof *query, gfp_mask); > + query = kzalloc(sizeof(*query), gfp_mask); > if (!query) > return -ENOMEM; Please move the three kmalloc --> kzalloc changes above (and more of their such if exist in the patch) to a pre-patch -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 5/18/2015 10:00 PM, kaike.wan@intel.com wrote: > diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c > index c38f030..69fcd28 100644 > --- a/drivers/infiniband/core/sa_query.c > +++ b/drivers/infiniband/core/sa_query.c > @@ -45,12 +45,22 @@ > #include <uapi/linux/if_ether.h> > #include <rdma/ib_pack.h> > #include <rdma/ib_cache.h> > +#include <rdma/rdma_netlink.h> > +#include <net/netlink.h> > +#include <rdma/rdma_netlink.h> Redundant include > #include "sa.h" -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > On 5/18/2015 10:00 PM, kaike.wan@intel.com wrote: > > Would be good to have consistent setup for patch titles, e.g start with capital > letter as you did for patches 1 and 2 ("route SA ..." --> "Route SA..."). Will be changed. Thank you. > > --- a/drivers/infiniband/core/sa_query.c > > +++ b/drivers/infiniband/core/sa_query.c > > @@ -45,12 +45,22 @@ > > #include <uapi/linux/if_ether.h> > > #include <rdma/ib_pack.h> > > #include <rdma/ib_cache.h> > > +#include <rdma/rdma_netlink.h> > > +#include <net/netlink.h> > > +#include <rdma/rdma_netlink.h> > > #include "sa.h" > > > > MODULE_AUTHOR("Roland Dreier"); > > MODULE_DESCRIPTION("InfiniBand subnet administration query > support"); > > MODULE_LICENSE("Dual BSD/GPL"); > > > > +#define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100 > > +#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000 > > +static int sa_local_svc_timeout_ms = > IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT; > > + > > +module_param_named(local_svc_timeout_ms, > sa_local_svc_timeout_ms, > > +int, 0644); MODULE_PARM_DESC(local_svc_timeout_ms, "Local service > > +timeout in millisecond"); > > what's actually the role of the module param? why it belongs here? is that > really unavoidable to have it? It's nice to provide the capability for the user to adjust the netlink timeout based his/her fabric setup. > > > > > > +struct ib_nl_request_info { > > + struct list_head list; > > + u32 seq; > > + unsigned long timeout; > > + struct ib_sa_query *query; > > +}; > > + > > +struct rdma_nl_resp_msg { > > + struct nlmsghdr nl_hdr; > > + struct ib_sa_mad sa_mad; > > +}; > > You use ib_nl prefix for request and rdma_nl prefix for response... make > it consistent. Will be changed. > > Also, request and response are too generic, maybe use naming which is a > bit more informative on what you > are doing here? That's why we have ib_nl prefix to indicate netlink request/response. Any better naming is welcome. 
> > > > + > > +static LIST_HEAD(ib_nl_request_list); > > +static DEFINE_SPINLOCK(ib_nl_request_lock); > > +static atomic_t ib_nl_sa_request_seq; > > +static struct workqueue_struct *ib_nl_wq; > > +static struct delayed_work ib_nl_timed_work; > > + > > static void ib_sa_add_one(struct ib_device *device); > > static void ib_sa_remove_one(struct ib_device *device); > > > > @@ -381,6 +425,244 @@ static const struct ib_field guidinfo_rec_table[] = { > > .size_bits = 512 }, > > }; > > > > +static int ib_nl_send_mad(void *mad, int len, u32 seq) > > +{ > > + struct sk_buff *skb = NULL; > > + struct nlmsghdr *nlh; > > + void *data; > > + int ret = 0; > > + > > + skb = nlmsg_new(len, GFP_KERNEL); > > + if (!skb) { > > + pr_err("alloc failed ret=%d\n", ret); > > + return -ENOMEM; > > + } > > + > > + data = ibnl_put_msg(skb, &nlh, seq, len, RDMA_NL_MAD, > > + RDMA_NL_MAD_REQUEST, GFP_KERNEL); > > + if (!data) { > > + kfree_skb(skb); > > + return -EMSGSIZE; > > + } > > + memcpy(data, mad, len); > > + > > + ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_MAD, > GFP_KERNEL); > > + if (!ret) { > > + ret = len; > > + } else { > > + if (ret != -ESRCH) > > + pr_err("ibnl_multicast failed l=%d, r=%d\n", len, ret); > > + ret = 0; > > + } > > + return ret; > > +} > > + > > +static struct ib_nl_request_info * > > +ib_nl_alloc_request(struct ib_sa_query *query) > > +{ > > + struct ib_nl_request_info *rinfo; > > + > > + rinfo = kzalloc(sizeof(*rinfo), GFP_ATOMIC); > > + if (rinfo == NULL) > > + return ERR_PTR(-ENOMEM); > > + > > + INIT_LIST_HEAD(&rinfo->list); > > + rinfo->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq); > > + rinfo->query = query; > > + > > + return rinfo; > > +} > > + > > +static int ib_nl_send_request(struct ib_nl_request_info *rinfo) > > +{ > > + struct ib_mad_send_buf *send_buf; > > + unsigned long flags; > > + unsigned long delay; > > + int ret; > > + > > + send_buf = rinfo->query->mad_buf; > > + > > + delay = msecs_to_jiffies(sa_local_svc_timeout_ms); > > + spin_lock_irqsave(&ib_nl_request_lock, flags); > > + ret = ib_nl_send_mad(send_buf->mad, > > + (send_buf->data_len + send_buf->hdr_len), > > + rinfo->seq); > > + > > + if (ret != (send_buf->data_len + send_buf->hdr_len)) { > > + kfree(rinfo); > > + ret = -EIO; > > + goto request_out; > > + } else { > > + ret = 0; > > + } > > + > > + rinfo->timeout = delay + jiffies; > > + list_add_tail(&rinfo->list, &ib_nl_request_list); > > + /* Start the timeout if this is the only request */ > > + if (ib_nl_request_list.next == &rinfo->list) > > + queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay); > > + > > +request_out: > > + spin_unlock_irqrestore(&ib_nl_request_lock, flags); > > + > > + return ret; > > +} > > + > > +static int ib_nl_make_request(struct ib_sa_query *query) > > +{ > > + struct ib_nl_request_info *rinfo; > > + > > + rinfo = ib_nl_alloc_request(query); > > + if (IS_ERR(rinfo)) > > + return -ENOMEM; > > + > > + return ib_nl_send_request(rinfo); > > +} > > + > > +static int ib_nl_cancel_request(struct ib_sa_query *query) > > +{ > > + unsigned long flags; > > + struct ib_nl_request_info *rinfo; > > + int found = 0; > > + > > + spin_lock_irqsave(&ib_nl_request_lock, flags); > > + list_for_each_entry(rinfo, &ib_nl_request_list, list) { > > + /* Let the timeout to take care of the callback */ > > + if (query == rinfo->query) { > > + IB_SA_CANCEL_QUERY(query); > > + rinfo->timeout = jiffies; > > + list_move(&rinfo->list, &ib_nl_request_list); > > + found = 1; > > + mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, > 1); > > + break; > > + } 
> > + } > > + spin_unlock_irqrestore(&ib_nl_request_lock, flags); > > + > > + return found; > > +} > > + > > + > > +static int ib_nl_handle_mad_resp(struct sk_buff *skb, > > + struct netlink_callback *cb); > > +static struct ibnl_client_cbs ib_sa_cb_table[] = { > > + [RDMA_NL_MAD_REQUEST] = { > > + .dump = ib_nl_handle_mad_resp, > > + .module = THIS_MODULE }, > > +}; > > + > > +static void send_handler(struct ib_mad_agent *agent, > > + struct ib_mad_send_wc *mad_send_wc); > > + > > +static void ib_nl_process_good_rsp(struct ib_sa_query *query, > > + struct ib_sa_mad *rsp) > > +{ > > + struct ib_mad_send_wc mad_send_wc; > > + > > + if (query->callback) > > + query->callback(query, 0, rsp); > > + > > + mad_send_wc.send_buf = query->mad_buf; > > + mad_send_wc.status = IB_WC_SUCCESS; > > + send_handler(query->mad_buf->mad_agent, &mad_send_wc); > > +} > > + > > +static void ib_nl_request_timeout(struct work_struct *work) > > +{ > > + unsigned long flags; > > + struct ib_nl_request_info *rinfo; > > + struct ib_sa_query *query; > > + unsigned long delay; > > + struct ib_mad_send_wc mad_send_wc; > > + int ret; > > + > > + spin_lock_irqsave(&ib_nl_request_lock, flags); > > + while (!list_empty(&ib_nl_request_list)) { > > + rinfo = list_entry(ib_nl_request_list.next, > > + struct ib_nl_request_info, list); > > + > > + if (time_after(rinfo->timeout, jiffies)) { > > + delay = rinfo->timeout - jiffies; > > + if ((long)delay <= 0) > > + delay = 1; > > + queue_delayed_work(ib_nl_wq, > &ib_nl_timed_work, delay); > > + break; > > + } > > + > > + list_del(&rinfo->list); > > + query = rinfo->query; > > + IB_SA_DISABLE_LOCAL_SVC(query); > > + /* Hold the lock to protect against query cancellation */ > > + if (IB_SA_QUERY_CANCELLED(query)) > > + ret = -1; > > + else > > + ret = ib_post_send_mad(query->mad_buf, NULL); > > + if (ret) { > > + mad_send_wc.send_buf = query->mad_buf; > > + mad_send_wc.status = IB_WC_WR_FLUSH_ERR; > > + spin_unlock_irqrestore(&ib_nl_request_lock, flags); > > + send_handler(query->port->agent, &mad_send_wc); > > + spin_lock_irqsave(&ib_nl_request_lock, flags); > > + } > > + kfree(rinfo); > > + } > > + spin_unlock_irqrestore(&ib_nl_request_lock, flags); > > +} > > + > > +static int ib_nl_handle_mad_resp(struct sk_buff *skb, > > + struct netlink_callback *cb) > > +{ > > + struct rdma_nl_resp_msg *nl_msg = (struct rdma_nl_resp_msg > *)cb->nlh; > > + unsigned long flags; > > + struct ib_nl_request_info *rinfo; > > + struct ib_sa_query *query; > > + struct ib_mad_send_buf *send_buf; > > + struct ib_mad_send_wc mad_send_wc; > > + int found = 0; > > + int ret; > > + > > + spin_lock_irqsave(&ib_nl_request_lock, flags); > > + list_for_each_entry(rinfo, &ib_nl_request_list, list) { > > + /* > > + * If the query is cancelled, let the timeout routine > > + * take care of it. 
> > + */ > > + if (nl_msg->nl_hdr.nlmsg_seq == rinfo->seq) { > > + found = !IB_SA_QUERY_CANCELLED(rinfo->query); > > + if (found) > > + list_del(&rinfo->list); > > + break; > > + } > > + } > > + > > + if (!found) { > > + spin_unlock_irqrestore(&ib_nl_request_lock, flags); > > + goto resp_out; > > + } > > + > > + query = rinfo->query; > > + send_buf = query->mad_buf; > > + > > + if (nl_msg->sa_mad.mad_hdr.status != 0) { > > + /* if the result is a failure, send out the packet via IB */ > > + IB_SA_DISABLE_LOCAL_SVC(query); > > + ret = ib_post_send_mad(query->mad_buf, NULL); > > + spin_unlock_irqrestore(&ib_nl_request_lock, flags); > > + if (ret) { > > + mad_send_wc.send_buf = send_buf; > > + mad_send_wc.status = IB_WC_GENERAL_ERR; > > + send_handler(query->port->agent, &mad_send_wc); > > + } > > + } else { > > + spin_unlock_irqrestore(&ib_nl_request_lock, flags); > > + ib_nl_process_good_rsp(query, &nl_msg->sa_mad); > > + } > > + > > + kfree(rinfo); > > +resp_out: > > + return skb->len; > > +} > > + > > static void free_sm_ah(struct kref *kref) > > { > > struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, > ref); > > @@ -502,7 +784,13 @@ void ib_sa_cancel_query(int id, struct ib_sa_query > *query) > > mad_buf = query->mad_buf; > > spin_unlock_irqrestore(&idr_lock, flags); > > > > - ib_cancel_mad(agent, mad_buf); > > + /* > > + * If the query is still on the netlink request list, schedule > > + * it to be cancelled by the timeout routine. Otherwise, it has been > > + * sent to the MAD layer and has to be cancelled from there. > > + */ > > + if (!ib_nl_cancel_request(query)) > > + ib_cancel_mad(agent, mad_buf); > > } > > EXPORT_SYMBOL(ib_sa_cancel_query); > > > > @@ -638,6 +926,14 @@ static int send_mad(struct ib_sa_query *query, int > timeout_ms, gfp_t gfp_mask) > > query->mad_buf->context[0] = query; > > query->id = id; > > > > + if (IB_SA_LOCAL_SVC_ENABLED(query)) { > > + if (!ibnl_chk_listeners(RDMA_NL_GROUP_MAD)) { > > + if (!ib_nl_make_request(query)) > > + return id; > > + } > > + IB_SA_DISABLE_LOCAL_SVC(query); > > + } > > + > > ret = ib_post_send_mad(query->mad_buf, NULL); > > if (ret) { > > spin_lock_irqsave(&idr_lock, flags); > > @@ -739,7 +1035,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, > > port = &sa_dev->port[port_num - sa_dev->start_port]; > > agent = port->agent; > > > > - query = kmalloc(sizeof *query, gfp_mask); > > + query = kzalloc(sizeof(*query), gfp_mask); > > if (!query) > > return -ENOMEM; > > > > @@ -766,6 +1062,8 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, > > > > *sa_query = &query->sa_query; > > > > + IB_SA_ENABLE_LOCAL_SVC(&query->sa_query); > > + > > ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); > > if (ret < 0) > > goto err2; > > @@ -861,7 +1159,7 @@ int ib_sa_service_rec_query(struct ib_sa_client > *client, > > method != IB_SA_METHOD_DELETE) > > return -EINVAL; > > > > - query = kmalloc(sizeof *query, gfp_mask); > > + query = kzalloc(sizeof(*query), gfp_mask); > > if (!query) > > return -ENOMEM; > > > > @@ -953,7 +1251,7 @@ int ib_sa_mcmember_rec_query(struct > ib_sa_client *client, > > port = &sa_dev->port[port_num - sa_dev->start_port]; > > agent = port->agent; > > > > - query = kmalloc(sizeof *query, gfp_mask); > > + query = kzalloc(sizeof(*query), gfp_mask); > > if (!query) > > return -ENOMEM; > > > > @@ -1050,7 +1348,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client > *client, > > port = &sa_dev->port[port_num - sa_dev->start_port]; > > agent = port->agent; > > > > - query = kmalloc(sizeof *query, 
gfp_mask); > > + query = kzalloc(sizeof(*query), gfp_mask); > > if (!query) > > return -ENOMEM; > > Please move the three kmalloc --> kzalloc changes above (and more of > their such if exist in the patch) to a pre-patch Will do. Thank you, Kaike > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> On 5/18/2015 10:00 PM, kaike.wan@intel.com wrote: > > diff --git a/drivers/infiniband/core/sa_query.c > b/drivers/infiniband/core/sa_query.c > > index c38f030..69fcd28 100644 > > --- a/drivers/infiniband/core/sa_query.c > > +++ b/drivers/infiniband/core/sa_query.c > > @@ -45,12 +45,22 @@ > > #include <uapi/linux/if_ether.h> > > #include <rdma/ib_pack.h> > > #include <rdma/ib_cache.h> > > +#include <rdma/rdma_netlink.h> > > +#include <net/netlink.h> > > +#include <rdma/rdma_netlink.h> > Redundant include Will be removed. Thank you, Kaike > > > #include "sa.h" -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 19, 2015 at 3:24 PM, Wan, Kaike <kaike.wan@intel.com> wrote: >> On 5/18/2015 10:00 PM, kaike.wan@intel.com wrote: >> > --- a/drivers/infiniband/core/sa_query.c >> > +++ b/drivers/infiniband/core/sa_query.c >> > @@ -45,12 +45,22 @@ >> what's actually the role of the module param? why it belongs here? is that >> really unavoidable to have it? > It's nice to provide the capability for the user to adjust the netlink timeout based his/her fabric setup. NO, adding module params should be really your last resort when everything else wouldn't work, definitely not the case here, remove it. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> 
> >> what's actually the role of the module param? why it belongs here? is
> >> that really unavoidable to have it?
> 
> > It's nice to provide the capability for the user to adjust the netlink timeout
> based his/her fabric setup.
> 
> NO, adding module params should be really your last resort when everything
> else wouldn't work, definitely not the case here, remove it.

Are you suggesting that we use the timeout the caller provides? Or if there is no timeout (0) provided, use a default one? Looking through the IB drivers, I do find many module parameters.

-- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 19, 2015 at 9:42 PM, Wan, Kaike <kaike.wan@intel.com> wrote: >> >> >> what's actually the role of the module param? why it belongs here? is >> >> that really unavoidable to have it? >> >> > It's nice to provide the capability for the user to adjust the netlink timeout >> based his/her fabric setup. >> >> NO, adding module params should be really your last resort when everything >> else wouldn't work, definitely not the case here, remove it. > > Are you suggesting that we use the timeout the caller provides? Or if there is no timeout (0) provided, use a default one? Not sure, pick one of these, whatever works for you the best. > Looking through the IB drivers, I do find many module parameters. One of the most basic rules in kernel programming (and suggested for life too) is that if you see something done by other drivers, this doesn't necessarily mean it's the right thing to do [1]. Module params are problematic, and sometimes cause way more damage than benefit; some subsystem kernel maintainers kind of disallow them totally (networking), so maybe do some reading and talking with kernel developers @ your firm to learn more. Or. [1] many times the nastiest bugs originate from cut-and-paste -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, May 18, 2015 at 03:00:16PM -0400, kaike.wan@intel.com wrote: > From: Kaike Wan <kaike.wan@intel.com> > > This patch routes a SA pathrecord query to netlink first and processes the > response appropriately. If a failure is returned, the request will be sent > through IB. The decision whether to route the request to netlink first is > determined by the presence of a listener for the MAD netlink multicast > group. If the user-space MAD netlink multicast group listener is not > present, the request will be sent through IB, just like what is currently > being done. I have mixed feelings about using the SA message format for this. In-kernel we don't use that format, and many of the newer APIs were designed around passing the GMP,Forward,Return path tuple, not just a naked GMP. I've argued for this in the past. I'd rather see the netlink data ask for what the kernel needs (eg UD path to X, RC path to X) and return the new format we've already established for AF_IB Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> 
> On Tue, May 19, 2015 at 9:42 PM, Wan, Kaike <kaike.wan@intel.com> wrote:
> >>
> >> >> what's actually the role of the module param? why it belongs here?
> >> >> is that really unavoidable to have it?
> >>
> >> > It's nice to provide the capability for the user to adjust the
> >> > netlink timeout
> >> based his/her fabric setup.
> >>
> >> NO, adding module params should be really your last resort when
> >> everything else wouldn't work, definitely not the case here, remove it.
> >
> > Are you suggesting that we use the timeout the caller provides? Or if there is
> no timeout (0) provided, use a default one?
> 
> Not sure, pick one of these, whatever works for you the best.

No this will not work. We don't want to use the timeout the caller provides. The timeout for netlink may (and should be) different from the timeout needed when sending to the SA directly.

> 
> > Looking through the IB drivers, I do find many module parameters.
> 
> One of the most basic rules in kernel programming (and suggested for life too)
> is that if you see something done by other drivers, this doesn't necessarily mean
> it's the right thing to do [1]

This may be true...

> 
> Module params are problematic, and sometimes cause way more damage than
> benefit; some subsystem kernel maintainers kind of disallow them totally
> (networking), so maybe do some reading and talking with kernel developers @
> your firm to learn more.

I have searched high and low for the problems with module parameters and have found no documentation which disallows their use. Do you have a reference which shows why they should be disallowed? While other mechanisms can be used, the use of this parameter is very simple and the addition of a module parameter seems appropriate in this case.

Ira

-- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 19, 2015 at 07:17:49PM +0000, Weiny, Ira wrote: > > Not sure, pick one of these, whatever works for you the best. > > No this will not work. We don't want to use the timeout the caller > provides. The timeout for netlink may (and should be) different > from the timeout needed when sending to the SA directly. Why? > N?????r??y????b?X???v?^?)?{.n?+????{????{ay????,j??f???h???z??w??????j:+v???w?j?m????????zZ+??????j"??! Does anyone else ever wonder what this garbage is on some of the messages from @intel folks? Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 19, 2015 at 01:21:41PM -0600, Jason Gunthorpe wrote: > On Tue, May 19, 2015 at 07:17:49PM +0000, Weiny, Ira wrote: > > > Not sure, pick one of these, whatever works for you the best. > > > > No this will not work. We don't want to use the timeout the caller > > provides. The timeout for netlink may (and should be) different > > from the timeout needed when sending to the SA directly. > > Why? The best use case is the desire to have the user space cache issue the query to the SA on behalf of this request, cache the data, and return the response. This means the Netlink timeout needs to be longer than the SA direct timeout. > > > N?????r??y????b?X???v?^?)?{.n?+????{????{ay????,j??f???h???z??w??????j:+v???w?j?m????????zZ+??????j"??! > > Does anyone else ever wonder what this garbage is on some of the > messages from @intel folks? Outlook issues... This msg should be better... Ira -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 19, 2015 at 03:26:15PM -0400, ira.weiny wrote: > The best use case is the desire to have the user space cache issue the query to > the SA on behalf of this request, cache the data, and return the response. > This means the Netlink timeout needs to be longer than the SA direct timeout. By how much? The SA timeout is already huge and has lots of slack, adding another timeout without an actual need seems premature.. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > N?????r??y????b?X???v?^?)?{.n?+????{????{ay????,j ??f???h???z??w??? > > ???j:+v???w?j?m???? ????zZ+??????j"??! > > Does anyone else ever wonder what this garbage is on some of the > messages from @intel folks? I believe it's some weird formatting of: > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html I have no idea why we get it. I send/receive in plain text.
> I have mixed feelings about using the SA message format for this. > > In-kernel we don't use that format, and many of the newer APIs were > designed around passing the GMP,Forward,Return path tuple, not just a > naked GMP. I think the choice of the netlink protocol is largely a matter of taste. In the internal review of these patches, the netlink hook was previously in the mad layer, which would have allowed redirecting any mad to user space. (The code only attempted to redirect SA queries.) Trying to hook from within ib_mad added a fair amount of complexity that moving up into sa_query avoided. The proposed interface would allow moving the changes back down, versus limiting this only to the SA query calls. (And I'm not meaning to imply that limiting this only to SA queries is a bad choice.) > I'd rather see the netlink data ask for what the kernel needs (eg UD > path to X, RC path to X) and return the new format we've already > established for AF_IB Does anyone else have input on the netlink protocol? Hal/Ira? MADs seem more flexible, but higher-level structures seem more efficient and natural. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 19, 2015 at 07:58:32PM +0000, Hefty, Sean wrote: > > Does anyone else ever wonder what this garbage is on some of the > > messages from @intel folks? > > I believe it's some weird formatting of: > > > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > I have no idea why we get it. I send/receive in plain text. Hum, I checked one of your messages that had this and it isn't plain text, it was actually sent base64 encoded: Message-ID: <1828884A29C6694DAF28B7E6B8A82373A8FCA2F1@ORSMSX109.amr.corp.intel.com> Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 So the garbage is what you get when you base 64 decode the standard list trailer. No idea why your side is generating base 64 encodings, but it is clearly a list bug that it blindly appends the trailer... Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 19, 2015 at 08:27:22PM +0000, Hefty, Sean wrote: > > I have mixed feelings about using the SA message format for this. > > > > In-kernel we don't use that format, and many of the newer APIs were > > designed around passing the GMP,Forward,Return path tuple, not just a > > naked GMP. > > I think the choice of the netlink protocol is largely a matter of > taste. In the internal review of these patches, the netlink hook > was previously in the mad layer, which would have allowed > redirecting any mad to user space. (The code only attempted to > redirect SA queries.) Trying to hook from within ib_mad added a > fair amount of complexity that moving up into sa_query avoided. > The proposed interface would allow moving the changes back down, > versus limiting this only to the SA query calls. (And I'm not > meaning to imply that limiting this only to SA queries is a bad > choice.) I'm not sure there is much use case for letting user space get involved in arbitrary MAD traffic. Over-generalizing feels like over-design and doesn't let us add valuable information that could help push policy decisions into user space, like passing the IP and TOS, QP type, etc. when available, to userspace. If there is a use case for that, it still cleanly layers: the cache NL query goes first, and if that fails then the MAD-based NL query goes next. > > I'd rather see the netlink data ask for what the kernel needs (eg UD > > path to X, RC path to X) and return the new format we've already > > established for AF_IB > > Does anyone else have input on the netlink protocol? Hal/Ira? MADs > seem more flexible, but higher-level structures seem more efficient and > natural. MADs are not more flexible; they are fixed and controlled by something outside the kernel. Their upside is we can track certain limited changes to the standard without too much effort. But the same reasons we didn't use them for AF_IB hold here too: - Routers are not supported - Asymmetric (non-reversible) mesh networks are not supported We shouldn't bake in the already known limitations of PathRecord when adding a new scheme. Userspace should provide the same kind of information as AF_IB, in a netlink container that could be extended in the future. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
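To make Jason's suggestion a bit more concrete, a resolve request built from netlink attributes rather than a raw SA MAD could look roughly like the sketch below (in the sa_query.c context). The LS_NLA_TYPE_* attribute names and the RDMA_NL_LS_OP_RESOLVE op code are hypothetical placeholders; only nlmsg_new()/nla_put()/nlmsg_end() and the ibnl_put_msg()/ibnl_multicast() helpers, as used by the posted patch, are existing interfaces.

/* Hypothetical attribute set for a "resolve path" netlink request.
 * None of these names exist in the posted patch; they only illustrate
 * how the query could carry structured fields instead of a raw SA MAD.
 */
enum {
        LS_NLA_TYPE_UNSPEC,
        LS_NLA_TYPE_SGID,       /* 16-byte source GID */
        LS_NLA_TYPE_DGID,       /* 16-byte destination GID */
        LS_NLA_TYPE_TCLASS,     /* u8 traffic class, if known */
        LS_NLA_TYPE_PKEY,       /* u16 partition key */
        LS_NLA_TYPE_MAX
};

static int ib_nl_send_resolve(struct ib_sa_path_rec *rec, u32 seq)
{
        struct sk_buff *skb;
        struct nlmsghdr *nlh;
        void *hdr;
        int len;

        /* Room for the per-field attributes instead of a full SA MAD. */
        len = nla_total_size(sizeof(rec->sgid)) +
              nla_total_size(sizeof(rec->dgid)) +
              nla_total_size(sizeof(u16)) + nla_total_size(sizeof(u8));

        skb = nlmsg_new(len, GFP_KERNEL);
        if (!skb)
                return -ENOMEM;

        /* RDMA_NL_LS_OP_RESOLVE is a placeholder op code for this sketch;
         * the header is put with zero payload and attributes are appended.
         */
        hdr = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_MAD,
                           RDMA_NL_LS_OP_RESOLVE, GFP_KERNEL);
        if (!hdr)
                goto err;

        if (nla_put(skb, LS_NLA_TYPE_SGID, sizeof(rec->sgid), &rec->sgid) ||
            nla_put(skb, LS_NLA_TYPE_DGID, sizeof(rec->dgid), &rec->dgid) ||
            nla_put_u16(skb, LS_NLA_TYPE_PKEY, be16_to_cpu(rec->pkey)) ||
            nla_put_u8(skb, LS_NLA_TYPE_TCLASS, rec->traffic_class))
                goto err;

        nlmsg_end(skb, nlh);
        return ibnl_multicast(skb, nlh, RDMA_NL_GROUP_MAD, GFP_KERNEL);

err:
        kfree_skb(skb);
        return -EMSGSIZE;
}

The response side could then carry whatever record format the daemon resolves (including the AF_IB-style path data Jason mentions) in its own attributes, instead of echoing back an SA MAD.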
> I'm not sure there is much use case for letting user space get > involved in arbitary MAD traffic.. I was thinking more about easily supporting other queries - ServiceRecords, MCMemberRecords, etc. The only use case I'm currently aware of is PathRecords, however. > Over generalizing feels like over design and doesn't let us add > valuable information that could help push policy decisions into user > space, like passing the IP and TOS, QP type, etc when available, to > userspace. I agree. This level of control would be better. The current security model of IB seems hopelessly broken. IMO, all path data should come from privileged users, without any ability of the app to modify it. Changing the protocol shouldn't be a big deal, though if you wanted to make my life easy, we could just adopt the ibacm sockets protocol directly. :) - Sean -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 19, 2015 at 02:27:22PM -0600, Hefty, Sean wrote: > > I have mixed feelings about using the SA message format for this. > > > > In-kernel we don't use that format, and many of the newer APIs were > > designed around passing the GMP,Forward,Return path tuple, not just a > > naked GMP. > > I think the choice of the netlink protocol is a largely matter of taste. In the internal review of these patches, the netlink hook was previously in the mad layer, which would have allowed redirecting any mad to user space. (The code only attempted to redirect SA queries.) Trying to hook from within ib_mad added a fair amount of complexity, that moving up into sa_query avoided. The proposed interface would allow moving the changes back down, versus limiting this only to the SA query calls. (And I'm not meaning to imply that limiting this only to SA queries is a bad choice.) > > > I'd rather see the netlink data ask for what the kernel needs (eg UD > > path to X, RC path to X) and return the new format we've already > > established for AF_IB > > Does anyone else have input on the netlink protocol? Hal/Ira? MADs seem more flexible, but higher-level structures more efficient and natural. The idea was to implement a local SA cache. Nothing more. A lot of what is being discussed is beyond the intent of the patches. Ira -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 19, 2015 at 03:20:09PM -0600, Hefty, Sean wrote: > > I'm not sure there is much use case for letting user space get > > involved in arbitary MAD traffic.. > > I was thinking more about easily supporting other queries - ServiceRecords, MCMemberRecords, etc. The only use case I'm currently aware of is PathRecords, however. > I think MCMemberRecords (including joins) are good future candidates for this interface. The SSA work which Hal is doing had some plans for doing this. > > Over generalizing feels like over design and doesn't let us add > > valuable information that could help push policy decisions into user > > space, like passing the IP and TOS, QP type, etc when available, to > > userspace. > > I agree. This level of control would be better. The current security model of IB seems hopelessly broken. IMO, all path data should come from privileged users, without any ability of the app to modify it. > I agree with your statement but I'm not sure how Jasons proposal helps to control the use of PR data. As long as verbs supports the individual setting of PR data such as SL an application can change this data. > Changing the protocol shouldn't be a big deal, though if you wanted to make my life easy, we could just adopt the ibacm sockets protocol directly. :) > ib_sa_path_rec_get is pretty specific in what it does. Without over designing, this series adds access to the existing SA data cache (ibacm) for those users who call this function. I think that extending this to return an array of paths should be tackled in future patches. This future work could use a different protocol but I don't think it would be part of ib_sa_path_rec_get. Ira -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > On Tue, May 19, 2015 at 02:27:22PM -0600, Hefty, Sean wrote: > > > I have mixed feelings about using the SA message format for this. > > > > > > In-kernel we don't use that format, and many of the newer APIs were > > > designed around passing the GMP,Forward,Return path tuple, not just > > > a naked GMP. > > > > I think the choice of the netlink protocol is a largely matter of > > taste. In the internal review of these patches, the netlink hook was > > previously in the mad layer, which would have allowed redirecting any > > mad to user space. (The code only attempted to redirect SA queries.) > > Trying to hook from within ib_mad added a fair amount of complexity, > > that moving up into sa_query avoided. The proposed interface would > > allow moving the changes back down, versus limiting this only to the > > SA query calls. (And I'm not meaning to imply that limiting this only > > to SA queries is a bad choice.) > > > > > I'd rather see the netlink data ask for what the kernel needs (eg UD > > > path to X, RC path to X) and return the new format we've already > > > established for AF_IB > > > > Does anyone else have input on the netlink protocol? Hal/Ira? MADs seem > more flexible, but higher-level structures more efficient and natural. > > > The idea was to implement a local SA cache. Nothing more. A lot of what is > being discussed is beyond the intent of the patches. > > Ira The higher-level scheme discussed so far can be implemented using similar mechanism (netlink), but with a different netlink multicast group, and therefore different data exchange format. Maybe it should be done in another patch series. Kaike -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 19, 2015 at 07:53:04PM -0400, ira.weiny wrote: > I think that extending this to return an array of paths should be tackled in > future patches. This future work could use a different protocol but I don't > think it would be part of ib_sa_path_rec_get. No, this is a UAPI, we have to live with it forever, it needs to be well designed, not just rammed into whatever already exists. I already have an experimental patch series that extends CM to support async scenarios, so the basic assumption this UAPI is premised on is already wrong. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 19, 2015 at 01:28:42PM -0600, Jason Gunthorpe wrote: > On Tue, May 19, 2015 at 03:26:15PM -0400, ira.weiny wrote: > > > The best use case is the desire to have the user space cache issue the query to > > the SA on behalf of this request, cache the data, and return the response. > > > This means the Netlink timeout needs to be longer than the SA direct timeout. > > By how much?

It depends on the fabric size, SA load, userspace implementation, etc., the last of which is potentially the most important. In general I would say we could take SubnetTimeOut + 1 sec as a good starting point if the userspace implementation were to issue individual PR queries. If it attempted some sort of bulk update of its cache in anticipation of additional queries it could be more. IBSSA, for example, may detect an epoch change and download a significant number of PRs when this query occurs. How long this takes is a function of that cache and not anything kernel space can or should know.

> > The SA timeout is already huge and has lots of slack, adding another > timeout without an actual need seems premature.. >

Define huge? If I'm doing my math right the max subnet timeout is > 8,000 seconds. None of the kernel users actually use the subnet timeout and most are nowhere near that max; only 9P seems "huge".

iser: ret = rdma_resolve_route(cma_id, 1000); <=== 1 second
9P: #define P9_RDMA_TIMEOUT 30000 /* 30 seconds */
rds: #define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000 <=== 5 seconds
NFSoRDMA: #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */
IPoIB: ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, &path->pathrec, IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_TRAFFIC_CLASS | IB_SA_PATH_REC_PKEY, 1 second ==================> 1000, GFP_ATOMIC, path_rec_completion, path, &path->query);
SRP: SRP_PATH_REC_TIMEOUT_MS = 1000, <=== 1 second

The other issue is that each caller in the kernel specifies a different timeout. Defining this in 1 central place and allowing user space to control the policy of that timeout is much better than allowing the kernel clients to specify the timeout as they do now. Ira -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
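For reference on the "8,000 seconds" figure: assuming the usual PortInfo:SubnetTimeOut encoding, where the timeout is 4.096 us * 2^SubnetTimeOut and the field is 5 bits wide, the maximum encoded value of 31 works out to:

    4.096 us * 2^31 = 4.096e-6 s * 2147483648 ~= 8796 s  (roughly 2.4 hours)

while a more typical encoded value such as 18 gives 4.096e-6 s * 2^18 ~= 1.07 s, which is in line with the 1-5 second constants listed above.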
> The other issue is that each caller in the kernel specifies a different > timeout. Defining this in 1 central place and allowing user space to > control > the policy of that timeout is much better than allowing the kernel clients > to > specify the timeout as they do now. Everything has been randomly hard-coded. IMO, the sa_query module should use its own timeout value, which it updates based on how fast it actually gets responses. But that takes too much work, and no one is ever going to write the code to do this. For the netlink-specific problem, I'll propose using a different randomly hard-coded value as a timeout. Then define an 'MRA' type of message that user space can send to the kernel in order to ask it to wait longer. The 'set timeout' message could apply to a single request or all future requests. If we only wanted the 'all future requests' option, the data value could be written into a file. In any case, this pushes the policy of the timeout value into the hands of the responding daemon. - Sean -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
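A minimal sketch of what the kernel side of such a 'set timeout' message could look like, reusing the request list, lock, and delayed work from the posted patch. The RDMA_NL_MAD_SET_TIMEOUT op code, the message layout, and the 120-second upper clamp are hypothetical illustrations only:

/* Hypothetical op and payload: the daemon asks the kernel to use a new
 * timeout (in ms) for all outstanding and future netlink SA requests.
 */
#define RDMA_NL_MAD_SET_TIMEOUT 2       /* placeholder op code */

struct ib_nl_timeout_msg {
        struct nlmsghdr nl_hdr;
        u32 timeout_ms;
};

static int ib_nl_handle_set_timeout(struct sk_buff *skb,
                                    struct netlink_callback *cb)
{
        struct ib_nl_timeout_msg *msg = (struct ib_nl_timeout_msg *)cb->nlh;
        struct ib_nl_request_info *rinfo;
        unsigned long flags;
        long delta;
        /* Clamp to something sane; the bounds are illustrative only. */
        unsigned int timeout = clamp_t(unsigned int, msg->timeout_ms,
                                       IB_SA_LOCAL_SVC_TIMEOUT_MIN, 120000);

        spin_lock_irqsave(&ib_nl_request_lock, flags);
        delta = msecs_to_jiffies(timeout) -
                msecs_to_jiffies(sa_local_svc_timeout_ms);
        sa_local_svc_timeout_ms = timeout;

        /* Shift every pending request and re-arm the timeout work so the
         * head of the list still expires first.
         */
        list_for_each_entry(rinfo, &ib_nl_request_list, list)
                rinfo->timeout += delta;

        if (!list_empty(&ib_nl_request_list)) {
                rinfo = list_entry(ib_nl_request_list.next,
                                   struct ib_nl_request_info, list);
                delta = (long)rinfo->timeout - (long)jiffies;
                if (delta <= 0)
                        delta = 1;
                mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, delta);
        }
        spin_unlock_irqrestore(&ib_nl_request_lock, flags);

        return skb->len;
}

A per-request variant would look the same except that it would match on cb->nlh->nlmsg_seq and only adjust that one entry's deadline.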
On Wed, May 20, 2015 at 10:13:59AM -0600, Hefty, Sean wrote: > > The other issue is that each caller in the kernel specifies a different > > timeout. Defining this in 1 central place and allowing user space to > > control > > the policy of that timeout is much better than allowing the kernel clients > > to > > specify the timeout as they do now. > > Everything has been randomly hard-coded. IMO, the sa_query module should use > its own timeout value, which it updates based on how fast it actually gets > responses. But that takes too much work, and no one is ever going to write > the code to do this. I agree. So lets not do this again. > > For the netlink specific problem, I'll propose using a different randomly > hard-coded value as a timeout. Isn't this what we have as the start default to the module parameter? > Then define an 'MRA' type of message that > user space can send to the kernel in order to ask it to wait longer. The > 'set timeout' message could apply to a single request or all future requests. > If we only wanted to the 'all future requests' option, the data value could > be written into a file. This is effectively what we have with the module parameter. > In any case, this pushes the policy of the timeout > value into the hands of the responding daemon. > I'm flexible on the mechanism used. In addition to the module parameter, we discussed sysctl as well as an additional control protocol for configuring the communication. We avoided these to keep the mechanism simple. But since we are going down the path of designing a new protocol I think it is reasonable to have some "control" packets there. Ira -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
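For what it's worth, the sysctl variant mentioned here is also small. A sketch of exposing the patch's sa_local_svc_timeout_ms through sysctl might look like the following; the "net/rdma_sa" path, the 120-second upper bound, and the function names are made up for illustration, while proc_dointvec_minmax and register_sysctl()/unregister_sysctl_table() are the standard kernel interfaces.

#include <linux/sysctl.h>

/* Lower bound reused from the patch; upper bound and path are hypothetical. */
static int sa_timeout_min = IB_SA_LOCAL_SVC_TIMEOUT_MIN;
static int sa_timeout_max = 120000;

static struct ctl_table_header *ib_sa_sysctl_hdr;

static struct ctl_table ib_sa_sysctl_table[] = {
        {
                .procname       = "local_svc_timeout_ms",
                .data           = &sa_local_svc_timeout_ms,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &sa_timeout_min,
                .extra2         = &sa_timeout_max,
        },
        { }
};

static int ib_sa_register_sysctl(void)
{
        /* "net/rdma_sa" is an illustrative location only. */
        ib_sa_sysctl_hdr = register_sysctl("net/rdma_sa", ib_sa_sysctl_table);
        return ib_sa_sysctl_hdr ? 0 : -ENOMEM;
}

static void ib_sa_unregister_sysctl(void)
{
        unregister_sysctl_table(ib_sa_sysctl_hdr);
}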
On Wed, May 20, 2015 at 04:13:59PM +0000, Hefty, Sean wrote: > > The other issue is that each caller in the kernel specifies a different > > timeout. Defining this in 1 central place and allowing user space to > > control > > the policy of that timeout is much better than allowing the kernel clients > > to > > specify the timeout as they do now. > > Everything has been randomly hard-coded. IMO, the sa_query module > should use its own timeout value, which it updates based on how fast > it actually gets responses. But that takes too much work, and no > one is ever going to write the code to do this. The IB spec actually says how to compute the timeout for the SA, and if done properly the SM will configure a timeout appropriate for the network. It looks like everything the kernel does in this area is basically wrong. > For the netlink-specific problem, I'll propose using a different > randomly hard-coded value as a timeout. Then define an 'MRA' type > of message that user space can send to the kernel in order to ask it > to wait longer. The 'set timeout' message could apply to a single > request or all future requests. If we only wanted the 'all > future requests' option, the data value could be written into a > file. In any case, this pushes the policy of the timeout value into > the hands of the responding daemon. A fixed, well-known timeout (5s?) plus requiring that user space send an 'operation in progress' positive liveness indication seems very reasonable. The only purpose of the timeout is to detect a locked-up daemon so IB doesn't lock up, so a watchdog-like scheme seems appropriate. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index c38f030..69fcd28 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -45,12 +45,22 @@
 #include <uapi/linux/if_ether.h>
 #include <rdma/ib_pack.h>
 #include <rdma/ib_cache.h>
+#include <rdma/rdma_netlink.h>
+#include <net/netlink.h>
+#include <rdma/rdma_netlink.h>
 #include "sa.h"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand subnet administration query support");
 MODULE_LICENSE("Dual BSD/GPL");
 
+#define IB_SA_LOCAL_SVC_TIMEOUT_MIN		100
+#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT		2000
+static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT;
+
+module_param_named(local_svc_timeout_ms, sa_local_svc_timeout_ms, int, 0644);
+MODULE_PARM_DESC(local_svc_timeout_ms, "Local service timeout in millisecond");
+
 struct ib_sa_sm_ah {
 	struct ib_ah        *ah;
 	struct kref          ref;
@@ -80,8 +90,24 @@ struct ib_sa_query {
 	struct ib_mad_send_buf *mad_buf;
 	struct ib_sa_sm_ah     *sm_ah;
 	int			id;
+	u32			flags;
 };
 
+#define IB_SA_ENABLE_LOCAL_SERVICE	0x00000001
+#define IB_SA_CANCEL			0x00000002
+
+#define IB_SA_LOCAL_SVC_ENABLED(query) \
+	((query)->flags & IB_SA_ENABLE_LOCAL_SERVICE)
+#define IB_SA_ENABLE_LOCAL_SVC(query) \
+	((query)->flags |= IB_SA_ENABLE_LOCAL_SERVICE)
+#define IB_SA_DISABLE_LOCAL_SVC(query) \
+	((query)->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE)
+
+#define IB_SA_QUERY_CANCELLED(query) \
+	((query)->flags & IB_SA_CANCEL)
+#define IB_SA_CANCEL_QUERY(query) \
+	((query)->flags |= IB_SA_CANCEL)
+
 struct ib_sa_service_query {
 	void (*callback)(int, struct ib_sa_service_rec *, void *);
 	void *context;
@@ -106,6 +132,24 @@ struct ib_sa_mcmember_query {
 	struct ib_sa_query sa_query;
 };
 
+struct ib_nl_request_info {
+	struct list_head list;
+	u32 seq;
+	unsigned long timeout;
+	struct ib_sa_query *query;
+};
+
+struct rdma_nl_resp_msg {
+	struct nlmsghdr nl_hdr;
+	struct ib_sa_mad sa_mad;
+};
+
+static LIST_HEAD(ib_nl_request_list);
+static DEFINE_SPINLOCK(ib_nl_request_lock);
+static atomic_t ib_nl_sa_request_seq;
+static struct workqueue_struct *ib_nl_wq;
+static struct delayed_work ib_nl_timed_work;
+
 static void ib_sa_add_one(struct ib_device *device);
 static void ib_sa_remove_one(struct ib_device *device);
 
@@ -381,6 +425,244 @@ static const struct ib_field guidinfo_rec_table[] = {
 	  .size_bits    = 512 },
 };
 
+static int ib_nl_send_mad(void *mad, int len, u32 seq)
+{
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	void *data;
+	int ret = 0;
+
+	skb = nlmsg_new(len, GFP_KERNEL);
+	if (!skb) {
+		pr_err("alloc failed ret=%d\n", ret);
+		return -ENOMEM;
+	}
+
+	data = ibnl_put_msg(skb, &nlh, seq, len, RDMA_NL_MAD,
+			    RDMA_NL_MAD_REQUEST, GFP_KERNEL);
+	if (!data) {
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+	memcpy(data, mad, len);
+
+	ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_MAD, GFP_KERNEL);
+	if (!ret) {
+		ret = len;
+	} else {
+		if (ret != -ESRCH)
+			pr_err("ibnl_multicast failed l=%d, r=%d\n", len, ret);
+		ret = 0;
+	}
+	return ret;
+}
+
+static struct ib_nl_request_info *
+ib_nl_alloc_request(struct ib_sa_query *query)
+{
+	struct ib_nl_request_info *rinfo;
+
+	rinfo = kzalloc(sizeof(*rinfo), GFP_ATOMIC);
+	if (rinfo == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&rinfo->list);
+	rinfo->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq);
+	rinfo->query = query;
+
+	return rinfo;
+}
+
+static int ib_nl_send_request(struct ib_nl_request_info *rinfo)
+{
+	struct ib_mad_send_buf *send_buf;
+	unsigned long flags;
+	unsigned long delay;
+	int ret;
+
+	send_buf = rinfo->query->mad_buf;
+
+	delay = msecs_to_jiffies(sa_local_svc_timeout_ms);
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	ret = ib_nl_send_mad(send_buf->mad,
+			     (send_buf->data_len + send_buf->hdr_len),
+			     rinfo->seq);
+
+	if (ret != (send_buf->data_len + send_buf->hdr_len)) {
+		kfree(rinfo);
+		ret = -EIO;
+		goto request_out;
+	} else {
+		ret = 0;
+	}
+
+	rinfo->timeout = delay + jiffies;
+	list_add_tail(&rinfo->list, &ib_nl_request_list);
+	/* Start the timeout if this is the only request */
+	if (ib_nl_request_list.next == &rinfo->list)
+		queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+
+request_out:
+	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+	return ret;
+}
+
+static int ib_nl_make_request(struct ib_sa_query *query)
+{
+	struct ib_nl_request_info *rinfo;
+
+	rinfo = ib_nl_alloc_request(query);
+	if (IS_ERR(rinfo))
+		return -ENOMEM;
+
+	return ib_nl_send_request(rinfo);
+}
+
+static int ib_nl_cancel_request(struct ib_sa_query *query)
+{
+	unsigned long flags;
+	struct ib_nl_request_info *rinfo;
+	int found = 0;
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	list_for_each_entry(rinfo, &ib_nl_request_list, list) {
+		/* Let the timeout to take care of the callback */
+		if (query == rinfo->query) {
+			IB_SA_CANCEL_QUERY(query);
+			rinfo->timeout = jiffies;
+			list_move(&rinfo->list, &ib_nl_request_list);
+			found = 1;
+			mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1);
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+	return found;
+}
+
+
+static int ib_nl_handle_mad_resp(struct sk_buff *skb,
+				 struct netlink_callback *cb);
+static struct ibnl_client_cbs ib_sa_cb_table[] = {
+	[RDMA_NL_MAD_REQUEST] = {
+		.dump = ib_nl_handle_mad_resp,
+		.module = THIS_MODULE },
+};
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc);
+
+static void ib_nl_process_good_rsp(struct ib_sa_query *query,
+				   struct ib_sa_mad *rsp)
+{
+	struct ib_mad_send_wc mad_send_wc;
+
+	if (query->callback)
+		query->callback(query, 0, rsp);
+
+	mad_send_wc.send_buf = query->mad_buf;
+	mad_send_wc.status = IB_WC_SUCCESS;
+	send_handler(query->mad_buf->mad_agent, &mad_send_wc);
+}
+
+static void ib_nl_request_timeout(struct work_struct *work)
+{
+	unsigned long flags;
+	struct ib_nl_request_info *rinfo;
+	struct ib_sa_query *query;
+	unsigned long delay;
+	struct ib_mad_send_wc mad_send_wc;
+	int ret;
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	while (!list_empty(&ib_nl_request_list)) {
+		rinfo = list_entry(ib_nl_request_list.next,
+				   struct ib_nl_request_info, list);
+
+		if (time_after(rinfo->timeout, jiffies)) {
+			delay = rinfo->timeout - jiffies;
+			if ((long)delay <= 0)
+				delay = 1;
+			queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+			break;
+		}
+
+		list_del(&rinfo->list);
+		query = rinfo->query;
+		IB_SA_DISABLE_LOCAL_SVC(query);
+		/* Hold the lock to protect against query cancellation */
+		if (IB_SA_QUERY_CANCELLED(query))
+			ret = -1;
+		else
+			ret = ib_post_send_mad(query->mad_buf, NULL);
+		if (ret) {
+			mad_send_wc.send_buf = query->mad_buf;
+			mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+			spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+			send_handler(query->port->agent, &mad_send_wc);
+			spin_lock_irqsave(&ib_nl_request_lock, flags);
+		}
+		kfree(rinfo);
+	}
+	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+}
+
+static int ib_nl_handle_mad_resp(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	struct rdma_nl_resp_msg *nl_msg = (struct rdma_nl_resp_msg *)cb->nlh;
+	unsigned long flags;
+	struct ib_nl_request_info *rinfo;
+	struct ib_sa_query *query;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_mad_send_wc mad_send_wc;
+	int found = 0;
+	int ret;
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	list_for_each_entry(rinfo, &ib_nl_request_list, list) {
+		/*
+		 * If the query is cancelled, let the timeout routine
+		 * take care of it.
+		 */
+		if (nl_msg->nl_hdr.nlmsg_seq == rinfo->seq) {
+			found = !IB_SA_QUERY_CANCELLED(rinfo->query);
+			if (found)
+				list_del(&rinfo->list);
+			break;
+		}
+	}
+
+	if (!found) {
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+		goto resp_out;
+	}
+
+	query = rinfo->query;
+	send_buf = query->mad_buf;
+
+	if (nl_msg->sa_mad.mad_hdr.status != 0) {
+		/* if the result is a failure, send out the packet via IB */
+		IB_SA_DISABLE_LOCAL_SVC(query);
+		ret = ib_post_send_mad(query->mad_buf, NULL);
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+		if (ret) {
+			mad_send_wc.send_buf = send_buf;
+			mad_send_wc.status = IB_WC_GENERAL_ERR;
+			send_handler(query->port->agent, &mad_send_wc);
+		}
+	} else {
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+		ib_nl_process_good_rsp(query, &nl_msg->sa_mad);
+	}
+
+	kfree(rinfo);
+resp_out:
+	return skb->len;
+}
+
 static void free_sm_ah(struct kref *kref)
 {
 	struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
@@ -502,7 +784,13 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
 	mad_buf = query->mad_buf;
 	spin_unlock_irqrestore(&idr_lock, flags);
 
-	ib_cancel_mad(agent, mad_buf);
+	/*
+	 * If the query is still on the netlink request list, schedule
+	 * it to be cancelled by the timeout routine. Otherwise, it has been
+	 * sent to the MAD layer and has to be cancelled from there.
+	 */
+	if (!ib_nl_cancel_request(query))
+		ib_cancel_mad(agent, mad_buf);
 }
 EXPORT_SYMBOL(ib_sa_cancel_query);
 
@@ -638,6 +926,14 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
 	query->mad_buf->context[0] = query;
 	query->id = id;
 
+	if (IB_SA_LOCAL_SVC_ENABLED(query)) {
+		if (!ibnl_chk_listeners(RDMA_NL_GROUP_MAD)) {
+			if (!ib_nl_make_request(query))
+				return id;
+		}
+		IB_SA_DISABLE_LOCAL_SVC(query);
+	}
+
 	ret = ib_post_send_mad(query->mad_buf, NULL);
 	if (ret) {
 		spin_lock_irqsave(&idr_lock, flags);
@@ -739,7 +1035,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
 	port  = &sa_dev->port[port_num - sa_dev->start_port];
 	agent = port->agent;
 
-	query = kmalloc(sizeof *query, gfp_mask);
+	query = kzalloc(sizeof(*query), gfp_mask);
 	if (!query)
 		return -ENOMEM;
 
@@ -766,6 +1062,8 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
 
 	*sa_query = &query->sa_query;
 
+	IB_SA_ENABLE_LOCAL_SVC(&query->sa_query);
+
 	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
 	if (ret < 0)
 		goto err2;
@@ -861,7 +1159,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client,
 	    method != IB_SA_METHOD_DELETE)
 		return -EINVAL;
 
-	query = kmalloc(sizeof *query, gfp_mask);
+	query = kzalloc(sizeof(*query), gfp_mask);
 	if (!query)
 		return -ENOMEM;
 
@@ -953,7 +1251,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
 	port  = &sa_dev->port[port_num - sa_dev->start_port];
 	agent = port->agent;
 
-	query = kmalloc(sizeof *query, gfp_mask);
+	query = kzalloc(sizeof(*query), gfp_mask);
 	if (!query)
 		return -ENOMEM;
 
@@ -1050,7 +1348,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
 	port  = &sa_dev->port[port_num - sa_dev->start_port];
 	agent = port->agent;
 
-	query = kmalloc(sizeof *query, gfp_mask);
+	query = kzalloc(sizeof(*query), gfp_mask);
 	if (!query)
 		return -ENOMEM;
 
@@ -1250,6 +1548,10 @@ static int __init ib_sa_init(void)
 
 	get_random_bytes(&tid, sizeof tid);
 
+	atomic_set(&ib_nl_sa_request_seq, 0);
+	sa_local_svc_timeout_ms = max(sa_local_svc_timeout_ms,
+				      IB_SA_LOCAL_SVC_TIMEOUT_MIN);
+
 	ret = ib_register_client(&sa_client);
 	if (ret) {
 		printk(KERN_ERR "Couldn't register ib_sa client\n");
@@ -1262,7 +1564,25 @@ static int __init ib_sa_init(void)
 		goto err2;
 	}
 
+	ib_nl_wq = create_singlethread_workqueue("ib_nl_sa_wq");
+	if (!ib_nl_wq) {
+		ret = -ENOMEM;
+		goto err3;
+	}
+
+	if (ibnl_add_client(RDMA_NL_MAD, RDMA_NL_MAD_NUM_OPS,
+			    ib_sa_cb_table)) {
+		pr_err("Failed to add netlink callback\n");
+		ret = -EINVAL;
+		goto err4;
+	}
+	INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout);
+
 	return 0;
+err4:
+	destroy_workqueue(ib_nl_wq);
+err3:
+	mcast_cleanup();
 err2:
 	ib_unregister_client(&sa_client);
 err1:
@@ -1271,6 +1591,10 @@ err1:
 
 static void __exit ib_sa_cleanup(void)
 {
+	ibnl_remove_client(RDMA_NL_MAD);
+	cancel_delayed_work(&ib_nl_timed_work);
+	flush_workqueue(ib_nl_wq);
+	destroy_workqueue(ib_nl_wq);
 	mcast_cleanup();
 	ib_unregister_client(&sa_client);
 	idr_destroy(&query_idr);
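For context, a rough sketch of an unchanged in-kernel consumer follows. With
the patch applied, the same ib_sa_path_rec_get() call is first routed to a
user-space daemon over netlink whenever one is listening on
RDMA_NL_GROUP_MAD, and falls back to the normal MAD path otherwise; the
caller does not change. This is an illustration only: my_sa_client,
my_path_rec_done() and my_path_rec_query() are made-up names, and the
client is assumed to have been registered with ib_sa_register_client().

/* Illustrative consumer sketch -- not part of the patch. */
#include <rdma/ib_sa.h>

static struct ib_sa_client my_sa_client;	/* hypothetical client */

static void my_path_rec_done(int status, struct ib_sa_path_rec *resp,
			     void *context)
{
	/* status < 0 on failure; resp is valid only when status == 0 */
}

static int my_path_rec_query(struct ib_device *device, u8 port_num,
			     union ib_gid *sgid, union ib_gid *dgid,
			     __be16 pkey, void *ctx,
			     struct ib_sa_query **query)
{
	struct ib_sa_path_rec rec = {
		.sgid      = *sgid,
		.dgid      = *dgid,
		.pkey      = pkey,
		.numb_path = 1,
	};

	/*
	 * With this patch, send_mad() tries the netlink route first
	 * (IB_SA_ENABLE_LOCAL_SVC is set inside ib_sa_path_rec_get());
	 * on timeout or failure the query is resent on the IB fabric.
	 */
	return ib_sa_path_rec_get(&my_sa_client, device, port_num, &rec,
				  IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_DGID |
				  IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH,
				  2000 /* timeout_ms */, GFP_KERNEL,
				  my_path_rec_done, ctx, query);
}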