
[3/3] IB/sa: route SA pathrecord query through netlink

Message ID 1431975616-23529-4-git-send-email-kaike.wan@intel.com (mailing list archive)
State Superseded

Commit Message

Wan, Kaike May 18, 2015, 7 p.m. UTC
From: Kaike Wan <kaike.wan@intel.com>

This patch routes a SA pathrecord query to netlink first and processes the
response appropriately. If a failure is returned, the request will be sent
through IB. The decision whether to route the request to netlink first is
determined by the presence of a listener for the MAD netlink multicast
group. If the user-space MAD netlink multicast group listener is not
present, the request will be sent through IB, just like what is currently
being done.

Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: John Fleck <john.fleck@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Sean Hefty <sean.hefty@intel.com>
---
 drivers/infiniband/core/sa_query.c |  334 +++++++++++++++++++++++++++++++++++-
 1 files changed, 329 insertions(+), 5 deletions(-)
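
For context, a minimal sketch of the user-space side this mechanism assumes: a daemon that joins the MAD netlink multicast group, which is what the ibnl_chk_listeners() call in the patch detects. RDMA_NL_MAD, RDMA_NL_MAD_REQUEST and RDMA_NL_GROUP_MAD are assumed to come from patch 1/3 of this series; a real daemon (ibacm is the intended consumer) would parse the SA MAD payload and reply with the same nlmsg_seq. Error handling is trimmed.

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <rdma/rdma_netlink.h>	/* RDMA_NL_GROUP_MAD, assumed from patch 1/3 */

#ifndef SOL_NETLINK
#define SOL_NETLINK 270
#endif

int main(void)
{
	struct sockaddr_nl addr = { .nl_family = AF_NETLINK };
	int group = RDMA_NL_GROUP_MAD;
	char buf[4096];
	int fd, len;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_RDMA);
	if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;

	/* Group membership is what makes the kernel try netlink first. */
	if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
		       &group, sizeof(group)) < 0)
		return 1;

	while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
		struct nlmsghdr *nlh = (struct nlmsghdr *)buf;

		/* The payload is a raw SA MAD; a reply must echo nlmsg_seq. */
		printf("SA request: seq=%u len=%d\n", nlh->nlmsg_seq, len);
	}
	close(fd);
	return 0;
}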

Comments

Or Gerlitz May 19, 2015, 5 a.m. UTC | #1
On 5/18/2015 10:00 PM, kaike.wan@intel.com wrote:

Would be good to have a consistent style for patch titles, e.g. start with a
capital letter as you did for patches 1 and 2 ("route SA ..." --> "Route SA...").
> --- a/drivers/infiniband/core/sa_query.c
> +++ b/drivers/infiniband/core/sa_query.c
> @@ -45,12 +45,22 @@
>   #include <uapi/linux/if_ether.h>
>   #include <rdma/ib_pack.h>
>   #include <rdma/ib_cache.h>
> +#include <rdma/rdma_netlink.h>
> +#include <net/netlink.h>
> +#include <rdma/rdma_netlink.h>
>   #include "sa.h"
>   
>   MODULE_AUTHOR("Roland Dreier");
>   MODULE_DESCRIPTION("InfiniBand subnet administration query support");
>   MODULE_LICENSE("Dual BSD/GPL");
>   
> +#define IB_SA_LOCAL_SVC_TIMEOUT_MIN		100
> +#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT		2000
> +static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT;
> +
> +module_param_named(local_svc_timeout_ms, sa_local_svc_timeout_ms, int, 0644);
> +MODULE_PARM_DESC(local_svc_timeout_ms, "Local service timeout in millisecond");

What's actually the role of the module param? Why does it belong here? Is
it really unavoidable to have it?


>   
> +struct ib_nl_request_info {
> +	struct list_head list;
> +	u32 seq;
> +	unsigned long timeout;
> +	struct ib_sa_query *query;
> +};
> +
> +struct rdma_nl_resp_msg {
> +	struct nlmsghdr nl_hdr;
> +	struct ib_sa_mad sa_mad;
> +};

You use the ib_nl prefix for the request and the rdma_nl prefix for the
response... make it consistent.

Also, "request" and "response" are too generic; maybe use naming that is a
bit more informative about what you are doing here?


> +
> +static LIST_HEAD(ib_nl_request_list);
> +static DEFINE_SPINLOCK(ib_nl_request_lock);
> +static atomic_t ib_nl_sa_request_seq;
> +static struct workqueue_struct *ib_nl_wq;
> +static struct delayed_work ib_nl_timed_work;
> +
>   static void ib_sa_add_one(struct ib_device *device);
>   static void ib_sa_remove_one(struct ib_device *device);
>   
> @@ -381,6 +425,244 @@ static const struct ib_field guidinfo_rec_table[] = {
>   	  .size_bits    = 512 },
>   };
>   
> +static int ib_nl_send_mad(void *mad, int len, u32 seq)
> +{
> +	struct sk_buff *skb = NULL;
> +	struct nlmsghdr *nlh;
> +	void *data;
> +	int ret = 0;
> +
> +	skb = nlmsg_new(len, GFP_KERNEL);
> +	if (!skb) {
> +		pr_err("alloc failed ret=%d\n", ret);
> +		return -ENOMEM;
> +	}
> +
> +	data = ibnl_put_msg(skb, &nlh, seq, len, RDMA_NL_MAD,
> +			    RDMA_NL_MAD_REQUEST, GFP_KERNEL);
> +	if (!data) {
> +		kfree_skb(skb);
> +		return -EMSGSIZE;
> +	}
> +	memcpy(data, mad, len);
> +
> +	ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_MAD, GFP_KERNEL);
> +	if (!ret) {
> +		ret = len;
> +	} else {
> +		if (ret != -ESRCH)
> +			pr_err("ibnl_multicast failed l=%d, r=%d\n", len, ret);
> +		ret = 0;
> +	}
> +	return ret;
> +}
> +
> +static struct ib_nl_request_info *
> +ib_nl_alloc_request(struct ib_sa_query *query)
> +{
> +	struct ib_nl_request_info *rinfo;
> +
> +	rinfo = kzalloc(sizeof(*rinfo), GFP_ATOMIC);
> +	if (rinfo == NULL)
> +		return ERR_PTR(-ENOMEM);
> +
> +	INIT_LIST_HEAD(&rinfo->list);
> +	rinfo->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq);
> +	rinfo->query = query;
> +
> +	return rinfo;
> +}
> +
> +static int ib_nl_send_request(struct ib_nl_request_info *rinfo)
> +{
> +	struct ib_mad_send_buf *send_buf;
> +	unsigned long flags;
> +	unsigned long delay;
> +	int ret;
> +
> +	send_buf = rinfo->query->mad_buf;
> +
> +	delay = msecs_to_jiffies(sa_local_svc_timeout_ms);
> +	spin_lock_irqsave(&ib_nl_request_lock, flags);
> +	ret = ib_nl_send_mad(send_buf->mad,
> +			     (send_buf->data_len + send_buf->hdr_len),
> +			     rinfo->seq);
> +
> +	if (ret != (send_buf->data_len + send_buf->hdr_len)) {
> +		kfree(rinfo);
> +		ret = -EIO;
> +		goto request_out;
> +	} else {
> +		ret = 0;
> +	}
> +
> +	rinfo->timeout = delay + jiffies;
> +	list_add_tail(&rinfo->list, &ib_nl_request_list);
> +	/* Start the timeout if this is the only request */
> +	if (ib_nl_request_list.next == &rinfo->list)
> +		queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
> +
> +request_out:
> +	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> +
> +	return ret;
> +}
> +
> +static int ib_nl_make_request(struct ib_sa_query *query)
> +{
> +	struct ib_nl_request_info *rinfo;
> +
> +	rinfo = ib_nl_alloc_request(query);
> +	if (IS_ERR(rinfo))
> +		return -ENOMEM;
> +
> +	return ib_nl_send_request(rinfo);
> +}
> +
> +static int ib_nl_cancel_request(struct ib_sa_query *query)
> +{
> +	unsigned long flags;
> +	struct ib_nl_request_info *rinfo;
> +	int found = 0;
> +
> +	spin_lock_irqsave(&ib_nl_request_lock, flags);
> +	list_for_each_entry(rinfo, &ib_nl_request_list, list) {
> +		/* Let the timeout to take care of the callback */
> +		if (query == rinfo->query) {
> +			IB_SA_CANCEL_QUERY(query);
> +			rinfo->timeout = jiffies;
> +			list_move(&rinfo->list, &ib_nl_request_list);
> +			found = 1;
> +			mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1);
> +			break;
> +		}
> +	}
> +	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> +
> +	return found;
> +}
> +
> +
> +static int ib_nl_handle_mad_resp(struct sk_buff *skb,
> +				 struct netlink_callback *cb);
> +static struct ibnl_client_cbs ib_sa_cb_table[] = {
> +	[RDMA_NL_MAD_REQUEST] = {
> +		.dump = ib_nl_handle_mad_resp,
> +		.module = THIS_MODULE },
> +};
> +
> +static void send_handler(struct ib_mad_agent *agent,
> +			 struct ib_mad_send_wc *mad_send_wc);
> +
> +static void ib_nl_process_good_rsp(struct ib_sa_query *query,
> +				   struct ib_sa_mad *rsp)
> +{
> +	struct ib_mad_send_wc mad_send_wc;
> +
> +	if (query->callback)
> +		query->callback(query, 0, rsp);
> +
> +	mad_send_wc.send_buf = query->mad_buf;
> +	mad_send_wc.status = IB_WC_SUCCESS;
> +	send_handler(query->mad_buf->mad_agent, &mad_send_wc);
> +}
> +
> +static void ib_nl_request_timeout(struct work_struct *work)
> +{
> +	unsigned long flags;
> +	struct ib_nl_request_info *rinfo;
> +	struct ib_sa_query *query;
> +	unsigned long delay;
> +	struct ib_mad_send_wc mad_send_wc;
> +	int ret;
> +
> +	spin_lock_irqsave(&ib_nl_request_lock, flags);
> +	while (!list_empty(&ib_nl_request_list)) {
> +		rinfo = list_entry(ib_nl_request_list.next,
> +				   struct ib_nl_request_info, list);
> +
> +		if (time_after(rinfo->timeout, jiffies)) {
> +			delay = rinfo->timeout - jiffies;
> +			if ((long)delay <= 0)
> +				delay = 1;
> +			queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
> +			break;
> +		}
> +
> +		list_del(&rinfo->list);
> +		query = rinfo->query;
> +		IB_SA_DISABLE_LOCAL_SVC(query);
> +		/* Hold the lock to protect against query cancellation */
> +		if (IB_SA_QUERY_CANCELLED(query))
> +			ret = -1;
> +		else
> +			ret = ib_post_send_mad(query->mad_buf, NULL);
> +		if (ret) {
> +			mad_send_wc.send_buf = query->mad_buf;
> +			mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
> +			spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> +			send_handler(query->port->agent, &mad_send_wc);
> +			spin_lock_irqsave(&ib_nl_request_lock, flags);
> +		}
> +		kfree(rinfo);
> +	}
> +	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> +}
> +
> +static int ib_nl_handle_mad_resp(struct sk_buff *skb,
> +				 struct netlink_callback *cb)
> +{
> +	struct rdma_nl_resp_msg *nl_msg = (struct rdma_nl_resp_msg *)cb->nlh;
> +	unsigned long flags;
> +	struct ib_nl_request_info *rinfo;
> +	struct ib_sa_query *query;
> +	struct ib_mad_send_buf *send_buf;
> +	struct ib_mad_send_wc mad_send_wc;
> +	int found = 0;
> +	int ret;
> +
> +	spin_lock_irqsave(&ib_nl_request_lock, flags);
> +	list_for_each_entry(rinfo, &ib_nl_request_list, list) {
> +		/*
> +		 * If the query is cancelled, let the timeout routine
> +		 * take care of it.
> +		 */
> +		if (nl_msg->nl_hdr.nlmsg_seq == rinfo->seq) {
> +			found = !IB_SA_QUERY_CANCELLED(rinfo->query);
> +			if (found)
> +				list_del(&rinfo->list);
> +			break;
> +		}
> +	}
> +
> +	if (!found) {
> +		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> +		goto resp_out;
> +	}
> +
> +	query = rinfo->query;
> +	send_buf = query->mad_buf;
> +
> +	if (nl_msg->sa_mad.mad_hdr.status != 0) {
> +		/* if the result is a failure, send out the packet via IB */
> +		IB_SA_DISABLE_LOCAL_SVC(query);
> +		ret = ib_post_send_mad(query->mad_buf, NULL);
> +		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> +		if (ret) {
> +			mad_send_wc.send_buf = send_buf;
> +			mad_send_wc.status = IB_WC_GENERAL_ERR;
> +			send_handler(query->port->agent, &mad_send_wc);
> +		}
> +	} else {
> +		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> +		ib_nl_process_good_rsp(query, &nl_msg->sa_mad);
> +	}
> +
> +	kfree(rinfo);
> +resp_out:
> +	return skb->len;
> +}
> +
>   static void free_sm_ah(struct kref *kref)
>   {
>   	struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
> @@ -502,7 +784,13 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
>   	mad_buf = query->mad_buf;
>   	spin_unlock_irqrestore(&idr_lock, flags);
>   
> -	ib_cancel_mad(agent, mad_buf);
> +	/*
> +	 * If the query is still on the netlink request list, schedule
> +	 * it to be cancelled by the timeout routine. Otherwise, it has been
> +	 * sent to the MAD layer and has to be cancelled from there.
> +	 */
> +	if (!ib_nl_cancel_request(query))
> +		ib_cancel_mad(agent, mad_buf);
>   }
>   EXPORT_SYMBOL(ib_sa_cancel_query);
>   
> @@ -638,6 +926,14 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
>   	query->mad_buf->context[0] = query;
>   	query->id = id;
>   
> +	if (IB_SA_LOCAL_SVC_ENABLED(query)) {
> +		if (!ibnl_chk_listeners(RDMA_NL_GROUP_MAD)) {
> +			if (!ib_nl_make_request(query))
> +				return id;
> +		}
> +		IB_SA_DISABLE_LOCAL_SVC(query);
> +	}
> +
>   	ret = ib_post_send_mad(query->mad_buf, NULL);
>   	if (ret) {
>   		spin_lock_irqsave(&idr_lock, flags);
> @@ -739,7 +1035,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
>   	port  = &sa_dev->port[port_num - sa_dev->start_port];
>   	agent = port->agent;
>   
> -	query = kmalloc(sizeof *query, gfp_mask);
> +	query = kzalloc(sizeof(*query), gfp_mask);
>   	if (!query)
>   		return -ENOMEM;
>   
> @@ -766,6 +1062,8 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
>   
>   	*sa_query = &query->sa_query;
>   
> +	IB_SA_ENABLE_LOCAL_SVC(&query->sa_query);
> +
>   	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
>   	if (ret < 0)
>   		goto err2;
> @@ -861,7 +1159,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client,
>   	    method != IB_SA_METHOD_DELETE)
>   		return -EINVAL;
>   
> -	query = kmalloc(sizeof *query, gfp_mask);
> +	query = kzalloc(sizeof(*query), gfp_mask);
>   	if (!query)
>   		return -ENOMEM;
>   
> @@ -953,7 +1251,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
>   	port  = &sa_dev->port[port_num - sa_dev->start_port];
>   	agent = port->agent;
>   
> -	query = kmalloc(sizeof *query, gfp_mask);
> +	query = kzalloc(sizeof(*query), gfp_mask);
>   	if (!query)
>   		return -ENOMEM;
>   
> @@ -1050,7 +1348,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
>   	port  = &sa_dev->port[port_num - sa_dev->start_port];
>   	agent = port->agent;
>   
> -	query = kmalloc(sizeof *query, gfp_mask);
> +	query = kzalloc(sizeof(*query), gfp_mask);
>   	if (!query)
>   		return -ENOMEM;

Please move the three kmalloc --> kzalloc changes above (and any more such
changes, if they exist in the patch) to a pre-patch.


Ilya Nelkenbaum May 19, 2015, 6:49 a.m. UTC | #2
On 5/18/2015 10:00 PM, kaike.wan@intel.com wrote:
> diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
> index c38f030..69fcd28 100644
> --- a/drivers/infiniband/core/sa_query.c
> +++ b/drivers/infiniband/core/sa_query.c
> @@ -45,12 +45,22 @@
>  #include <uapi/linux/if_ether.h>
>  #include <rdma/ib_pack.h>
>  #include <rdma/ib_cache.h>
> +#include <rdma/rdma_netlink.h>
> +#include <net/netlink.h>
> +#include <rdma/rdma_netlink.h>
Redundant include

>  #include "sa.h"

Wan, Kaike May 19, 2015, 12:24 p.m. UTC | #3
> 
> On 5/18/2015 10:00 PM, kaike.wan@intel.com wrote:
> 
> Would be good to have consistent setup for patch titles, e.g start with capital
> letter as you did for patches 1 and 2 ("route SA ..." --> "Route SA...").

Will be changed. Thank you.

> > --- a/drivers/infiniband/core/sa_query.c
> > +++ b/drivers/infiniband/core/sa_query.c
> > @@ -45,12 +45,22 @@
> >   #include <uapi/linux/if_ether.h>
> >   #include <rdma/ib_pack.h>
> >   #include <rdma/ib_cache.h>
> > +#include <rdma/rdma_netlink.h>
> > +#include <net/netlink.h>
> > +#include <rdma/rdma_netlink.h>
> >   #include "sa.h"
> >
> >   MODULE_AUTHOR("Roland Dreier");
> >   MODULE_DESCRIPTION("InfiniBand subnet administration query
> support");
> >   MODULE_LICENSE("Dual BSD/GPL");
> >
> > +#define IB_SA_LOCAL_SVC_TIMEOUT_MIN		100
> > +#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT		2000
> > +static int sa_local_svc_timeout_ms =
> IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT;
> > +
> > +module_param_named(local_svc_timeout_ms,
> sa_local_svc_timeout_ms,
> > +int, 0644); MODULE_PARM_DESC(local_svc_timeout_ms, "Local service
> > +timeout in millisecond");
> 
> what's actually the role of the module param? why it belongs here? is that
> really unavoidable to have it?

It's nice to provide the capability for the user to adjust the netlink timeout based on his/her fabric setup.
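
(Since the parameter is created with 0644 permissions, it should also be adjustable at runtime, presumably via /sys/module/ib_sa/parameters/local_svc_timeout_ms, assuming sa_query.c stays in the ib_sa module.)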

> 
> 
> >
> > +struct ib_nl_request_info {
> > +	struct list_head list;
> > +	u32 seq;
> > +	unsigned long timeout;
> > +	struct ib_sa_query *query;
> > +};
> > +
> > +struct rdma_nl_resp_msg {
> > +	struct nlmsghdr nl_hdr;
> > +	struct ib_sa_mad sa_mad;
> > +};
> 
> You use ib_nl prefix for request and rdma_nl prefix for response... make
> it consistent.

Will be changed.

> 
> Also, request and response are too generic, maybe use naming which is a
> bit more informative on what you
> are doing here?

That's why we have the ib_nl prefix to indicate a netlink request/response. Any better naming is welcome.

> 
> 
> > +
> > +static LIST_HEAD(ib_nl_request_list);
> > +static DEFINE_SPINLOCK(ib_nl_request_lock);
> > +static atomic_t ib_nl_sa_request_seq;
> > +static struct workqueue_struct *ib_nl_wq;
> > +static struct delayed_work ib_nl_timed_work;
> > +
> >   static void ib_sa_add_one(struct ib_device *device);
> >   static void ib_sa_remove_one(struct ib_device *device);
> >
> > @@ -381,6 +425,244 @@ static const struct ib_field guidinfo_rec_table[] = {
> >   	  .size_bits    = 512 },
> >   };
> >
> > +static int ib_nl_send_mad(void *mad, int len, u32 seq)
> > +{
> > +	struct sk_buff *skb = NULL;
> > +	struct nlmsghdr *nlh;
> > +	void *data;
> > +	int ret = 0;
> > +
> > +	skb = nlmsg_new(len, GFP_KERNEL);
> > +	if (!skb) {
> > +		pr_err("alloc failed ret=%d\n", ret);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	data = ibnl_put_msg(skb, &nlh, seq, len, RDMA_NL_MAD,
> > +			    RDMA_NL_MAD_REQUEST, GFP_KERNEL);
> > +	if (!data) {
> > +		kfree_skb(skb);
> > +		return -EMSGSIZE;
> > +	}
> > +	memcpy(data, mad, len);
> > +
> > +	ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_MAD,
> GFP_KERNEL);
> > +	if (!ret) {
> > +		ret = len;
> > +	} else {
> > +		if (ret != -ESRCH)
> > +			pr_err("ibnl_multicast failed l=%d, r=%d\n", len, ret);
> > +		ret = 0;
> > +	}
> > +	return ret;
> > +}
> > +
> > +static struct ib_nl_request_info *
> > +ib_nl_alloc_request(struct ib_sa_query *query)
> > +{
> > +	struct ib_nl_request_info *rinfo;
> > +
> > +	rinfo = kzalloc(sizeof(*rinfo), GFP_ATOMIC);
> > +	if (rinfo == NULL)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	INIT_LIST_HEAD(&rinfo->list);
> > +	rinfo->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq);
> > +	rinfo->query = query;
> > +
> > +	return rinfo;
> > +}
> > +
> > +static int ib_nl_send_request(struct ib_nl_request_info *rinfo)
> > +{
> > +	struct ib_mad_send_buf *send_buf;
> > +	unsigned long flags;
> > +	unsigned long delay;
> > +	int ret;
> > +
> > +	send_buf = rinfo->query->mad_buf;
> > +
> > +	delay = msecs_to_jiffies(sa_local_svc_timeout_ms);
> > +	spin_lock_irqsave(&ib_nl_request_lock, flags);
> > +	ret = ib_nl_send_mad(send_buf->mad,
> > +			     (send_buf->data_len + send_buf->hdr_len),
> > +			     rinfo->seq);
> > +
> > +	if (ret != (send_buf->data_len + send_buf->hdr_len)) {
> > +		kfree(rinfo);
> > +		ret = -EIO;
> > +		goto request_out;
> > +	} else {
> > +		ret = 0;
> > +	}
> > +
> > +	rinfo->timeout = delay + jiffies;
> > +	list_add_tail(&rinfo->list, &ib_nl_request_list);
> > +	/* Start the timeout if this is the only request */
> > +	if (ib_nl_request_list.next == &rinfo->list)
> > +		queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
> > +
> > +request_out:
> > +	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> > +
> > +	return ret;
> > +}
> > +
> > +static int ib_nl_make_request(struct ib_sa_query *query)
> > +{
> > +	struct ib_nl_request_info *rinfo;
> > +
> > +	rinfo = ib_nl_alloc_request(query);
> > +	if (IS_ERR(rinfo))
> > +		return -ENOMEM;
> > +
> > +	return ib_nl_send_request(rinfo);
> > +}
> > +
> > +static int ib_nl_cancel_request(struct ib_sa_query *query)
> > +{
> > +	unsigned long flags;
> > +	struct ib_nl_request_info *rinfo;
> > +	int found = 0;
> > +
> > +	spin_lock_irqsave(&ib_nl_request_lock, flags);
> > +	list_for_each_entry(rinfo, &ib_nl_request_list, list) {
> > +		/* Let the timeout to take care of the callback */
> > +		if (query == rinfo->query) {
> > +			IB_SA_CANCEL_QUERY(query);
> > +			rinfo->timeout = jiffies;
> > +			list_move(&rinfo->list, &ib_nl_request_list);
> > +			found = 1;
> > +			mod_delayed_work(ib_nl_wq, &ib_nl_timed_work,
> 1);
> > +			break;
> > +		}
> > +	}
> > +	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> > +
> > +	return found;
> > +}
> > +
> > +
> > +static int ib_nl_handle_mad_resp(struct sk_buff *skb,
> > +				 struct netlink_callback *cb);
> > +static struct ibnl_client_cbs ib_sa_cb_table[] = {
> > +	[RDMA_NL_MAD_REQUEST] = {
> > +		.dump = ib_nl_handle_mad_resp,
> > +		.module = THIS_MODULE },
> > +};
> > +
> > +static void send_handler(struct ib_mad_agent *agent,
> > +			 struct ib_mad_send_wc *mad_send_wc);
> > +
> > +static void ib_nl_process_good_rsp(struct ib_sa_query *query,
> > +				   struct ib_sa_mad *rsp)
> > +{
> > +	struct ib_mad_send_wc mad_send_wc;
> > +
> > +	if (query->callback)
> > +		query->callback(query, 0, rsp);
> > +
> > +	mad_send_wc.send_buf = query->mad_buf;
> > +	mad_send_wc.status = IB_WC_SUCCESS;
> > +	send_handler(query->mad_buf->mad_agent, &mad_send_wc);
> > +}
> > +
> > +static void ib_nl_request_timeout(struct work_struct *work)
> > +{
> > +	unsigned long flags;
> > +	struct ib_nl_request_info *rinfo;
> > +	struct ib_sa_query *query;
> > +	unsigned long delay;
> > +	struct ib_mad_send_wc mad_send_wc;
> > +	int ret;
> > +
> > +	spin_lock_irqsave(&ib_nl_request_lock, flags);
> > +	while (!list_empty(&ib_nl_request_list)) {
> > +		rinfo = list_entry(ib_nl_request_list.next,
> > +				   struct ib_nl_request_info, list);
> > +
> > +		if (time_after(rinfo->timeout, jiffies)) {
> > +			delay = rinfo->timeout - jiffies;
> > +			if ((long)delay <= 0)
> > +				delay = 1;
> > +			queue_delayed_work(ib_nl_wq,
> &ib_nl_timed_work, delay);
> > +			break;
> > +		}
> > +
> > +		list_del(&rinfo->list);
> > +		query = rinfo->query;
> > +		IB_SA_DISABLE_LOCAL_SVC(query);
> > +		/* Hold the lock to protect against query cancellation */
> > +		if (IB_SA_QUERY_CANCELLED(query))
> > +			ret = -1;
> > +		else
> > +			ret = ib_post_send_mad(query->mad_buf, NULL);
> > +		if (ret) {
> > +			mad_send_wc.send_buf = query->mad_buf;
> > +			mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
> > +			spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> > +			send_handler(query->port->agent, &mad_send_wc);
> > +			spin_lock_irqsave(&ib_nl_request_lock, flags);
> > +		}
> > +		kfree(rinfo);
> > +	}
> > +	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> > +}
> > +
> > +static int ib_nl_handle_mad_resp(struct sk_buff *skb,
> > +				 struct netlink_callback *cb)
> > +{
> > +	struct rdma_nl_resp_msg *nl_msg = (struct rdma_nl_resp_msg
> *)cb->nlh;
> > +	unsigned long flags;
> > +	struct ib_nl_request_info *rinfo;
> > +	struct ib_sa_query *query;
> > +	struct ib_mad_send_buf *send_buf;
> > +	struct ib_mad_send_wc mad_send_wc;
> > +	int found = 0;
> > +	int ret;
> > +
> > +	spin_lock_irqsave(&ib_nl_request_lock, flags);
> > +	list_for_each_entry(rinfo, &ib_nl_request_list, list) {
> > +		/*
> > +		 * If the query is cancelled, let the timeout routine
> > +		 * take care of it.
> > +		 */
> > +		if (nl_msg->nl_hdr.nlmsg_seq == rinfo->seq) {
> > +			found = !IB_SA_QUERY_CANCELLED(rinfo->query);
> > +			if (found)
> > +				list_del(&rinfo->list);
> > +			break;
> > +		}
> > +	}
> > +
> > +	if (!found) {
> > +		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> > +		goto resp_out;
> > +	}
> > +
> > +	query = rinfo->query;
> > +	send_buf = query->mad_buf;
> > +
> > +	if (nl_msg->sa_mad.mad_hdr.status != 0) {
> > +		/* if the result is a failure, send out the packet via IB */
> > +		IB_SA_DISABLE_LOCAL_SVC(query);
> > +		ret = ib_post_send_mad(query->mad_buf, NULL);
> > +		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> > +		if (ret) {
> > +			mad_send_wc.send_buf = send_buf;
> > +			mad_send_wc.status = IB_WC_GENERAL_ERR;
> > +			send_handler(query->port->agent, &mad_send_wc);
> > +		}
> > +	} else {
> > +		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> > +		ib_nl_process_good_rsp(query, &nl_msg->sa_mad);
> > +	}
> > +
> > +	kfree(rinfo);
> > +resp_out:
> > +	return skb->len;
> > +}
> > +
> >   static void free_sm_ah(struct kref *kref)
> >   {
> >   	struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah,
> ref);
> > @@ -502,7 +784,13 @@ void ib_sa_cancel_query(int id, struct ib_sa_query
> *query)
> >   	mad_buf = query->mad_buf;
> >   	spin_unlock_irqrestore(&idr_lock, flags);
> >
> > -	ib_cancel_mad(agent, mad_buf);
> > +	/*
> > +	 * If the query is still on the netlink request list, schedule
> > +	 * it to be cancelled by the timeout routine. Otherwise, it has been
> > +	 * sent to the MAD layer and has to be cancelled from there.
> > +	 */
> > +	if (!ib_nl_cancel_request(query))
> > +		ib_cancel_mad(agent, mad_buf);
> >   }
> >   EXPORT_SYMBOL(ib_sa_cancel_query);
> >
> > @@ -638,6 +926,14 @@ static int send_mad(struct ib_sa_query *query, int
> timeout_ms, gfp_t gfp_mask)
> >   	query->mad_buf->context[0] = query;
> >   	query->id = id;
> >
> > +	if (IB_SA_LOCAL_SVC_ENABLED(query)) {
> > +		if (!ibnl_chk_listeners(RDMA_NL_GROUP_MAD)) {
> > +			if (!ib_nl_make_request(query))
> > +				return id;
> > +		}
> > +		IB_SA_DISABLE_LOCAL_SVC(query);
> > +	}
> > +
> >   	ret = ib_post_send_mad(query->mad_buf, NULL);
> >   	if (ret) {
> >   		spin_lock_irqsave(&idr_lock, flags);
> > @@ -739,7 +1035,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
> >   	port  = &sa_dev->port[port_num - sa_dev->start_port];
> >   	agent = port->agent;
> >
> > -	query = kmalloc(sizeof *query, gfp_mask);
> > +	query = kzalloc(sizeof(*query), gfp_mask);
> >   	if (!query)
> >   		return -ENOMEM;
> >
> > @@ -766,6 +1062,8 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
> >
> >   	*sa_query = &query->sa_query;
> >
> > +	IB_SA_ENABLE_LOCAL_SVC(&query->sa_query);
> > +
> >   	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
> >   	if (ret < 0)
> >   		goto err2;
> > @@ -861,7 +1159,7 @@ int ib_sa_service_rec_query(struct ib_sa_client
> *client,
> >   	    method != IB_SA_METHOD_DELETE)
> >   		return -EINVAL;
> >
> > -	query = kmalloc(sizeof *query, gfp_mask);
> > +	query = kzalloc(sizeof(*query), gfp_mask);
> >   	if (!query)
> >   		return -ENOMEM;
> >
> > @@ -953,7 +1251,7 @@ int ib_sa_mcmember_rec_query(struct
> ib_sa_client *client,
> >   	port  = &sa_dev->port[port_num - sa_dev->start_port];
> >   	agent = port->agent;
> >
> > -	query = kmalloc(sizeof *query, gfp_mask);
> > +	query = kzalloc(sizeof(*query), gfp_mask);
> >   	if (!query)
> >   		return -ENOMEM;
> >
> > @@ -1050,7 +1348,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client
> *client,
> >   	port  = &sa_dev->port[port_num - sa_dev->start_port];
> >   	agent = port->agent;
> >
> > -	query = kmalloc(sizeof *query, gfp_mask);
> > +	query = kzalloc(sizeof(*query), gfp_mask);
> >   	if (!query)
> >   		return -ENOMEM;
> 
> Please move the three kmalloc --> kzalloc changes above (and more of
> their such if exist in the patch) to a pre-patch

Will do.

Thank you,

Kaike
> 

Wan, Kaike May 19, 2015, 12:57 p.m. UTC | #4
> On 5/18/2015 10:00 PM, kaike.wan@intel.com wrote:
> > diff --git a/drivers/infiniband/core/sa_query.c
> b/drivers/infiniband/core/sa_query.c
> > index c38f030..69fcd28 100644
> > --- a/drivers/infiniband/core/sa_query.c
> > +++ b/drivers/infiniband/core/sa_query.c
> > @@ -45,12 +45,22 @@
> >  #include <uapi/linux/if_ether.h>
> >  #include <rdma/ib_pack.h>
> >  #include <rdma/ib_cache.h>
> > +#include <rdma/rdma_netlink.h>
> > +#include <net/netlink.h>
> > +#include <rdma/rdma_netlink.h>
> Redundant include

Will be removed.

Thank you,

Kaike
> 
> >  #include "sa.h"

Or Gerlitz May 19, 2015, 6:30 p.m. UTC | #5
On Tue, May 19, 2015 at 3:24 PM, Wan, Kaike <kaike.wan@intel.com> wrote:
>> On 5/18/2015 10:00 PM, kaike.wan@intel.com wrote:

>> > --- a/drivers/infiniband/core/sa_query.c
>> > +++ b/drivers/infiniband/core/sa_query.c
>> > @@ -45,12 +45,22 @@

>> what's actually the role of the module param? why it belongs here? is that
>> really unavoidable to have it?

> It's nice to provide the capability for the user to adjust the netlink timeout based his/her fabric setup.

NO, adding module params should be really your last resort when
everything else wouldn't work, definitely not the case here, remove
it.
Wan, Kaike May 19, 2015, 6:42 p.m. UTC | #6
> 
> >> what's actually the role of the module param? why it belongs here? is
> >> that really unavoidable to have it?
> 
> > It's nice to provide the capability for the user to adjust the netlink timeout
> based his/her fabric setup.
> 
> NO, adding module params should be really your last resort when everything
> else wouldn't work, definitely not the case here, remove it.

Are you suggesting that we use the timeout the caller provides? Or if there is no timeout (0) provided, use a default one? Looking through the IB drivers, I do find many module parameters.
Or Gerlitz May 19, 2015, 6:51 p.m. UTC | #7
On Tue, May 19, 2015 at 9:42 PM, Wan, Kaike <kaike.wan@intel.com> wrote:
>>
>> >> what's actually the role of the module param? why it belongs here? is
>> >> that really unavoidable to have it?
>>
>> > It's nice to provide the capability for the user to adjust the netlink timeout
>> based his/her fabric setup.
>>
>> NO, adding module params should be really your last resort when everything
>> else wouldn't work, definitely not the case here, remove it.
>
> Are you suggesting that we use the timeout the caller provides? Or if there is no timeout (0) provided, use a default one?

Not sure, pick one of these, whatever works for you the best.

> Looking through the IB drivers, I do find many module parameters.

One of the most basic rules in kernel programming (and suggested for
life too) is that if you see something done by other drivers, it doesn't
necessarily mean it's the right thing to do [1]

Module params are problematic, and sometimes cause way more damage
than benefit; some subsystem kernel maintainers kind of disallow them
totally (networking). Maybe do some reading and talking with kernel
developers @ your firm to learn more.

Or.

[1] many times the nastiest bugs originate from cut-and-paste
Jason Gunthorpe May 19, 2015, 7:07 p.m. UTC | #8
On Mon, May 18, 2015 at 03:00:16PM -0400, kaike.wan@intel.com wrote:
> From: Kaike Wan <kaike.wan@intel.com>
> 
> This patch routes a SA pathrecord query to netlink first and processes the
> response appropriately. If a failure is returned, the request will be sent
> through IB. The decision whether to route the request to netlink first is
> determined by the presence of a listener for the MAD netlink multicast
> group. If the user-space MAD netlink multicast group listener is not
> present, the request will be sent through IB, just like what is currently
> being done.

I have mixed feelings about using the SA message format for this.

In-kernel we don't use that format, and many of the newer APIs were
designed around passing the GMP,Forward,Return path tuple, not just a
naked GMP.

I've argued for this in the past.

I'd rather see the netlink data ask for what the kernel needs (eg UD
path to X, RC path to X) and return the new format we've already
established for AF_IB

Jason
Ira Weiny May 19, 2015, 7:17 p.m. UTC | #9
> 
> On Tue, May 19, 2015 at 9:42 PM, Wan, Kaike <kaike.wan@intel.com> wrote:
> >>
> >> >> what's actually the role of the module param? why it belongs here?
> >> >> is that really unavoidable to have it?
> >>
> >> > It's nice to provide the capability for the user to adjust the
> >> > netlink timeout
> >> based his/her fabric setup.
> >>
> >> NO, adding module params should be really your last resort when
> >> everything else wouldn't work, definitely not the case here, remove it.
> >
> > Are you suggesting that we use the timeout the caller provides? Or if there is
> no timeout (0) provided, use a default one?
> 
> Not sure, pick one of these, whatever works for you the best.

No this will not work.  We don't want to use the timeout the caller provides.  The timeout for netlink may (and should be) different from the timeout needed when sending to the SA directly.

> 
> > Looking through the IB drivers, I do find many module parameters.
> 
> One of the most basic rules in kernel programming (and suggested for life too),
> is that if you something done by other drivers, this doesn't necessarily means
> it's the right thing to do [1]

This may be true...

> 
> Module params are problematic, and sometimes cause way more damage then
> benefit, some subsystem kernel maintainers kind of disallow them totally
> (networking), maybe do some reading and talking with kernel developers @
> your firm to learn more.

I have searched high and low for the problems with module parameters and have found no documentation which disallows their use.  Do you have a reference which shows why they should be disallowed?  While other mechanisms can be used, the use of this parameter is very simple, and the addition of a module parameter seems appropriate in this case.

Ira
Jason Gunthorpe May 19, 2015, 7:21 p.m. UTC | #10
On Tue, May 19, 2015 at 07:17:49PM +0000, Weiny, Ira wrote:
> > Not sure, pick one of these, whatever works for you the best.
> 
> No this will not work.  We don't want to use the timeout the caller
> provides.  The timeout for netlink may (and should be) different
> from the timeout needed when sending to the SA directly.

Why?

> N?????r??y????b?X???v?^?)?{.n?+????{????{ay????,j??f???h???z??w??????j:+v???w?j?m????????zZ+??????j"??!

Does anyone else ever wonder what this garbage is on some of the
messages from @intel folks?

Jason
Ira Weiny May 19, 2015, 7:26 p.m. UTC | #11
On Tue, May 19, 2015 at 01:21:41PM -0600, Jason Gunthorpe wrote:
> On Tue, May 19, 2015 at 07:17:49PM +0000, Weiny, Ira wrote:
> > > Not sure, pick one of these, whatever works for you the best.
> > 
> > No this will not work.  We don't want to use the timeout the caller
> > provides.  The timeout for netlink may (and should be) different
> > from the timeout needed when sending to the SA directly.
> 
> Why?

The best use case is the desire to have the user space cache issue the query to
the SA on behalf of this request, cache the data, and return the response.
This means the Netlink timeout needs to be longer than the SA direct timeout.

> 
> > N?????r??y????b?X???v?^?)?{.n?+????{????{ay????,j??f???h???z??w??????j:+v???w?j?m????????zZ+??????j"??!
> 
> Does anyone else ever wonder what this garbage is on some of the
> messages from @intel folks?

Outlook issues...  This msg should be better...

Ira

Jason Gunthorpe May 19, 2015, 7:28 p.m. UTC | #12
On Tue, May 19, 2015 at 03:26:15PM -0400, ira.weiny wrote:

> The best use case is the desire to have the user space cache issue the query to
> the SA on behalf of this request, cache the data, and return the response.

> This means the Netlink timeout needs to be longer than the SA direct timeout.

By how much?

The SA timeout is already huge and has lots of slack, adding another
timeout without an actual need seems premature..

Jason
Hefty, Sean May 19, 2015, 7:58 p.m. UTC | #13
> > N?????r??y????b?X???v?^?)?{.n?+????{????{ay????,j??f???h???z??w??????j:+v???w?j?m????????zZ+??????j"??!
> 
> Does anyone else ever wonder what this garbage is on some of the
> messages from @intel folks?

I believe it's some weird formatting of:

> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

I have no idea why we get it.  I send/receive in plain text.
Hefty, Sean May 19, 2015, 8:27 p.m. UTC | #14
> I have mixed feelings about using the SA message format for this.
> 
> In-kernel we don't use that format, and many of the newer APIs were
> designed around passing the GMP,Forward,Return path tuple, not just a
> naked GMP.

I think the choice of the netlink protocol is largely a matter of taste.  In the internal review of these patches, the netlink hook was previously in the mad layer, which would have allowed redirecting any mad to user space.  (The code only attempted to redirect SA queries.)  Trying to hook from within ib_mad added a fair amount of complexity that moving up into sa_query avoided.  The proposed interface would allow moving the changes back down, versus limiting this only to the SA query calls.  (And I'm not meaning to imply that limiting this only to SA queries is a bad choice.)

> I'd rather see the netlink data ask for what the kernel needs (eg UD
> path to X, RC path to X) and return the new format we've already
> established for AF_IB

Does anyone else have input on the netlink protocol?  Hal/Ira?  MADs seem more flexible, but higher-level structures more efficient and natural.
Jason Gunthorpe May 19, 2015, 8:29 p.m. UTC | #15
On Tue, May 19, 2015 at 07:58:32PM +0000, Hefty, Sean wrote:
> > Does anyone else ever wonder what this garbage is on some of the
> > messages from @intel folks?
> 
> I believe it's some weird formatting of:
> 
> > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> I have no idea why we get it.  I send/receive in plain text. 

Hum, I checked one of your messages that had this and it isn't plain
text, it was actually sent base64 encoded:

Message-ID: <1828884A29C6694DAF28B7E6B8A82373A8FCA2F1@ORSMSX109.amr.corp.intel.com>
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: base64

So the garbage is what you get when you base 64 decode the standard
list trailer.

No idea why your side is generating base 64 encodings, but it is
clearly a list bug that it blindly appends the trailer...

Jason
Jason Gunthorpe May 19, 2015, 8:49 p.m. UTC | #16
On Tue, May 19, 2015 at 08:27:22PM +0000, Hefty, Sean wrote:
> > I have mixed feelings about using the SA message format for this.
> > 
> > In-kernel we don't use that format, and many of the newer APIs were
> > designed around passing the GMP,Forward,Return path tuple, not just a
> > naked GMP.
> 
> I think the choice of the netlink protocol is a largely matter of
> taste.  In the internal review of these patches, the netlink hook
> was previously in the mad layer, which would have allowed
> redirecting any mad to user space.  (The code only attempted to
> redirect SA queries.)  Trying to hook from within ib_mad added a
> fair amount of complexity, that moving up into sa_query avoided.
> The proposed interface would allow moving the changes back down,
> versus limiting this only to the SA query calls.  (And I'm not
> meaning to imply that limiting this only to SA queries is a bad
> choice.)

I'm not sure there is much use case for letting user space get
involved in arbitrary MAD traffic..

Over-generalizing feels like over-design and doesn't let us add
valuable information that could help push policy decisions into user
space, like passing the IP and TOS, QP type, etc. when available, to
userspace.

If there is a use case for that, it still cleanly layers, the cache NL
query goes first, if that fails then the MAD-based NL query goes next.

> > I'd rather see the netlink data ask for what the kernel needs (eg UD
> > path to X, RC path to X) and return the new format we've already
> > established for AF_IB
> 
> Does anyone else have input on the netlink protocol?  Hal/Ira?  MADs
> seem more flexible, but higher-level structures more efficient and
> natural.

MADs are not more flexible; they are fixed and controlled by something
outside the kernel. Their upside is we can track certain limited
changes to the standard without too much effort.

But the same reasons we didn't use them for AF_IB holds here too:
 - Routers are not supported
 - Asymmetric (non-reversible) mesh networks are not supported

We shouldn't bake in the already-known limitations of PathRecord when
adding a new scheme.

Userspace should provide the same kind of information as AF_IB, in a
netlink container that could be extended in future.
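
(To make the idea concrete, a hypothetical attribute layout for such a container -- every name below is illustrative only and not part of the posted series:)

/* Illustrative attributes for a "resolve a path to X" netlink request,
 * carrying AF_IB-style addressing instead of a raw SA MAD. */
enum {
	LS_NLA_TYPE_UNSPEC,
	LS_NLA_TYPE_PATH_USE,	/* u8: GMP, unidirectional, or all paths */
	LS_NLA_TYPE_SERVICE_ID,	/* u64, as carried in sockaddr_ib */
	LS_NLA_TYPE_DGID,	/* 16-byte destination GID */
	LS_NLA_TYPE_SGID,	/* 16-byte source GID */
	LS_NLA_TYPE_TCLASS,	/* u8 traffic class */
	LS_NLA_TYPE_PKEY,	/* u16 partition key */
	LS_NLA_TYPE_QOS_CLASS,	/* u16 */
	LS_NLA_TYPE_MAX
};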

Jason
Hefty, Sean May 19, 2015, 9:20 p.m. UTC | #17
> I'm not sure there is much use case for letting user space get
> involved in arbitary MAD traffic..

I was thinking more about easily supporting other queries - ServiceRecords, MCMemberRecords, etc.  The only use case I'm currently aware of is PathRecords, however.

> Over generalizing feels like over design and doesn't let us add
> valuable information that could help push policy decisions into user
> space, like passing the IP and TOS, QP type, etc when available, to
> userspace.

I agree.  This level of control would be better.  The current security model of IB seems hopelessly broken.  IMO, all path data should come from privileged users, without any ability of the app to modify it.

Changing the protocol shouldn't be a big deal, though if you wanted to make my life easy, we could just adopt the ibacm sockets protocol directly.  :)

- Sean
Ira Weiny May 19, 2015, 11:15 p.m. UTC | #18
On Tue, May 19, 2015 at 02:27:22PM -0600, Hefty, Sean wrote:
> > I have mixed feelings about using the SA message format for this.
> > 
> > In-kernel we don't use that format, and many of the newer APIs were
> > designed around passing the GMP,Forward,Return path tuple, not just a
> > naked GMP.
> 
> I think the choice of the netlink protocol is a largely matter of taste.  In the internal review of these patches, the netlink hook was previously in the mad layer, which would have allowed redirecting any mad to user space.  (The code only attempted to redirect SA queries.)  Trying to hook from within ib_mad added a fair amount of complexity, that moving up into sa_query avoided.  The proposed interface would allow moving the changes back down, versus limiting this only to the SA query calls.  (And I'm not meaning to imply that limiting this only to SA queries is a bad choice.)
> 
> > I'd rather see the netlink data ask for what the kernel needs (eg UD
> > path to X, RC path to X) and return the new format we've already
> > established for AF_IB
> 
> Does anyone else have input on the netlink protocol?  Hal/Ira?  MADs seem more flexible, but higher-level structures more efficient and natural.


The idea was to implement a local SA cache.  Nothing more.  A lot of what is
being discussed is beyond the intent of the patches.

Ira

Ira Weiny May 19, 2015, 11:53 p.m. UTC | #19
On Tue, May 19, 2015 at 03:20:09PM -0600, Hefty, Sean wrote:
> > I'm not sure there is much use case for letting user space get
> > involved in arbitary MAD traffic..
> 
> I was thinking more about easily supporting other queries - ServiceRecords, MCMemberRecords, etc.  The only use case I'm currently aware of is PathRecords, however.
> 

I think MCMemberRecords (including joins) are good future candidates for this
interface.  The SSA work which Hal is doing had some plans for doing this.

> > Over generalizing feels like over design and doesn't let us add
> > valuable information that could help push policy decisions into user
> > space, like passing the IP and TOS, QP type, etc when available, to
> > userspace.
> 
> I agree.  This level of control would be better.  The current security model of IB seems hopelessly broken.  IMO, all path data should come from privileged users, without any ability of the app to modify it.
> 

I agree with your statement, but I'm not sure how Jason's proposal helps to
control the use of PR data.  As long as verbs supports the individual setting
of PR data such as the SL, an application can change this data.

> Changing the protocol shouldn't be a big deal, though if you wanted to make my life easy, we could just adopt the ibacm sockets protocol directly.  :)
> 

ib_sa_path_rec_get is pretty specific in what it does.  Without over-designing, this series
adds access to the existing SA data cache (ibacm) for those users who call this
function.

I think that extending this to return an array of paths should be tackled in
future patches.  This future work could use a different protocol but I don't
think it would be part of ib_sa_path_rec_get.

Ira

Wan, Kaike May 19, 2015, 11:55 p.m. UTC | #20
> 
> On Tue, May 19, 2015 at 02:27:22PM -0600, Hefty, Sean wrote:
> > > I have mixed feelings about using the SA message format for this.
> > >
> > > In-kernel we don't use that format, and many of the newer APIs were
> > > designed around passing the GMP,Forward,Return path tuple, not just
> > > a naked GMP.
> >
> > I think the choice of the netlink protocol is a largely matter of
> > taste.  In the internal review of these patches, the netlink hook was
> > previously in the mad layer, which would have allowed redirecting any
> > mad to user space.  (The code only attempted to redirect SA queries.)
> > Trying to hook from within ib_mad added a fair amount of complexity,
> > that moving up into sa_query avoided.  The proposed interface would
> > allow moving the changes back down, versus limiting this only to the
> > SA query calls.  (And I'm not meaning to imply that limiting this only
> > to SA queries is a bad choice.)
> >
> > > I'd rather see the netlink data ask for what the kernel needs (eg UD
> > > path to X, RC path to X) and return the new format we've already
> > > established for AF_IB
> >
> > Does anyone else have input on the netlink protocol?  Hal/Ira?  MADs seem
> more flexible, but higher-level structures more efficient and natural.
> 
> 
> The idea was to implement a local SA cache.  Nothing more.  A lot of what is
> being discussed is beyond the intent of the patches.
> 
> Ira

The higher-level scheme discussed so far can be implemented using a similar mechanism (netlink), but with a different netlink multicast group, and therefore a different data exchange format. Maybe it should be done in another patch series.

Kaike

Jason Gunthorpe May 20, 2015, 12:22 a.m. UTC | #21
On Tue, May 19, 2015 at 07:53:04PM -0400, ira.weiny wrote:

> I think that extending this to return an array of paths should be tackled in
> future patches.  This future work could use a different protocol but I don't
> think it would be part of ib_sa_path_rec_get.

No, this is a UAPI; we have to live with it forever. It needs to be
well designed, not just rammed into whatever already exists.

I already have an experimental patch series that extends CM to support
async scenarios, so the basic assumption this UAPI is premised on is already
wrong.

Jason
Ira Weiny May 20, 2015, 9:05 a.m. UTC | #22
On Tue, May 19, 2015 at 01:28:42PM -0600, Jason Gunthorpe wrote:
> On Tue, May 19, 2015 at 03:26:15PM -0400, ira.weiny wrote:
> 
> > The best use case is the desire to have the user space cache issue the query to
> > the SA on behalf of this request, cache the data, and return the response.
> 
> > This means the Netlink timeout needs to be longer than the SA direct timeout.
> 
> By how much?

It depends on the fabric size, SA load, userspace implementation, etc., the last
of which is potentially the most important.

In general I would say we could take SubnetTimeOut + 1 sec as a good starting
point if the userspace implementation were to issue individual PR queries.

If it attempted some sort of bulk update of its cache in anticipation of
additional queries, it could be more.  IBSSA, for example, may detect an epoch
change and download a significant number of PRs when this query occurs.  How
long this takes is a function of that cache and not anything kernel space can
or should know.

> 
> The SA timeout is already huge and has lots of slack, adding another
> timeout without an actual need seems premature..
> 

Define huge?  If I'm doing my math right, the max subnet timeout is > 8,000
seconds.  None of the kernel users actually use the subnet timeout, and most are
nowhere near that max; only 9P seems "huge".
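
(A quick sketch of that arithmetic, assuming the usual 4.096 us * 2^timeout encoding with a 5-bit field:)

#include <stdio.h>

int main(void)
{
	unsigned int subnet_timeout = 31;	/* max value of the 5-bit field */
	double sec = 4.096e-6 * (double)(1ULL << subnet_timeout);

	/* Prints ~8796 s, i.e. the "> 8,000 seconds" figure above. */
	printf("max subnet timeout: %.0f s\n", sec);
	return 0;
}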

iser:
        ret = rdma_resolve_route(cma_id, 1000); <=== 1 second

9P:
#define P9_RDMA_TIMEOUT         30000           /* 30 seconds */

rds:
#define RDS_RDMA_RESOLVE_TIMEOUT_MS     5000 <=== 5 seconds

NFSoRDMA:
#define RDMA_RESOLVE_TIMEOUT    (5000)  /* 5 seconds */

IPoIB:


                ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
                                   &path->pathrec,
                                   IB_SA_PATH_REC_DGID          |
                                   IB_SA_PATH_REC_SGID          |
                                   IB_SA_PATH_REC_NUMB_PATH     |
                                   IB_SA_PATH_REC_TRAFFIC_CLASS |
                                   IB_SA_PATH_REC_PKEY,
1 second ==================>       1000, GFP_ATOMIC,
                                   path_rec_completion,
                                   path, &path->query);

SRP:
        SRP_PATH_REC_TIMEOUT_MS = 1000,   <=== 1 second



The other issue is that each caller in the kernel specifies a different
timeout.  Defining this in one central place and allowing user space to control
the policy of that timeout is much better than allowing the kernel clients to
specify the timeout as they do now.

Ira

Hefty, Sean May 20, 2015, 4:13 p.m. UTC | #23
> The other issue is that each caller in the kernel specifies a different
> timeout.  Defining this in 1 central place and allowing user space to
> control
> the policy of that timeout is much better than allowing the kernel clients
> to
> specify the timeout as they do now.

Everything has been randomly hard-coded.  IMO, the sa_query module should use its own timeout value, which it updates based on how fast it actually gets responses.  But that takes too much work, and no one is ever going to write the code to do this.
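
(For what it's worth, such a self-tuning timeout could be as small as this sketch, along the lines of TCP's smoothed RTT -- purely hypothetical, not part of this series:)

/* Keep a smoothed response time and derive the deadline from it,
 * instead of letting every kernel client hard-code its own value. */
static unsigned long sa_srtt_ms = 2000;		/* smoothed response time */

static void sa_update_srtt(unsigned long sample_ms)
{
	/* Classic EWMA: srtt = 7/8 * srtt + 1/8 * sample. */
	sa_srtt_ms = (7 * sa_srtt_ms + sample_ms) / 8;
}

static unsigned long sa_query_timeout_ms(void)
{
	return 2 * sa_srtt_ms;	/* deadline with some slack */
}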

For the netlink-specific problem, I'll propose using a different randomly hard-coded value as a timeout.  Then define an 'MRA' type of message that user space can send to the kernel in order to ask it to wait longer.  The 'set timeout' message could apply to a single request or all future requests.  If we only wanted the 'all future requests' option, the data value could be written into a file.  In any case, this pushes the policy of the timeout value into the hands of the responding daemon.

- Sean
Ira Weiny May 20, 2015, 4:25 p.m. UTC | #24
On Wed, May 20, 2015 at 10:13:59AM -0600, Hefty, Sean wrote:
> > The other issue is that each caller in the kernel specifies a different
> > timeout.  Defining this in 1 central place and allowing user space to
> > control
> > the policy of that timeout is much better than allowing the kernel clients
> > to
> > specify the timeout as they do now.
> 
> Everything has been randomly hard-coded.  IMO, the sa_query module should use
> its own timeout value, which it updates based on how fast it actually gets
> responses.  But that takes too much work, and no one is ever going to write
> the code to do this.

I agree.  So lets not do this again.

> 
> For the netlink specific problem, I'll propose using a different randomly
> hard-coded value as a timeout.

Isn't this what we have as the starting default for the module parameter?

> Then define an 'MRA' type of message that
> user space can send to the kernel in order to ask it to wait longer.  The
> 'set timeout' message could apply to a single request or all future requests.
> If we only wanted to the 'all future requests' option, the data value could
> be written into a file.

This is effectively what we have with the module parameter.

> In any case, this pushes the policy of the timeout
> value into the hands of the responding daemon.
> 

I'm flexible on the mechanism used.  In addition to the module parameter, we
discussed sysctl as well as an additional control protocol for configuring the
communication.  We avoided these to keep the mechanism simple.  But since we
are going down the path of designing a new protocol, I think it is reasonable to
have some "control" packets there.

Ira

Jason Gunthorpe May 20, 2015, 5:34 p.m. UTC | #25
On Wed, May 20, 2015 at 04:13:59PM +0000, Hefty, Sean wrote:
> > The other issue is that each caller in the kernel specifies a different
> > timeout.  Defining this in 1 central place and allowing user space to
> > control
> > the policy of that timeout is much better than allowing the kernel clients
> > to
> > specify the timeout as they do now.
> 
> Everything has been randomly hard-coded.  IMO, the sa_query module
> should use its own timeout value, which it updates based on how fast
> it actually gets responses.  But that takes too much work, and no
> one is ever going to write the code to do this.

The IB spec actually says how to compute the timeout for the SA, and
if done properly, the SM will configure a timeout appropriate for the
network. It looks like everything the kernel does in this area is
basically wrong..
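
(Roughly, deriving the SA timeout from the SM-programmed subnet timeout could look like the sketch below; the 4.096 us * 2^subnet_timeout encoding is from the spec, while the doubling and the 100 ms floor are guesses:)

/* Sketch: derive an SA query timeout from the port attributes instead
 * of hard-coding one per caller (needs <rdma/ib_verbs.h> and
 * <linux/math64.h>). */
static unsigned long sa_timeout_ms(struct ib_device *device, u8 port_num)
{
	struct ib_port_attr attr;
	u64 nsec;

	if (ib_query_port(device, port_num, &attr))
		return 1000;	/* fall back to today's common guess */

	nsec = 4096ULL << attr.subnet_timeout;	/* 4.096 us * 2^st */
	/* Round trip plus SA processing slack, with a 100 ms floor. */
	return max_t(unsigned long, 2 * div_u64(nsec, NSEC_PER_MSEC), 100);
}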

> For the netlink specific problem, I'll propose using a different
> randomly hard-coded value as a timeout.  Then define an 'MRA' type
> of message that user space can send to the kernel in order to ask it
> to wait longer.  The 'set timeout' message could apply to a single
> request or all future requests.  If we only wanted to the 'all
> future requests' option, the data value could be written into a
> file.  In any case, this pushes the policy of the timeout value into
> the hands of the responding daemon.

A fixed, well-known timeout (5s?) plus requiring that user space send
an 'operation in progress' positive liveness indication seems very
reasonable.

The only purpose of the timeout is to detect a locked-up daemon so IB
doesn't lock up; a watchdog-like scheme therefore seems appropriate.
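
(Reusing the request-list structures from the patch below, such a
watchdog could be a one-line deadline bump; the handler itself is
hypothetical:)

/*
 * Hypothetical handler for a userspace 'operation in progress'
 * message: push the matching request's deadline out by one more
 * watchdog period instead of letting ib_nl_request_timeout() fail
 * it over to the IB path.
 */
static int ib_nl_handle_keepalive(u32 seq)
{
	struct ib_nl_request_info *rinfo;
	unsigned long flags;
	int ret = -ENOENT;

	spin_lock_irqsave(&ib_nl_request_lock, flags);
	list_for_each_entry(rinfo, &ib_nl_request_list, list) {
		if (rinfo->seq == seq) {
			rinfo->timeout = jiffies +
				msecs_to_jiffies(sa_local_svc_timeout_ms);
			/* the new deadline is the latest; keep list sorted */
			list_move_tail(&rinfo->list, &ib_nl_request_list);
			ret = 0;
			break;
		}
	}
	spin_unlock_irqrestore(&ib_nl_request_lock, flags);

	return ret;
}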

Jason
diff mbox

Patch

diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index c38f030..69fcd28 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -45,12 +45,22 @@ 
 #include <uapi/linux/if_ether.h>
 #include <rdma/ib_pack.h>
 #include <rdma/ib_cache.h>
+#include <rdma/rdma_netlink.h>
+#include <net/netlink.h>
+#include <rdma/rdma_netlink.h>
 #include "sa.h"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand subnet administration query support");
 MODULE_LICENSE("Dual BSD/GPL");
 
+#define IB_SA_LOCAL_SVC_TIMEOUT_MIN		100
+#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT		2000
+static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT;
+
+module_param_named(local_svc_timeout_ms, sa_local_svc_timeout_ms, int, 0644);
+MODULE_PARM_DESC(local_svc_timeout_ms, "Local service timeout in millisecond");
+
 struct ib_sa_sm_ah {
 	struct ib_ah        *ah;
 	struct kref          ref;
@@ -80,8 +90,24 @@  struct ib_sa_query {
 	struct ib_mad_send_buf *mad_buf;
 	struct ib_sa_sm_ah     *sm_ah;
 	int			id;
+	u32			flags;
 };
 
+#define IB_SA_ENABLE_LOCAL_SERVICE	0x00000001
+#define IB_SA_CANCEL			0x00000002
+
+#define IB_SA_LOCAL_SVC_ENABLED(query) \
+	((query)->flags & IB_SA_ENABLE_LOCAL_SERVICE)
+#define IB_SA_ENABLE_LOCAL_SVC(query) \
+	((query)->flags |= IB_SA_ENABLE_LOCAL_SERVICE)
+#define IB_SA_DISABLE_LOCAL_SVC(query) \
+	((query)->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE)
+
+#define IB_SA_QUERY_CANCELLED(query) \
+	((query)->flags & IB_SA_CANCEL)
+#define IB_SA_CANCEL_QUERY(query) \
+	((query)->flags |= IB_SA_CANCEL)
+
 struct ib_sa_service_query {
 	void (*callback)(int, struct ib_sa_service_rec *, void *);
 	void *context;
@@ -106,6 +132,24 @@  struct ib_sa_mcmember_query {
 	struct ib_sa_query sa_query;
 };
 
+struct ib_nl_request_info {
+	struct list_head list;
+	u32 seq;
+	unsigned long timeout;
+	struct ib_sa_query *query;
+};
+
+struct rdma_nl_resp_msg {
+	struct nlmsghdr nl_hdr;
+	struct ib_sa_mad sa_mad;
+};
+
+static LIST_HEAD(ib_nl_request_list);
+static DEFINE_SPINLOCK(ib_nl_request_lock);
+static atomic_t ib_nl_sa_request_seq;
+static struct workqueue_struct *ib_nl_wq;
+static struct delayed_work ib_nl_timed_work;
+
 static void ib_sa_add_one(struct ib_device *device);
 static void ib_sa_remove_one(struct ib_device *device);
 
@@ -381,6 +425,244 @@  static const struct ib_field guidinfo_rec_table[] = {
 	  .size_bits    = 512 },
 };
 
+static int ib_nl_send_mad(void *mad, int len, u32 seq)
+{
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	void *data;
+	int ret = 0;
+
+	skb = nlmsg_new(len, GFP_KERNEL);
+	if (!skb) {
+		pr_err("alloc failed ret=%d\n", ret);
+		return -ENOMEM;
+	}
+
+	data = ibnl_put_msg(skb, &nlh, seq, len, RDMA_NL_MAD,
+			    RDMA_NL_MAD_REQUEST, GFP_KERNEL);
+	if (!data) {
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+	memcpy(data, mad, len);
+
+	ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_MAD, GFP_KERNEL);
+	if (!ret) {
+		ret = len;
+	} else {
+		if (ret != -ESRCH)
+			pr_err("ibnl_multicast failed l=%d, r=%d\n", len, ret);
+		ret = 0;
+	}
+	return ret;
+}
+
+static struct ib_nl_request_info *
+ib_nl_alloc_request(struct ib_sa_query *query)
+{
+	struct ib_nl_request_info *rinfo;
+
+	rinfo = kzalloc(sizeof(*rinfo), GFP_ATOMIC);
+	if (rinfo == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&rinfo->list);
+	rinfo->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq);
+	rinfo->query = query;
+
+	return rinfo;
+}
+
+static int ib_nl_send_request(struct ib_nl_request_info *rinfo)
+{
+	struct ib_mad_send_buf *send_buf;
+	unsigned long flags;
+	unsigned long delay;
+	int ret;
+
+	send_buf = rinfo->query->mad_buf;
+
+	delay = msecs_to_jiffies(sa_local_svc_timeout_ms);
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	ret = ib_nl_send_mad(send_buf->mad,
+			     (send_buf->data_len + send_buf->hdr_len),
+			     rinfo->seq);
+
+	if (ret != (send_buf->data_len + send_buf->hdr_len)) {
+		kfree(rinfo);
+		ret = -EIO;
+		goto request_out;
+	} else {
+		ret = 0;
+	}
+
+	rinfo->timeout = delay + jiffies;
+	list_add_tail(&rinfo->list, &ib_nl_request_list);
+	/* Start the timeout if this is the only request */
+	if (ib_nl_request_list.next == &rinfo->list)
+		queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+
+request_out:
+	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+	return ret;
+}
+
+static int ib_nl_make_request(struct ib_sa_query *query)
+{
+	struct ib_nl_request_info *rinfo;
+
+	rinfo = ib_nl_alloc_request(query);
+	if (IS_ERR(rinfo))
+		return -ENOMEM;
+
+	return ib_nl_send_request(rinfo);
+}
+
+static int ib_nl_cancel_request(struct ib_sa_query *query)
+{
+	unsigned long flags;
+	struct ib_nl_request_info *rinfo;
+	int found = 0;
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	list_for_each_entry(rinfo, &ib_nl_request_list, list) {
+		/* Let the timeout routine take care of the callback */
+		if (query == rinfo->query) {
+			IB_SA_CANCEL_QUERY(query);
+			rinfo->timeout = jiffies;
+			list_move(&rinfo->list, &ib_nl_request_list);
+			found = 1;
+			mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1);
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+	return found;
+}
+
+
+static int ib_nl_handle_mad_resp(struct sk_buff *skb,
+				 struct netlink_callback *cb);
+static struct ibnl_client_cbs ib_sa_cb_table[] = {
+	[RDMA_NL_MAD_REQUEST] = {
+		.dump = ib_nl_handle_mad_resp,
+		.module = THIS_MODULE },
+};
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc);
+
+static void ib_nl_process_good_rsp(struct ib_sa_query *query,
+				   struct ib_sa_mad *rsp)
+{
+	struct ib_mad_send_wc mad_send_wc;
+
+	if (query->callback)
+		query->callback(query, 0, rsp);
+
+	mad_send_wc.send_buf = query->mad_buf;
+	mad_send_wc.status = IB_WC_SUCCESS;
+	send_handler(query->mad_buf->mad_agent, &mad_send_wc);
+}
+
+static void ib_nl_request_timeout(struct work_struct *work)
+{
+	unsigned long flags;
+	struct ib_nl_request_info *rinfo;
+	struct ib_sa_query *query;
+	unsigned long delay;
+	struct ib_mad_send_wc mad_send_wc;
+	int ret;
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	while (!list_empty(&ib_nl_request_list)) {
+		rinfo = list_entry(ib_nl_request_list.next,
+				   struct ib_nl_request_info, list);
+
+		if (time_after(rinfo->timeout, jiffies)) {
+			delay = rinfo->timeout - jiffies;
+			if ((long)delay <= 0)
+				delay = 1;
+			queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+			break;
+		}
+
+		list_del(&rinfo->list);
+		query = rinfo->query;
+		IB_SA_DISABLE_LOCAL_SVC(query);
+		/* Hold the lock to protect against query cancellation */
+		if (IB_SA_QUERY_CANCELLED(query))
+			ret = -1;
+		else
+			ret = ib_post_send_mad(query->mad_buf, NULL);
+		if (ret) {
+			mad_send_wc.send_buf = query->mad_buf;
+			mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+			spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+			send_handler(query->port->agent, &mad_send_wc);
+			spin_lock_irqsave(&ib_nl_request_lock, flags);
+		}
+		kfree(rinfo);
+	}
+	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+}
+
+static int ib_nl_handle_mad_resp(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	struct rdma_nl_resp_msg *nl_msg = (struct rdma_nl_resp_msg *)cb->nlh;
+	unsigned long flags;
+	struct ib_nl_request_info *rinfo;
+	struct ib_sa_query *query;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_mad_send_wc mad_send_wc;
+	int found = 0;
+	int ret;
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	list_for_each_entry(rinfo, &ib_nl_request_list, list) {
+		/*
+		 * If the query is cancelled, let the timeout routine
+		 * take care of it.
+		 */
+		if (nl_msg->nl_hdr.nlmsg_seq == rinfo->seq) {
+			found = !IB_SA_QUERY_CANCELLED(rinfo->query);
+			if (found)
+				list_del(&rinfo->list);
+			break;
+		}
+	}
+
+	if (!found) {
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+		goto resp_out;
+	}
+
+	query = rinfo->query;
+	send_buf = query->mad_buf;
+
+	if (nl_msg->sa_mad.mad_hdr.status != 0) {
+		/* if the result is a failure, send out the packet via IB */
+		IB_SA_DISABLE_LOCAL_SVC(query);
+		ret = ib_post_send_mad(query->mad_buf, NULL);
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+		if (ret) {
+			mad_send_wc.send_buf = send_buf;
+			mad_send_wc.status = IB_WC_GENERAL_ERR;
+			send_handler(query->port->agent, &mad_send_wc);
+		}
+	} else {
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+		ib_nl_process_good_rsp(query, &nl_msg->sa_mad);
+	}
+
+	kfree(rinfo);
+resp_out:
+	return skb->len;
+}
+
 static void free_sm_ah(struct kref *kref)
 {
 	struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
@@ -502,7 +784,13 @@  void ib_sa_cancel_query(int id, struct ib_sa_query *query)
 	mad_buf = query->mad_buf;
 	spin_unlock_irqrestore(&idr_lock, flags);
 
-	ib_cancel_mad(agent, mad_buf);
+	/*
+	 * If the query is still on the netlink request list, schedule
+	 * it to be cancelled by the timeout routine. Otherwise, it has been
+	 * sent to the MAD layer and has to be cancelled from there.
+	 */
+	if (!ib_nl_cancel_request(query))
+		ib_cancel_mad(agent, mad_buf);
 }
 EXPORT_SYMBOL(ib_sa_cancel_query);
 
@@ -638,6 +926,14 @@  static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
 	query->mad_buf->context[0] = query;
 	query->id = id;
 
+	if (IB_SA_LOCAL_SVC_ENABLED(query)) {
+		if (!ibnl_chk_listeners(RDMA_NL_GROUP_MAD)) {
+			if (!ib_nl_make_request(query))
+				return id;
+		}
+		IB_SA_DISABLE_LOCAL_SVC(query);
+	}
+
 	ret = ib_post_send_mad(query->mad_buf, NULL);
 	if (ret) {
 		spin_lock_irqsave(&idr_lock, flags);
@@ -739,7 +1035,7 @@  int ib_sa_path_rec_get(struct ib_sa_client *client,
 	port  = &sa_dev->port[port_num - sa_dev->start_port];
 	agent = port->agent;
 
-	query = kmalloc(sizeof *query, gfp_mask);
+	query = kzalloc(sizeof(*query), gfp_mask);
 	if (!query)
 		return -ENOMEM;
 
@@ -766,6 +1062,8 @@  int ib_sa_path_rec_get(struct ib_sa_client *client,
 
 	*sa_query = &query->sa_query;
 
+	IB_SA_ENABLE_LOCAL_SVC(&query->sa_query);
+
 	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
 	if (ret < 0)
 		goto err2;
@@ -861,7 +1159,7 @@  int ib_sa_service_rec_query(struct ib_sa_client *client,
 	    method != IB_SA_METHOD_DELETE)
 		return -EINVAL;
 
-	query = kmalloc(sizeof *query, gfp_mask);
+	query = kzalloc(sizeof(*query), gfp_mask);
 	if (!query)
 		return -ENOMEM;
 
@@ -953,7 +1251,7 @@  int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
 	port  = &sa_dev->port[port_num - sa_dev->start_port];
 	agent = port->agent;
 
-	query = kmalloc(sizeof *query, gfp_mask);
+	query = kzalloc(sizeof(*query), gfp_mask);
 	if (!query)
 		return -ENOMEM;
 
@@ -1050,7 +1348,7 @@  int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
 	port  = &sa_dev->port[port_num - sa_dev->start_port];
 	agent = port->agent;
 
-	query = kmalloc(sizeof *query, gfp_mask);
+	query = kzalloc(sizeof(*query), gfp_mask);
 	if (!query)
 		return -ENOMEM;
 
@@ -1250,6 +1548,10 @@  static int __init ib_sa_init(void)
 
 	get_random_bytes(&tid, sizeof tid);
 
+	atomic_set(&ib_nl_sa_request_seq, 0);
+	sa_local_svc_timeout_ms = max(sa_local_svc_timeout_ms,
+				       IB_SA_LOCAL_SVC_TIMEOUT_MIN);
+
 	ret = ib_register_client(&sa_client);
 	if (ret) {
 		printk(KERN_ERR "Couldn't register ib_sa client\n");
@@ -1262,7 +1564,25 @@  static int __init ib_sa_init(void)
 		goto err2;
 	}
 
+	ib_nl_wq = create_singlethread_workqueue("ib_nl_sa_wq");
+	if (!ib_nl_wq) {
+		ret = -ENOMEM;
+		goto err3;
+	}
+
+	if (ibnl_add_client(RDMA_NL_MAD, RDMA_NL_MAD_NUM_OPS,
+			    ib_sa_cb_table)) {
+		pr_err("Failed to add netlink callback\n");
+		ret = -EINVAL;
+		goto err4;
+	}
+	INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout);
+
 	return 0;
+err4:
+	destroy_workqueue(ib_nl_wq);
+err3:
+	mcast_cleanup();
 err2:
 	ib_unregister_client(&sa_client);
 err1:
@@ -1271,6 +1591,10 @@  err1:
 
 static void __exit ib_sa_cleanup(void)
 {
+	ibnl_remove_client(RDMA_NL_MAD);
+	cancel_delayed_work(&ib_nl_timed_work);
+	flush_workqueue(ib_nl_wq);
+	destroy_workqueue(ib_nl_wq);
 	mcast_cleanup();
 	ib_unregister_client(&sa_client);
 	idr_destroy(&query_idr);