Message ID | 2fa2b6c93c4c16c8915bac3cfc4f27be1d60519d.1662631201.git.leonro@nvidia.com
---|---
State | Accepted |
Series | Support multiple path records
On Thu, Sep 08, 2022 at 01:09:01PM +0300, Leon Romanovsky wrote: > +static void route_set_path_rec_inbound(struct cma_work *work, > + struct sa_path_rec *path_rec) > +{ > + struct rdma_route *route = &work->id->id.route; > + > + if (!route->path_rec_inbound) { > + route->path_rec_inbound = > + kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); > + if (!route->path_rec_inbound) > + return; > + } > + > + *route->path_rec_inbound = *path_rec; > +} We are just ignoring these memory allocation failures?? > static void cma_query_handler(int status, struct sa_path_rec *path_rec, > - void *context) > + int num_prs, void *context) This param should be "unsigned int num_prs" > { > struct cma_work *work = context; > struct rdma_route *route; > + int i; > > route = &work->id->id.route; > > - if (!status) { > - route->num_pri_alt_paths = 1; > - *route->path_rec = *path_rec; > - } else { > - work->old_state = RDMA_CM_ROUTE_QUERY; > - work->new_state = RDMA_CM_ADDR_RESOLVED; > - work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; > - work->event.status = status; > - pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n", > - status); > + if (status) > + goto fail; > + > + for (i = 0; i < num_prs; i++) { > + if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP)) > + *route->path_rec = path_rec[i]; > + else if (path_rec[i].flags & IB_PATH_INBOUND) > + route_set_path_rec_inbound(work, &path_rec[i]); > + else if (path_rec[i].flags & IB_PATH_OUTBOUND) > + route_set_path_rec_outbound(work, &path_rec[i]); > + } > + if (!route->path_rec) { Why is this needed? The for loop above will have already oops'd. > +/** > + * ib_sa_pr_callback_multiple() - Parse path records then do callback. > + * > + * In a multiple-PR case the PRs are saved in "query->resp_pr_data" > + * (instead of"mad->data") and with "ib_path_rec_data" structure format, > + * so that rec->flags can be set to indicate the type of PR. > + * This is valid only in IB fabric. > + */ > +static void ib_sa_pr_callback_multiple(struct ib_sa_path_query *query, > + int status, int num_prs, > + struct ib_path_rec_data *rec_data) > +{ > + struct sa_path_rec *rec; > + int i; > + > + rec = kvcalloc(num_prs, sizeof(*rec), GFP_KERNEL); > + if (!rec) { > + query->callback(-ENOMEM, NULL, 0, query->context); > + return; > + } This all seems really wild, why are we allocating memory so many times on this path? Have ib_nl_process_good_resolve_rsp unpack the mad instead of storing the raw format It would also be good to make resp_pr_data always valid so all these special paths don't need to exist. > diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h > index 81916039ee24..cdc7cafab572 100644 > --- a/include/rdma/rdma_cm.h > +++ b/include/rdma/rdma_cm.h > @@ -49,9 +49,15 @@ struct rdma_addr { > struct rdma_dev_addr dev_addr; > }; > > +#define RDMA_PRIMARY_PATH_MAX_REC_NUM 3 This is a strange place for this define, it should be in sa_query.c? Jason
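One possible shape for the allocation-failure and "unsigned int num_prs" comments above (a sketch only, built from the hunks quoted in this review rather than from any actual respin, and assuming route_set_path_rec_outbound() is converted the same way) is to let the helper return an error and reuse the handler's fail path:

static int route_set_path_rec_inbound(struct cma_work *work,
                                      struct sa_path_rec *path_rec)
{
        struct rdma_route *route = &work->id->id.route;

        if (!route->path_rec_inbound) {
                route->path_rec_inbound =
                        kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL);
                if (!route->path_rec_inbound)
                        return -ENOMEM;
        }

        *route->path_rec_inbound = *path_rec;
        return 0;
}

static void cma_query_handler(int status, struct sa_path_rec *path_rec,
                              unsigned int num_prs, void *context)
{
        struct cma_work *work = context;
        struct rdma_route *route = &work->id->id.route;
        unsigned int i;

        if (status)
                goto fail;

        for (i = 0; i < num_prs; i++) {
                if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP))
                        *route->path_rec = path_rec[i];
                else if (path_rec[i].flags & IB_PATH_INBOUND)
                        status = route_set_path_rec_inbound(work, &path_rec[i]);
                else if (path_rec[i].flags & IB_PATH_OUTBOUND)
                        status = route_set_path_rec_outbound(work, &path_rec[i]);
                if (status)
                        goto fail;
        }

        route->num_pri_alt_paths = 1;
        queue_work(cma_wq, &work->work);
        return;

fail:
        work->old_state = RDMA_CM_ROUTE_QUERY;
        work->new_state = RDMA_CM_ADDR_RESOLVED;
        work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
        work->event.status = status;
        queue_work(cma_wq, &work->work);
}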
On 9/22/2022 9:58 PM, Jason Gunthorpe wrote: > On Thu, Sep 08, 2022 at 01:09:01PM +0300, Leon Romanovsky wrote: > >> +static void route_set_path_rec_inbound(struct cma_work *work, >> + struct sa_path_rec *path_rec) >> +{ >> + struct rdma_route *route = &work->id->id.route; >> + >> + if (!route->path_rec_inbound) { >> + route->path_rec_inbound = >> + kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); >> + if (!route->path_rec_inbound) >> + return; >> + } >> + >> + *route->path_rec_inbound = *path_rec; >> +} > > We are just ignoring these memory allocation failures?? > Inside "if" statement if kzalloc fails here then we don't set route->path_rec_inbound or outbound; >> static void cma_query_handler(int status, struct sa_path_rec *path_rec, >> - void *context) >> + int num_prs, void *context) > > This param should be "unsigned int num_prs" Ack > >> { >> struct cma_work *work = context; >> struct rdma_route *route; >> + int i; >> >> route = &work->id->id.route; >> >> - if (!status) { >> - route->num_pri_alt_paths = 1; >> - *route->path_rec = *path_rec; >> - } else { >> - work->old_state = RDMA_CM_ROUTE_QUERY; >> - work->new_state = RDMA_CM_ADDR_RESOLVED; >> - work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; >> - work->event.status = status; >> - pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n", >> - status); >> + if (status) >> + goto fail; >> + >> + for (i = 0; i < num_prs; i++) { >> + if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP)) >> + *route->path_rec = path_rec[i]; >> + else if (path_rec[i].flags & IB_PATH_INBOUND) >> + route_set_path_rec_inbound(work, &path_rec[i]); >> + else if (path_rec[i].flags & IB_PATH_OUTBOUND) >> + route_set_path_rec_outbound(work, &path_rec[i]); >> + } >> + if (!route->path_rec) { > > Why is this needed? The for loop above will have already oops'd. Right, this "if" is no needed. We don't need to check if route->path_rec is valid in this function because it is allocated in cma_resolve_ib_route() > >> +/** >> + * ib_sa_pr_callback_multiple() - Parse path records then do callback. >> + * >> + * In a multiple-PR case the PRs are saved in "query->resp_pr_data" >> + * (instead of"mad->data") and with "ib_path_rec_data" structure format, >> + * so that rec->flags can be set to indicate the type of PR. >> + * This is valid only in IB fabric. >> + */ >> +static void ib_sa_pr_callback_multiple(struct ib_sa_path_query *query, >> + int status, int num_prs, >> + struct ib_path_rec_data *rec_data) >> +{ >> + struct sa_path_rec *rec; >> + int i; >> + >> + rec = kvcalloc(num_prs, sizeof(*rec), GFP_KERNEL); >> + if (!rec) { >> + query->callback(-ENOMEM, NULL, 0, query->context); >> + return; >> + } > > This all seems really wild, why are we allocating memory so many times > on this path? Have ib_nl_process_good_resolve_rsp unpack the mad > instead of storing the raw format > > It would also be good to make resp_pr_data always valid so all these > special paths don't need to exist. The ib_sa_pr_callback_single() uses stack variable "rec" but ib_sa_pr_callback_multiple() uses malloc because there are multiple PRs. ib_sa_path_rec_callback is also used by ib_post_send_mad(), which always have single PR and saved in mad->data, so always set resp_pr_data in netlink case is not enough. 
> >> diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h >> index 81916039ee24..cdc7cafab572 100644 >> --- a/include/rdma/rdma_cm.h >> +++ b/include/rdma/rdma_cm.h >> @@ -49,9 +49,15 @@ struct rdma_addr { >> struct rdma_dev_addr dev_addr; >> }; >> >> +#define RDMA_PRIMARY_PATH_MAX_REC_NUM 3 > > This is a strange place for this define, it should be in sa_query.c? That's because path_rec, path_rec_inbound and path_rec_outbound are defined here, but yes, it is only used in sa_query.c, so maybe it's better to move it there. Thanks Jason.
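For context on the "ib_path_rec_data" structure format mentioned above, this is the existing uapi layout and flag set used by the RDMA netlink path resolver (from include/uapi/rdma/ib_user_sa.h, reproduced here from memory, so verify against the tree). The per-record flags word is what lets a response carry GMP, inbound and outbound records side by side, which the packed mad->data layout cannot express:

enum {
        IB_PATH_GMP              = 1,
        IB_PATH_PRIMARY          = (1 << 1),
        IB_PATH_ALTERNATE        = (1 << 2),
        IB_PATH_OUTBOUND         = (1 << 3),
        IB_PATH_INBOUND          = (1 << 4),
        IB_PATH_INBOUND_REVERSE  = (1 << 5),
        IB_PATH_BIDIRECTIONAL    = IB_PATH_OUTBOUND | IB_PATH_INBOUND_REVERSE,
};

struct ib_path_rec_data {
        __u32   flags;          /* IB_PATH_* bits identifying the record's role */
        __u32   reserved;
        __u32   path_rec[16];   /* 64-byte packed IB PathRecord */
};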
On Fri, Sep 23, 2022 at 09:40:22AM +0800, Mark Zhang wrote: > On 9/22/2022 9:58 PM, Jason Gunthorpe wrote: > > On Thu, Sep 08, 2022 at 01:09:01PM +0300, Leon Romanovsky wrote: > > > > > +static void route_set_path_rec_inbound(struct cma_work *work, > > > + struct sa_path_rec *path_rec) > > > +{ > > > + struct rdma_route *route = &work->id->id.route; > > > + > > > + if (!route->path_rec_inbound) { > > > + route->path_rec_inbound = > > > + kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); > > > + if (!route->path_rec_inbound) > > > + return; > > > + } > > > + > > > + *route->path_rec_inbound = *path_rec; > > > +} > > > > We are just ignoring these memory allocation failures?? > > > Inside "if" statement if kzalloc fails here then we don't set > route->path_rec_inbound or outbound; But why don't we propogate a ENOMEM failure code? > > > +static void ib_sa_pr_callback_multiple(struct ib_sa_path_query *query, > > > + int status, int num_prs, > > > + struct ib_path_rec_data *rec_data) > > > +{ > > > + struct sa_path_rec *rec; > > > + int i; > > > + > > > + rec = kvcalloc(num_prs, sizeof(*rec), GFP_KERNEL); > > > + if (!rec) { > > > + query->callback(-ENOMEM, NULL, 0, query->context); > > > + return; > > > + } > > > > This all seems really wild, why are we allocating memory so many times > > on this path? Have ib_nl_process_good_resolve_rsp unpack the mad > > instead of storing the raw format > > > > It would also be good to make resp_pr_data always valid so all these > > special paths don't need to exist. > > The ib_sa_pr_callback_single() uses stack variable "rec" but > ib_sa_pr_callback_multiple() uses malloc because there are multiple PRs. > > ib_sa_path_rec_callback is also used by ib_post_send_mad(), which always > have single PR and saved in mad->data, so always set resp_pr_data in netlink > case is not enough. We should always be able to point resp_pr_data to some kind of storage, even if it is stack storage. Jason
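As a rough illustration of the stack-storage idea (hypothetical code, not taken from the series): the single-PR MAD completion path could describe its one packed record with a one-element ib_path_rec_data on its own stack and point query->resp_pr_data at it for the duration of the callback, so the callback side never needs a mad->data special case:

        /* hypothetical: single-PR completion, record still packed in mad->data */
        struct ib_path_rec_data one = {
                .flags = IB_PATH_GMP | IB_PATH_PRIMARY, /* assumed role of the lone record */
        };

        memcpy(one.path_rec, mad->data, sizeof(one.path_rec));
        query->resp_pr_data = &one;     /* stack storage, valid only across the callback */
        query->callback(query, status, 1, mad);
        query->resp_pr_data = NULL;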
On 9/23/2022 9:13 PM, Jason Gunthorpe wrote: > On Fri, Sep 23, 2022 at 09:40:22AM +0800, Mark Zhang wrote: >> On 9/22/2022 9:58 PM, Jason Gunthorpe wrote: >>> On Thu, Sep 08, 2022 at 01:09:01PM +0300, Leon Romanovsky wrote: >>> >>>> +static void route_set_path_rec_inbound(struct cma_work *work, >>>> + struct sa_path_rec *path_rec) >>>> +{ >>>> + struct rdma_route *route = &work->id->id.route; >>>> + >>>> + if (!route->path_rec_inbound) { >>>> + route->path_rec_inbound = >>>> + kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); >>>> + if (!route->path_rec_inbound) >>>> + return; >>>> + } >>>> + >>>> + *route->path_rec_inbound = *path_rec; >>>> +} >>> >>> We are just ignoring these memory allocation failures?? >>> >> Inside "if" statement if kzalloc fails here then we don't set >> route->path_rec_inbound or outbound; > > But why don't we propogate a ENOMEM failure code? Because inbound/outbound PRs are optional, so even they are provided they can still be ignored if cma is not able to set them (e.g. memory allocation failure in this case). >>>> +static void ib_sa_pr_callback_multiple(struct ib_sa_path_query *query, >>>> + int status, int num_prs, >>>> + struct ib_path_rec_data *rec_data) >>>> +{ >>>> + struct sa_path_rec *rec; >>>> + int i; >>>> + >>>> + rec = kvcalloc(num_prs, sizeof(*rec), GFP_KERNEL); >>>> + if (!rec) { >>>> + query->callback(-ENOMEM, NULL, 0, query->context); >>>> + return; >>>> + } >>> >>> This all seems really wild, why are we allocating memory so many times >>> on this path? Have ib_nl_process_good_resolve_rsp unpack the mad >>> instead of storing the raw format >>> >>> It would also be good to make resp_pr_data always valid so all these >>> special paths don't need to exist. >> >> The ib_sa_pr_callback_single() uses stack variable "rec" but >> ib_sa_pr_callback_multiple() uses malloc because there are multiple PRs. >> >> ib_sa_path_rec_callback is also used by ib_post_send_mad(), which always >> have single PR and saved in mad->data, so always set resp_pr_data in netlink >> case is not enough. > > We should always be able to point resp_pr_data to some kind of > storage, even if it is stack storage. The idea is: - Single PR: PR in mad->data; Used by both netlink and ib_post_send_mad(); - Multiple PRs: PRs in resp_pr_data, with "ib_path_rec_data" structure format; Currently used by netlink only. So if we want to always use resp_pr_data then in single-PR case we need to copy mad->data to resp_pr_data in both netlink and ib_post_send_mad(), and turn it into "ib_path_rec_data" structure format. This adds complexity for single-PR, which should be most of situations? Use malloc instead of stack for resp_pr_data and multiple-PR unpack is because sizeof(sa_path_rec)=72B, now we supports 3 and there might be more in future..
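Seen from a consumer of ib_sa_path_rec_get(), both cases look the same through the new callback signature. A hypothetical callback (name and body invented purely for illustration, not part of the series) would simply walk the returned array and key off the new flags field:

static void example_path_rec_cb(int status, struct sa_path_rec *resp,
                                int num_paths, void *context)
{
        int i;

        if (status)
                return;         /* resp may be NULL on error */

        for (i = 0; i < num_paths; i++) {
                if (!resp[i].flags || (resp[i].flags & IB_PATH_GMP)) {
                        /* the usual bidirectional record; the common single-PR case */
                } else if (resp[i].flags & IB_PATH_INBOUND) {
                        /* optional inbound-only record */
                } else if (resp[i].flags & IB_PATH_OUTBOUND) {
                        /* optional outbound-only record */
                }
        }
}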
On 9/23/2022 10:24 PM, Mark Zhang wrote: > External email: Use caution opening links or attachments > > > On 9/23/2022 9:13 PM, Jason Gunthorpe wrote: >> On Fri, Sep 23, 2022 at 09:40:22AM +0800, Mark Zhang wrote: >>> On 9/22/2022 9:58 PM, Jason Gunthorpe wrote: >>>> On Thu, Sep 08, 2022 at 01:09:01PM +0300, Leon Romanovsky wrote: >>>> >>>>> +static void route_set_path_rec_inbound(struct cma_work *work, >>>>> + struct sa_path_rec *path_rec) >>>>> +{ >>>>> + struct rdma_route *route = &work->id->id.route; >>>>> + >>>>> + if (!route->path_rec_inbound) { >>>>> + route->path_rec_inbound = >>>>> + kzalloc(sizeof(*route->path_rec_inbound), >>>>> GFP_KERNEL); >>>>> + if (!route->path_rec_inbound) >>>>> + return; >>>>> + } >>>>> + >>>>> + *route->path_rec_inbound = *path_rec; >>>>> +} >>>> >>>> We are just ignoring these memory allocation failures?? >>>> >>> Inside "if" statement if kzalloc fails here then we don't set >>> route->path_rec_inbound or outbound; >> >> But why don't we propogate a ENOMEM failure code? > > Because inbound/outbound PRs are optional, so even they are provided > they can still be ignored if cma is not able to set them (e.g. memory > allocation failure in this case). > >>>>> +static void ib_sa_pr_callback_multiple(struct ib_sa_path_query >>>>> *query, >>>>> + int status, int num_prs, >>>>> + struct ib_path_rec_data *rec_data) >>>>> +{ >>>>> + struct sa_path_rec *rec; >>>>> + int i; >>>>> + >>>>> + rec = kvcalloc(num_prs, sizeof(*rec), GFP_KERNEL); >>>>> + if (!rec) { >>>>> + query->callback(-ENOMEM, NULL, 0, query->context); >>>>> + return; >>>>> + } >>>> >>>> This all seems really wild, why are we allocating memory so many times >>>> on this path? Have ib_nl_process_good_resolve_rsp unpack the mad >>>> instead of storing the raw format >>>> >>>> It would also be good to make resp_pr_data always valid so all these >>>> special paths don't need to exist. >>> >>> The ib_sa_pr_callback_single() uses stack variable "rec" but >>> ib_sa_pr_callback_multiple() uses malloc because there are multiple PRs. >>> >>> ib_sa_path_rec_callback is also used by ib_post_send_mad(), which always >>> have single PR and saved in mad->data, so always set resp_pr_data in >>> netlink >>> case is not enough. >> >> We should always be able to point resp_pr_data to some kind of >> storage, even if it is stack storage. > > The idea is: > - Single PR: PR in mad->data; Used by both netlink and > ib_post_send_mad(); > - Multiple PRs: PRs in resp_pr_data, with "ib_path_rec_data" structure > format; Currently used by netlink only. > So if we want to always use resp_pr_data then in single-PR case we need > to copy mad->data to resp_pr_data in both netlink and > ib_post_send_mad(), and turn it into "ib_path_rec_data" structure > format. This adds complexity for single-PR, which should be most of > situations? Sorry what I mean is resp_pr_data is used by netlink while mad->data is used by post_send_mad(), so if we want to always use resp_pr_data then in post_send_mad() we need to do malloc, copy and transfer. Besides if malloc failures then we still need to use mad->data, unless we always use stack no matter how many PRs there are. > Use malloc instead of stack for resp_pr_data and multiple-PR unpack is > because sizeof(sa_path_rec)=72B, now we supports 3 and there might be > more in future.. >
On Fri, Sep 23, 2022 at 10:24:35PM +0800, Mark Zhang wrote: > On 9/23/2022 9:13 PM, Jason Gunthorpe wrote: > > On Fri, Sep 23, 2022 at 09:40:22AM +0800, Mark Zhang wrote: > > > On 9/22/2022 9:58 PM, Jason Gunthorpe wrote: > > > > On Thu, Sep 08, 2022 at 01:09:01PM +0300, Leon Romanovsky wrote: > > > > > > > > > +static void route_set_path_rec_inbound(struct cma_work *work, > > > > > + struct sa_path_rec *path_rec) > > > > > +{ > > > > > + struct rdma_route *route = &work->id->id.route; > > > > > + > > > > > + if (!route->path_rec_inbound) { > > > > > + route->path_rec_inbound = > > > > > + kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); > > > > > + if (!route->path_rec_inbound) > > > > > + return; > > > > > + } > > > > > + > > > > > + *route->path_rec_inbound = *path_rec; > > > > > +} > > > > > > > > We are just ignoring these memory allocation failures?? > > > > > > > Inside "if" statement if kzalloc fails here then we don't set > > > route->path_rec_inbound or outbound; > > > > But why don't we propogate a ENOMEM failure code? > > Because inbound/outbound PRs are optional, so even they are provided they > can still be ignored if cma is not able to set them (e.g. memory allocation > failure in this case). This isn't how we do things, the netlink operation had a failure, the failure should propogate. If we've run out of memory the CM connection is going to blow up anyhow very quickly. > > We should always be able to point resp_pr_data to some kind of > > storage, even if it is stack storage. > > The idea is: > - Single PR: PR in mad->data; Used by both netlink and > ib_post_send_mad(); > - Multiple PRs: PRs in resp_pr_data, with "ib_path_rec_data" structure > format; Currently used by netlink only. > > So if we want to always use resp_pr_data then in single-PR case we need to > copy mad->data to resp_pr_data in both netlink and ib_post_send_mad(), and > turn it into "ib_path_rec_data" structure format. This adds complexity for > single-PR, which should be most of situations? You don't copy it, you unpack it. We already have to unpack it, just to a (large!) stack var - unpack it to resp_pr_data instead. Jason
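A sketch of the "unpack it to resp_pr_data instead" direction, using only helpers already visible in this patch (ib_unpack(), path_rec_table, the LS_NLA_TYPE_PATH_RECORD attribute walk and the function's existing locals) but not taken from any actual respin; ib_nl_process_good_resolve_rsp() would produce ready-to-use sa_path_rec entries so the callback path needs no raw-format handling at all:

        /* sketch: unpack each matching netlink record straight into resp_pr_data */
        struct sa_path_rec *prs = query->resp_pr_data;  /* RDMA_PRIMARY_PATH_MAX_REC_NUM entries */
        struct ib_path_rec_data *nl_rec;
        unsigned int num_prs = 0;

        nla_for_each_attr(curr, head, len, rem) {
                if (curr->nla_type != LS_NLA_TYPE_PATH_RECORD)
                        continue;

                nl_rec = nla_data(curr);
                if ((nl_rec->flags & mask) != mask)
                        continue;

                ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
                          nl_rec->path_rec, &prs[num_prs]);
                prs[num_prs].rec_type = SA_PATH_REC_TYPE_IB;
                sa_path_set_dmac_zero(&prs[num_prs]);
                prs[num_prs].flags = nl_rec->flags;
                if (++num_prs >= RDMA_PRIMARY_PATH_MAX_REC_NUM)
                        break;
        }
        /* the callback can now consume prs[0..num_prs-1] directly */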
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 91e72a76d95e..a3efc462305d 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2026,6 +2026,8 @@ static void _destroy_id(struct rdma_id_private *id_priv, cma_id_put(id_priv->id.context); kfree(id_priv->id.route.path_rec); + kfree(id_priv->id.route.path_rec_inbound); + kfree(id_priv->id.route.path_rec_outbound); put_net(id_priv->id.route.addr.dev_addr.net); kfree(id_priv); @@ -2817,26 +2819,72 @@ int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer) } EXPORT_SYMBOL(rdma_set_min_rnr_timer); +static void route_set_path_rec_inbound(struct cma_work *work, + struct sa_path_rec *path_rec) +{ + struct rdma_route *route = &work->id->id.route; + + if (!route->path_rec_inbound) { + route->path_rec_inbound = + kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); + if (!route->path_rec_inbound) + return; + } + + *route->path_rec_inbound = *path_rec; +} + +static void route_set_path_rec_outbound(struct cma_work *work, + struct sa_path_rec *path_rec) +{ + struct rdma_route *route = &work->id->id.route; + + if (!route->path_rec_outbound) { + route->path_rec_outbound = + kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL); + if (!route->path_rec_outbound) + return; + } + + *route->path_rec_outbound = *path_rec; +} + static void cma_query_handler(int status, struct sa_path_rec *path_rec, - void *context) + int num_prs, void *context) { struct cma_work *work = context; struct rdma_route *route; + int i; route = &work->id->id.route; - if (!status) { - route->num_pri_alt_paths = 1; - *route->path_rec = *path_rec; - } else { - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ADDR_RESOLVED; - work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; - work->event.status = status; - pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n", - status); + if (status) + goto fail; + + for (i = 0; i < num_prs; i++) { + if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP)) + *route->path_rec = path_rec[i]; + else if (path_rec[i].flags & IB_PATH_INBOUND) + route_set_path_rec_inbound(work, &path_rec[i]); + else if (path_rec[i].flags & IB_PATH_OUTBOUND) + route_set_path_rec_outbound(work, &path_rec[i]); + } + if (!route->path_rec) { + status = -EINVAL; + goto fail; } + route->num_pri_alt_paths = 1; + queue_work(cma_wq, &work->work); + return; + +fail: + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; + work->event.status = status; + pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. 
status %d\n", + status); queue_work(cma_wq, &work->work); } diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 003e504feca2..0de83d9a4985 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -50,6 +50,7 @@ #include <rdma/ib_marshall.h> #include <rdma/ib_addr.h> #include <rdma/opa_addr.h> +#include <rdma/rdma_cm.h> #include "sa.h" #include "core_priv.h" @@ -104,7 +105,8 @@ struct ib_sa_device { }; struct ib_sa_query { - void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *); + void (*callback)(struct ib_sa_query *sa_query, int status, + int num_prs, struct ib_sa_mad *mad); void (*release)(struct ib_sa_query *); struct ib_sa_client *client; struct ib_sa_port *port; @@ -116,6 +118,12 @@ struct ib_sa_query { u32 seq; /* Local svc request sequence number */ unsigned long timeout; /* Local svc timeout */ u8 path_use; /* How will the pathrecord be used */ + + /* A separate buffer to save pathrecords of a response, as in cases + * like IB/netlink, mulptiple pathrecords are supported, so that + * mad->data is not large enough to hold them + */ + void *resp_pr_data; }; #define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001 @@ -123,7 +131,8 @@ struct ib_sa_query { #define IB_SA_QUERY_OPA 0x00000004 struct ib_sa_path_query { - void (*callback)(int, struct sa_path_rec *, void *); + void (*callback)(int status, struct sa_path_rec *rec, + int num_paths, void *context); void *context; struct ib_sa_query sa_query; struct sa_path_rec *conv_pr; @@ -712,7 +721,7 @@ static void ib_nl_set_path_rec_attrs(struct sk_buff *skb, if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) && sa_rec->reversible != 0) - query->path_use = LS_RESOLVE_PATH_USE_GMP; + query->path_use = LS_RESOLVE_PATH_USE_ALL; else query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL; header->path_use = query->path_use; @@ -865,50 +874,81 @@ static void send_handler(struct ib_mad_agent *agent, static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query, const struct nlmsghdr *nlh) { + struct ib_path_rec_data *srec, *drec; + struct ib_sa_path_query *path_query; struct ib_mad_send_wc mad_send_wc; - struct ib_sa_mad *mad = NULL; const struct nlattr *head, *curr; - struct ib_path_rec_data *rec; - int len, rem; + struct ib_sa_mad *mad = NULL; + int len, rem, num_prs = 0; u32 mask = 0; int status = -EIO; - if (query->callback) { - head = (const struct nlattr *) nlmsg_data(nlh); - len = nlmsg_len(nlh); - switch (query->path_use) { - case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL: - mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND; - break; + if (!query->callback) + goto out; - case LS_RESOLVE_PATH_USE_ALL: - case LS_RESOLVE_PATH_USE_GMP: - default: - mask = IB_PATH_PRIMARY | IB_PATH_GMP | - IB_PATH_BIDIRECTIONAL; - break; + path_query = container_of(query, struct ib_sa_path_query, sa_query); + mad = query->mad_buf->mad; + if (!path_query->conv_pr && + (be16_to_cpu(mad->mad_hdr.attr_id) == IB_SA_ATTR_PATH_REC)) { + /* Need a larger buffer for possible multiple PRs */ + query->resp_pr_data = kvcalloc(RDMA_PRIMARY_PATH_MAX_REC_NUM, + sizeof(*drec), GFP_KERNEL); + if (!query->resp_pr_data) { + query->callback(query, -ENOMEM, 0, NULL); + return; } - nla_for_each_attr(curr, head, len, rem) { - if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) { - rec = nla_data(curr); - /* - * Get the first one. In the future, we may - * need to get up to 6 pathrecords. 
- */ - if ((rec->flags & mask) == mask) { - mad = query->mad_buf->mad; - mad->mad_hdr.method |= - IB_MGMT_METHOD_RESP; - memcpy(mad->data, rec->path_rec, - sizeof(rec->path_rec)); - status = 0; - break; - } - } + } + + head = (const struct nlattr *) nlmsg_data(nlh); + len = nlmsg_len(nlh); + switch (query->path_use) { + case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL: + mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND; + break; + + case LS_RESOLVE_PATH_USE_ALL: + mask = IB_PATH_PRIMARY; + break; + + case LS_RESOLVE_PATH_USE_GMP: + default: + mask = IB_PATH_PRIMARY | IB_PATH_GMP | + IB_PATH_BIDIRECTIONAL; + break; + } + + drec = (struct ib_path_rec_data *)query->resp_pr_data; + nla_for_each_attr(curr, head, len, rem) { + if (curr->nla_type != LS_NLA_TYPE_PATH_RECORD) + continue; + + srec = nla_data(curr); + if ((srec->flags & mask) != mask) + continue; + + status = 0; + if (!drec) { + memcpy(mad->data, srec->path_rec, + sizeof(srec->path_rec)); + num_prs = 1; + break; } - query->callback(query, status, mad); + + memcpy(drec, srec, sizeof(*drec)); + drec++; + num_prs++; + if (num_prs >= RDMA_PRIMARY_PATH_MAX_REC_NUM) + break; } + if (!status) + mad->mad_hdr.method |= IB_MGMT_METHOD_RESP; + + query->callback(query, status, num_prs, mad); + kvfree(query->resp_pr_data); + query->resp_pr_data = NULL; + +out: mad_send_wc.send_buf = query->mad_buf; mad_send_wc.status = IB_WC_SUCCESS; send_handler(query->mad_buf->mad_agent, &mad_send_wc); @@ -1411,41 +1451,90 @@ static int opa_pr_query_possible(struct ib_sa_client *client, return PR_IB_SUPPORTED; } +static void ib_sa_pr_callback_single(struct ib_sa_path_query *query, + int status, struct ib_sa_mad *mad) +{ + struct sa_path_rec rec = {}; + + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), + mad->data, &rec); + rec.rec_type = SA_PATH_REC_TYPE_IB; + sa_path_set_dmac_zero(&rec); + + if (query->conv_pr) { + struct sa_path_rec opa; + + memset(&opa, 0, sizeof(struct sa_path_rec)); + sa_convert_path_ib_to_opa(&opa, &rec); + query->callback(status, &opa, 1, query->context); + } else { + query->callback(status, &rec, 1, query->context); + } +} + +/** + * ib_sa_pr_callback_multiple() - Parse path records then do callback. + * + * In a multiple-PR case the PRs are saved in "query->resp_pr_data" + * (instead of"mad->data") and with "ib_path_rec_data" structure format, + * so that rec->flags can be set to indicate the type of PR. + * This is valid only in IB fabric. 
+ */ +static void ib_sa_pr_callback_multiple(struct ib_sa_path_query *query, + int status, int num_prs, + struct ib_path_rec_data *rec_data) +{ + struct sa_path_rec *rec; + int i; + + rec = kvcalloc(num_prs, sizeof(*rec), GFP_KERNEL); + if (!rec) { + query->callback(-ENOMEM, NULL, 0, query->context); + return; + } + + for (i = 0; i < num_prs; i++) { + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), + rec_data[i].path_rec, rec + i); + rec[i].rec_type = SA_PATH_REC_TYPE_IB; + sa_path_set_dmac_zero(rec + i); + rec[i].flags = rec_data[i].flags; + } + + query->callback(status, rec, num_prs, query->context); + kvfree(rec); +} + static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, - int status, + int status, int num_prs, struct ib_sa_mad *mad) { struct ib_sa_path_query *query = container_of(sa_query, struct ib_sa_path_query, sa_query); + struct sa_path_rec rec; - if (mad) { - struct sa_path_rec rec; - - if (sa_query->flags & IB_SA_QUERY_OPA) { - ib_unpack(opa_path_rec_table, - ARRAY_SIZE(opa_path_rec_table), - mad->data, &rec); - rec.rec_type = SA_PATH_REC_TYPE_OPA; - query->callback(status, &rec, query->context); - } else { - ib_unpack(path_rec_table, - ARRAY_SIZE(path_rec_table), - mad->data, &rec); - rec.rec_type = SA_PATH_REC_TYPE_IB; - sa_path_set_dmac_zero(&rec); - - if (query->conv_pr) { - struct sa_path_rec opa; + if (!mad || !num_prs) { + query->callback(status, NULL, 0, query->context); + return; + } - memset(&opa, 0, sizeof(struct sa_path_rec)); - sa_convert_path_ib_to_opa(&opa, &rec); - query->callback(status, &opa, query->context); - } else { - query->callback(status, &rec, query->context); - } + if (sa_query->flags & IB_SA_QUERY_OPA) { + if (num_prs != 1) { + query->callback(-EINVAL, NULL, 0, query->context); + return; } - } else - query->callback(status, NULL, query->context); + + ib_unpack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table), + mad->data, &rec); + rec.rec_type = SA_PATH_REC_TYPE_OPA; + query->callback(status, &rec, num_prs, query->context); + } else { + if (!sa_query->resp_pr_data) + ib_sa_pr_callback_single(query, status, mad); + else + ib_sa_pr_callback_multiple(query, status, num_prs, + sa_query->resp_pr_data); + } } static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) @@ -1489,7 +1578,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_path_rec *resp, - void *context), + int num_paths, void *context), void *context, struct ib_sa_query **sa_query) { @@ -1588,7 +1677,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, EXPORT_SYMBOL(ib_sa_path_rec_get); static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query, - int status, + int status, int num_prs, struct ib_sa_mad *mad) { struct ib_sa_mcmember_query *query = @@ -1680,7 +1769,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client, /* Support GuidInfoRecord */ static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query, - int status, + int status, int num_paths, struct ib_sa_mad *mad) { struct ib_sa_guidinfo_query *query = @@ -1790,7 +1879,7 @@ static void ib_classportinfo_cb(void *context) } static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, - int status, + int status, int num_prs, struct ib_sa_mad *mad) { unsigned long flags; @@ -1966,13 +2055,13 @@ static void send_handler(struct ib_mad_agent *agent, /* No callback -- already got recv */ break; case IB_WC_RESP_TIMEOUT_ERR: - query->callback(query, -ETIMEDOUT, NULL); + 
query->callback(query, -ETIMEDOUT, 0, NULL); break; case IB_WC_WR_FLUSH_ERR: - query->callback(query, -EINTR, NULL); + query->callback(query, -EINTR, 0, NULL); break; default: - query->callback(query, -EIO, NULL); + query->callback(query, -EIO, 0, NULL); break; } @@ -2000,10 +2089,10 @@ static void recv_handler(struct ib_mad_agent *mad_agent, if (mad_recv_wc->wc->status == IB_WC_SUCCESS) query->callback(query, mad_recv_wc->recv_buf.mad->mad_hdr.status ? - -EINVAL : 0, + -EINVAL : 0, 1, (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad); else - query->callback(query, -EIO, NULL); + query->callback(query, -EIO, 0, NULL); } ib_free_recv_mad(mad_recv_wc); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index a4904371e2db..ac25fc80fb33 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -742,7 +742,7 @@ void ipoib_flush_paths(struct net_device *dev) static void path_rec_completion(int status, struct sa_path_rec *pathrec, - void *path_ptr) + int num_prs, void *path_ptr) { struct ipoib_path *path = path_ptr; struct net_device *dev = path->dev; diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 1e777b2043d6..e5ec5444b662 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -699,7 +699,7 @@ static void srp_free_ch_ib(struct srp_target_port *target, static void srp_path_rec_completion(int status, struct sa_path_rec *pathrec, - void *ch_ptr) + int num_paths, void *ch_ptr) { struct srp_rdma_ch *ch = ch_ptr; struct srp_target_port *target = ch->target; diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index 3634d4cc7a56..e930bec33b31 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -186,6 +186,7 @@ struct sa_path_rec { struct sa_path_rec_opa opa; }; enum sa_path_rec_type rec_type; + u32 flags; }; static inline enum ib_gid_type @@ -413,7 +414,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_path_rec *resp, - void *context), + int num_prs, void *context), void *context, struct ib_sa_query **query); struct ib_sa_multicast { diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 81916039ee24..cdc7cafab572 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -49,9 +49,15 @@ struct rdma_addr { struct rdma_dev_addr dev_addr; }; +#define RDMA_PRIMARY_PATH_MAX_REC_NUM 3 struct rdma_route { struct rdma_addr addr; struct sa_path_rec *path_rec; + + /* Optional path records of primary path */ + struct sa_path_rec *path_rec_inbound; + struct sa_path_rec *path_rec_outbound; + /* * 0 - No primary nor alternate path is available * 1 - Only primary path is available