Message ID | 20240604194522.10390-8-cel@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [1/5] xprtrdma: Fix rpcrdma_reqs_reset() | expand |
CC Dan.

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>

On 04/06/2024 22:45, cel@kernel.org wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
>
> Wait for all disconnects to complete to ensure the transport has
> divested all of its hardware resources before the underlying RDMA
> device can be removed.
>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  include/trace/events/rpcrdma.h | 23 +++++++++++++++++++++++
>  net/sunrpc/xprtrdma/verbs.c | 23 ++++++++++++++---------
>  net/sunrpc/xprtrdma/xprt_rdma.h | 2 ++
>  3 files changed, 39 insertions(+), 9 deletions(-)
>
> diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
> index ecdaf088219d..ba2d6a0e41cc 100644
> --- a/include/trace/events/rpcrdma.h
> +++ b/include/trace/events/rpcrdma.h
> @@ -669,6 +669,29 @@ TRACE_EVENT(xprtrdma_inline_thresh,
>  DEFINE_CONN_EVENT(connect);
>  DEFINE_CONN_EVENT(disconnect);
>
> +TRACE_EVENT(xprtrdma_device_removal,
> +	TP_PROTO(
> +		const struct rdma_cm_id *id
> +	),
> +
> +	TP_ARGS(id),
> +
> +	TP_STRUCT__entry(
> +		__string(name, id->device->name)
> +		__array(unsigned char, addr, sizeof(struct sockaddr_in6))
> +	),
> +
> +	TP_fast_assign(
> +		__assign_str(name);
> +		memcpy(__entry->addr, &id->route.addr.dst_addr,
> +		       sizeof(struct sockaddr_in6));
> +	),
> +
> +	TP_printk("device %s to be removed, disconnecting %pISpc\n",
> +		__get_str(name), __entry->addr
> +	)
> +);
> +
>  DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc);
>
>  TRACE_EVENT(xprtrdma_op_connect,
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index a0b071089e15..04558c99e9f4 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -222,7 +222,6 @@ static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep,
>  static int
>  rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
>  {
> -	struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr;
>  	struct rpcrdma_ep *ep = id->context;
>
>  	might_sleep();
> @@ -241,14 +240,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
>  		ep->re_async_rc = -ENETUNREACH;
>  		complete(&ep->re_done);
>  		return 0;
> -	case RDMA_CM_EVENT_DEVICE_REMOVAL:
> -		pr_info("rpcrdma: removing device %s for %pISpc\n",
> -			ep->re_id->device->name, sap);
> -		switch (xchg(&ep->re_connect_status, -ENODEV)) {
> -		case 0: goto wake_connect_worker;
> -		case 1: goto disconnected;
> -		}
> -		return 0;
>  	case RDMA_CM_EVENT_ADDR_CHANGE:
>  		ep->re_connect_status = -ENODEV;
>  		goto disconnected;
> @@ -284,6 +275,14 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
>  	return 0;
>  }
>
> +static void rpcrdma_ep_removal_done(struct rpcrdma_notification *rn)
> +{
> +	struct rpcrdma_ep *ep = container_of(rn, struct rpcrdma_ep, re_rn);
> +
> +	trace_xprtrdma_device_removal(ep->re_id);
> +	xprt_force_disconnect(ep->re_xprt);
> +}
> +
>  static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
>  					    struct rpcrdma_ep *ep)
>  {
> @@ -323,6 +322,10 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
>  	if (rc)
>  		goto out;
>
> +	rc = rpcrdma_rn_register(id->device, &ep->re_rn, rpcrdma_ep_removal_done);
> +	if (rc)
> +		goto out;
> +
>  	return id;
>
>  out:
> @@ -350,6 +353,8 @@ static void rpcrdma_ep_destroy(struct kref *kref)
>  		ib_dealloc_pd(ep->re_pd);
>  	ep->re_pd = NULL;
>
> +	rpcrdma_rn_unregister(ep->re_id->device, &ep->re_rn);
> +
>  	kfree(ep);
>  	module_put(THIS_MODULE);
>  }
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index da409450dfc0..341725c66ec8 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -56,6 +56,7 @@
>  #include <linux/sunrpc/rpc_rdma_cid.h> /* completion IDs */
>  #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */
>  #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */
> +#include <linux/sunrpc/rdma_rn.h> /* removal notifications */
>
>  #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */
>  #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
> @@ -92,6 +93,7 @@ struct rpcrdma_ep {
>  	struct rpcrdma_connect_private
>  				re_cm_private;
>  	struct rdma_conn_param re_remote_cma;
> +	struct rpcrdma_notification re_rn;
>  	int re_receive_count;
>  	unsigned int re_max_requests; /* depends on device */
>  	unsigned int re_inline_send; /* negotiated */
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index ecdaf088219d..ba2d6a0e41cc 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -669,6 +669,29 @@ TRACE_EVENT(xprtrdma_inline_thresh,
 DEFINE_CONN_EVENT(connect);
 DEFINE_CONN_EVENT(disconnect);
 
+TRACE_EVENT(xprtrdma_device_removal,
+	TP_PROTO(
+		const struct rdma_cm_id *id
+	),
+
+	TP_ARGS(id),
+
+	TP_STRUCT__entry(
+		__string(name, id->device->name)
+		__array(unsigned char, addr, sizeof(struct sockaddr_in6))
+	),
+
+	TP_fast_assign(
+		__assign_str(name);
+		memcpy(__entry->addr, &id->route.addr.dst_addr,
+		       sizeof(struct sockaddr_in6));
+	),
+
+	TP_printk("device %s to be removed, disconnecting %pISpc\n",
+		__get_str(name), __entry->addr
+	)
+);
+
 DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc);
 
 TRACE_EVENT(xprtrdma_op_connect,
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index a0b071089e15..04558c99e9f4 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -222,7 +222,6 @@ static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep,
 static int
 rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
 {
-	struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr;
 	struct rpcrdma_ep *ep = id->context;
 
 	might_sleep();
@@ -241,14 +240,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
 		ep->re_async_rc = -ENETUNREACH;
 		complete(&ep->re_done);
 		return 0;
-	case RDMA_CM_EVENT_DEVICE_REMOVAL:
-		pr_info("rpcrdma: removing device %s for %pISpc\n",
-			ep->re_id->device->name, sap);
-		switch (xchg(&ep->re_connect_status, -ENODEV)) {
-		case 0: goto wake_connect_worker;
-		case 1: goto disconnected;
-		}
-		return 0;
 	case RDMA_CM_EVENT_ADDR_CHANGE:
 		ep->re_connect_status = -ENODEV;
 		goto disconnected;
@@ -284,6 +275,14 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
 	return 0;
 }
 
+static void rpcrdma_ep_removal_done(struct rpcrdma_notification *rn)
+{
+	struct rpcrdma_ep *ep = container_of(rn, struct rpcrdma_ep, re_rn);
+
+	trace_xprtrdma_device_removal(ep->re_id);
+	xprt_force_disconnect(ep->re_xprt);
+}
+
 static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
 					    struct rpcrdma_ep *ep)
 {
@@ -323,6 +322,10 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
 	if (rc)
 		goto out;
 
+	rc = rpcrdma_rn_register(id->device, &ep->re_rn, rpcrdma_ep_removal_done);
+	if (rc)
+		goto out;
+
 	return id;
 
 out:
@@ -350,6 +353,8 @@ static void rpcrdma_ep_destroy(struct kref *kref)
 		ib_dealloc_pd(ep->re_pd);
 	ep->re_pd = NULL;
 
+	rpcrdma_rn_unregister(ep->re_id->device, &ep->re_rn);
+
 	kfree(ep);
 	module_put(THIS_MODULE);
 }
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index da409450dfc0..341725c66ec8 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -56,6 +56,7 @@
 #include <linux/sunrpc/rpc_rdma_cid.h> /* completion IDs */
 #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */
 #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */
+#include <linux/sunrpc/rdma_rn.h> /* removal notifications */
 
 #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */
 #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
@@ -92,6 +93,7 @@ struct rpcrdma_ep {
 	struct rpcrdma_connect_private
 				re_cm_private;
 	struct rdma_conn_param re_remote_cma;
+	struct rpcrdma_notification re_rn;
 	int re_receive_count;
 	unsigned int re_max_requests; /* depends on device */
 	unsigned int re_inline_send; /* negotiated */