Message ID | 170620016455.2833.5426224225062159088.stgit@manet.1015granger.net (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | NFSD backchannel fixes | expand |
On 25 Jan 2024, at 11:29, Chuck Lever wrote: > From: Chuck Lever <chuck.lever@oracle.com> > > Help observe the flow of callback operations. > > bc_shutdown() records exactly when the backchannel RPC client is > destroyed and cl_cb_client is replaced with NULL. > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com> > --- > fs/nfsd/nfs4callback.c | 7 +++++++ > fs/nfsd/trace.h | 42 ++++++++++++++++++++++++++++++++++++++++++ > include/trace/misc/nfs.h | 34 ++++++++++++++++++++++++++++++++++ > 3 files changed, 83 insertions(+) > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c > index 1c85426830b1..4d5a6370b92c 100644 > --- a/fs/nfsd/nfs4callback.c > +++ b/fs/nfsd/nfs4callback.c > @@ -887,6 +887,7 @@ static struct workqueue_struct *callback_wq; > > static bool nfsd4_queue_cb(struct nfsd4_callback *cb) > { > + trace_nfsd_cb_queue(cb->cb_clp, cb); > return queue_work(callback_wq, &cb->cb_work); > } > > @@ -1106,6 +1107,7 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb) > { > struct nfs4_client *clp = cb->cb_clp; > > + trace_nfsd_cb_destroy(clp, cb); > nfsd41_cb_release_slot(cb); > if (cb->cb_ops && cb->cb_ops->release) > cb->cb_ops->release(cb); > @@ -1220,6 +1222,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback > goto out; > need_restart: > if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { > + trace_nfsd_cb_restart(clp, cb); > task->tk_status = 0; > cb->cb_need_restart = true; I think you want to call the tracepoint /after/ setting cb_need_restart here.. > } > @@ -1333,11 +1336,14 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) > struct nfsd4_conn *c; > int err; > > + trace_nfsd_cb_bc_update(clp, cb); > + > /* > * This is either an update, or the client dying; in either case, > * kill the old client: > */ > if (clp->cl_cb_client) { > + trace_nfsd_cb_bc_shutdown(clp, cb); > rpc_shutdown_client(clp->cl_cb_client); > clp->cl_cb_client = NULL; > put_cred(clp->cl_cb_cred); > @@ -1349,6 +1355,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) > } > if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) > return; > + I'm in favor of this whitespace change, but did you mean to include it? Ben
On Thu, Jan 25, 2024 at 04:49:17PM -0500, Benjamin Coddington wrote: > On 25 Jan 2024, at 11:29, Chuck Lever wrote: > > > From: Chuck Lever <chuck.lever@oracle.com> > > > > Help observe the flow of callback operations. > > > > bc_shutdown() records exactly when the backchannel RPC client is > > destroyed and cl_cb_client is replaced with NULL. > > > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com> > > --- > > fs/nfsd/nfs4callback.c | 7 +++++++ > > fs/nfsd/trace.h | 42 ++++++++++++++++++++++++++++++++++++++++++ > > include/trace/misc/nfs.h | 34 ++++++++++++++++++++++++++++++++++ > > 3 files changed, 83 insertions(+) > > > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c > > index 1c85426830b1..4d5a6370b92c 100644 > > --- a/fs/nfsd/nfs4callback.c > > +++ b/fs/nfsd/nfs4callback.c > > @@ -887,6 +887,7 @@ static struct workqueue_struct *callback_wq; > > > > static bool nfsd4_queue_cb(struct nfsd4_callback *cb) > > { > > + trace_nfsd_cb_queue(cb->cb_clp, cb); > > return queue_work(callback_wq, &cb->cb_work); > > } > > > > @@ -1106,6 +1107,7 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb) > > { > > struct nfs4_client *clp = cb->cb_clp; > > > > + trace_nfsd_cb_destroy(clp, cb); > > nfsd41_cb_release_slot(cb); > > if (cb->cb_ops && cb->cb_ops->release) > > cb->cb_ops->release(cb); > > @@ -1220,6 +1222,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback > > goto out; > > need_restart: > > if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { > > + trace_nfsd_cb_restart(clp, cb); > > task->tk_status = 0; > > cb->cb_need_restart = true; > > I think you want to call the tracepoint /after/ setting cb_need_restart here.. Maybe? The trace point currently captures the value of cb_need_restart before this logic overwrites it. Is that beneficial for troubleshooting? > > } > > @@ -1333,11 +1336,14 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) > > struct nfsd4_conn *c; > > int err; > > > > + trace_nfsd_cb_bc_update(clp, cb); > > + > > /* > > * This is either an update, or the client dying; in either case, > > * kill the old client: > > */ > > if (clp->cl_cb_client) { > > + trace_nfsd_cb_bc_shutdown(clp, cb); > > rpc_shutdown_client(clp->cl_cb_client); > > clp->cl_cb_client = NULL; > > put_cred(clp->cl_cb_cred); > > @@ -1349,6 +1355,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) > > } > > if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) > > return; > > + > > I'm in favor of this whitespace change, but did you mean to include it? Yes, but it might be better off in one of the other patches.
On 26 Jan 2024, at 9:12, Chuck Lever wrote: > On Thu, Jan 25, 2024 at 04:49:17PM -0500, Benjamin Coddington wrote: >> On 25 Jan 2024, at 11:29, Chuck Lever wrote: >> >>> From: Chuck Lever <chuck.lever@oracle.com> >>> >>> Help observe the flow of callback operations. >>> >>> bc_shutdown() records exactly when the backchannel RPC client is >>> destroyed and cl_cb_client is replaced with NULL. >>> >>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com> >>> --- >>> fs/nfsd/nfs4callback.c | 7 +++++++ >>> fs/nfsd/trace.h | 42 ++++++++++++++++++++++++++++++++++++++++++ >>> include/trace/misc/nfs.h | 34 ++++++++++++++++++++++++++++++++++ >>> 3 files changed, 83 insertions(+) >>> >>> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c >>> index 1c85426830b1..4d5a6370b92c 100644 >>> --- a/fs/nfsd/nfs4callback.c >>> +++ b/fs/nfsd/nfs4callback.c >>> @@ -887,6 +887,7 @@ static struct workqueue_struct *callback_wq; >>> >>> static bool nfsd4_queue_cb(struct nfsd4_callback *cb) >>> { >>> + trace_nfsd_cb_queue(cb->cb_clp, cb); >>> return queue_work(callback_wq, &cb->cb_work); >>> } >>> >>> @@ -1106,6 +1107,7 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb) >>> { >>> struct nfs4_client *clp = cb->cb_clp; >>> >>> + trace_nfsd_cb_destroy(clp, cb); >>> nfsd41_cb_release_slot(cb); >>> if (cb->cb_ops && cb->cb_ops->release) >>> cb->cb_ops->release(cb); >>> @@ -1220,6 +1222,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback >>> goto out; >>> need_restart: >>> if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { >>> + trace_nfsd_cb_restart(clp, cb); >>> task->tk_status = 0; >>> cb->cb_need_restart = true; >> >> I think you want to call the tracepoint /after/ setting cb_need_restart here.. > > Maybe? > > The trace point currently captures the value of cb_need_restart > before this logic overwrites it. Is that beneficial for > troubleshooting? Got it, yes its better the way you have it - and the tracepoint name makes clear we want to restart. Ben
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 1c85426830b1..4d5a6370b92c 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -887,6 +887,7 @@ static struct workqueue_struct *callback_wq; static bool nfsd4_queue_cb(struct nfsd4_callback *cb) { + trace_nfsd_cb_queue(cb->cb_clp, cb); return queue_work(callback_wq, &cb->cb_work); } @@ -1106,6 +1107,7 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb) { struct nfs4_client *clp = cb->cb_clp; + trace_nfsd_cb_destroy(clp, cb); nfsd41_cb_release_slot(cb); if (cb->cb_ops && cb->cb_ops->release) cb->cb_ops->release(cb); @@ -1220,6 +1222,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback goto out; need_restart: if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { + trace_nfsd_cb_restart(clp, cb); task->tk_status = 0; cb->cb_need_restart = true; } @@ -1333,11 +1336,14 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) struct nfsd4_conn *c; int err; + trace_nfsd_cb_bc_update(clp, cb); + /* * This is either an update, or the client dying; in either case, * kill the old client: */ if (clp->cl_cb_client) { + trace_nfsd_cb_bc_shutdown(clp, cb); rpc_shutdown_client(clp->cl_cb_client); clp->cl_cb_client = NULL; put_cred(clp->cl_cb_cred); @@ -1349,6 +1355,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) } if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) return; + spin_lock(&clp->cl_lock); /* * Only serialized callback code is allowed to clear these diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 6003af2bee33..9f9e58debc26 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1443,6 +1443,48 @@ TRACE_EVENT(nfsd_cb_setup_err, __entry->error) ); +DECLARE_EVENT_CLASS(nfsd_cb_lifetime_class, + TP_PROTO( + const struct nfs4_client *clp, + const struct nfsd4_callback *cb + ), + TP_ARGS(clp, cb), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(const void *, cb) + __field(bool, need_restart) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __entry->cb = cb; + __entry->need_restart = cb->cb_need_restart; + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) + ), + TP_printk("addr=%pISpc client %08x:%08x cb=%p%s", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->cb, __entry->need_restart ? + " (need restart)" : " (first try)" + ) +); + +#define DEFINE_NFSD_CB_LIFETIME_EVENT(name) \ +DEFINE_EVENT(nfsd_cb_lifetime_class, nfsd_cb_##name, \ + TP_PROTO( \ + const struct nfs4_client *clp, \ + const struct nfsd4_callback *cb \ + ), \ + TP_ARGS(clp, cb)) + +DEFINE_NFSD_CB_LIFETIME_EVENT(queue); +DEFINE_NFSD_CB_LIFETIME_EVENT(destroy); +DEFINE_NFSD_CB_LIFETIME_EVENT(restart); +DEFINE_NFSD_CB_LIFETIME_EVENT(bc_update); +DEFINE_NFSD_CB_LIFETIME_EVENT(bc_shutdown); + TRACE_EVENT(nfsd_cb_seq_status, TP_PROTO( const struct rpc_task *task, diff --git a/include/trace/misc/nfs.h b/include/trace/misc/nfs.h index 0d9d48dca38a..64ab5dac59ce 100644 --- a/include/trace/misc/nfs.h +++ b/include/trace/misc/nfs.h @@ -385,3 +385,37 @@ TRACE_DEFINE_ENUM(IOMODE_ANY); { SEQ4_STATUS_RESTART_RECLAIM_NEEDED, "RESTART_RECLAIM_NEEDED" }, \ { SEQ4_STATUS_CB_PATH_DOWN_SESSION, "CB_PATH_DOWN_SESSION" }, \ { SEQ4_STATUS_BACKCHANNEL_FAULT, "BACKCHANNEL_FAULT" }) + +TRACE_DEFINE_ENUM(OP_CB_GETATTR); +TRACE_DEFINE_ENUM(OP_CB_RECALL); +TRACE_DEFINE_ENUM(OP_CB_LAYOUTRECALL); +TRACE_DEFINE_ENUM(OP_CB_NOTIFY); +TRACE_DEFINE_ENUM(OP_CB_PUSH_DELEG); +TRACE_DEFINE_ENUM(OP_CB_RECALL_ANY); +TRACE_DEFINE_ENUM(OP_CB_RECALLABLE_OBJ_AVAIL); +TRACE_DEFINE_ENUM(OP_CB_RECALL_SLOT); +TRACE_DEFINE_ENUM(OP_CB_SEQUENCE); +TRACE_DEFINE_ENUM(OP_CB_WANTS_CANCELLED); +TRACE_DEFINE_ENUM(OP_CB_NOTIFY_LOCK); +TRACE_DEFINE_ENUM(OP_CB_NOTIFY_DEVICEID); +TRACE_DEFINE_ENUM(OP_CB_OFFLOAD); +TRACE_DEFINE_ENUM(OP_CB_ILLEGAL); + +#define show_nfs4_cb_op(x) \ + __print_symbolic(x, \ + { 0, "CB_NULL" }, \ + { 1, "CB_COMPOUND" }, \ + { OP_CB_GETATTR, "CB_GETATTR" }, \ + { OP_CB_RECALL, "CB_RECALL" }, \ + { OP_CB_LAYOUTRECALL, "CB_LAYOUTRECALL" }, \ + { OP_CB_NOTIFY, "CB_NOTIFY" }, \ + { OP_CB_PUSH_DELEG, "CB_PUSH_DELEG" }, \ + { OP_CB_RECALL_ANY, "CB_RECALL_ANY" }, \ + { OP_CB_RECALLABLE_OBJ_AVAIL, "CB_RECALLABLE_OBJ_AVAIL" }, \ + { OP_CB_RECALL_SLOT, "CB_RECALL_SLOT" }, \ + { OP_CB_SEQUENCE, "CB_SEQUENCE" }, \ + { OP_CB_WANTS_CANCELLED, "CB_WANTS_CANCELLED" }, \ + { OP_CB_NOTIFY_LOCK, "CB_NOTIFY_LOCK" }, \ + { OP_CB_NOTIFY_DEVICEID, "CB_NOTIFY_DEVICEID" }, \ + { OP_CB_OFFLOAD, "CB_OFFLOAD" }, \ + { OP_CB_ILLEGAL, "CB_ILLEGAL" })