diff mbox series

[RFC,07/13] NFSD: Add callback operation lifetime trace points

Message ID 170620016455.2833.5426224225062159088.stgit@manet.1015granger.net (mailing list archive)
State New, archived
Headers show
Series NFSD backchannel fixes | expand

Commit Message

Chuck Lever Jan. 25, 2024, 4:29 p.m. UTC
From: Chuck Lever <chuck.lever@oracle.com>

Help observe the flow of callback operations.

bc_shutdown() records exactly when the backchannel RPC client is
destroyed and cl_cb_client is replaced with NULL.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4callback.c   |    7 +++++++
 fs/nfsd/trace.h          |   42 ++++++++++++++++++++++++++++++++++++++++++
 include/trace/misc/nfs.h |   34 ++++++++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+)

Comments

Benjamin Coddington Jan. 25, 2024, 9:49 p.m. UTC | #1
On 25 Jan 2024, at 11:29, Chuck Lever wrote:

> From: Chuck Lever <chuck.lever@oracle.com>
>
> Help observe the flow of callback operations.
>
> bc_shutdown() records exactly when the backchannel RPC client is
> destroyed and cl_cb_client is replaced with NULL.
>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  fs/nfsd/nfs4callback.c   |    7 +++++++
>  fs/nfsd/trace.h          |   42 ++++++++++++++++++++++++++++++++++++++++++
>  include/trace/misc/nfs.h |   34 ++++++++++++++++++++++++++++++++++
>  3 files changed, 83 insertions(+)
>
> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> index 1c85426830b1..4d5a6370b92c 100644
> --- a/fs/nfsd/nfs4callback.c
> +++ b/fs/nfsd/nfs4callback.c
> @@ -887,6 +887,7 @@ static struct workqueue_struct *callback_wq;
>
>  static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
>  {
> +	trace_nfsd_cb_queue(cb->cb_clp, cb);
>  	return queue_work(callback_wq, &cb->cb_work);
>  }
>
> @@ -1106,6 +1107,7 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
>  {
>  	struct nfs4_client *clp = cb->cb_clp;
>
> +	trace_nfsd_cb_destroy(clp, cb);
>  	nfsd41_cb_release_slot(cb);
>  	if (cb->cb_ops && cb->cb_ops->release)
>  		cb->cb_ops->release(cb);
> @@ -1220,6 +1222,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
>  	goto out;
>  need_restart:
>  	if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) {
> +		trace_nfsd_cb_restart(clp, cb);
>  		task->tk_status = 0;
>  		cb->cb_need_restart = true;

I think you want to call the tracepoint /after/ setting cb_need_restart here..

>  	}
> @@ -1333,11 +1336,14 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
>  	struct nfsd4_conn *c;
>  	int err;
>
> +	trace_nfsd_cb_bc_update(clp, cb);
> +
>  	/*
>  	 * This is either an update, or the client dying; in either case,
>  	 * kill the old client:
>  	 */
>  	if (clp->cl_cb_client) {
> +		trace_nfsd_cb_bc_shutdown(clp, cb);
>  		rpc_shutdown_client(clp->cl_cb_client);
>  		clp->cl_cb_client = NULL;
>  		put_cred(clp->cl_cb_cred);
> @@ -1349,6 +1355,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
>  	}
>  	if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
>  		return;
> +

I'm in favor of this whitespace change, but did you mean to include it?

Ben
Chuck Lever III Jan. 26, 2024, 2:12 p.m. UTC | #2
On Thu, Jan 25, 2024 at 04:49:17PM -0500, Benjamin Coddington wrote:
> On 25 Jan 2024, at 11:29, Chuck Lever wrote:
> 
> > From: Chuck Lever <chuck.lever@oracle.com>
> >
> > Help observe the flow of callback operations.
> >
> > bc_shutdown() records exactly when the backchannel RPC client is
> > destroyed and cl_cb_client is replaced with NULL.
> >
> > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > ---
> >  fs/nfsd/nfs4callback.c   |    7 +++++++
> >  fs/nfsd/trace.h          |   42 ++++++++++++++++++++++++++++++++++++++++++
> >  include/trace/misc/nfs.h |   34 ++++++++++++++++++++++++++++++++++
> >  3 files changed, 83 insertions(+)
> >
> > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > index 1c85426830b1..4d5a6370b92c 100644
> > --- a/fs/nfsd/nfs4callback.c
> > +++ b/fs/nfsd/nfs4callback.c
> > @@ -887,6 +887,7 @@ static struct workqueue_struct *callback_wq;
> >
> >  static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
> >  {
> > +	trace_nfsd_cb_queue(cb->cb_clp, cb);
> >  	return queue_work(callback_wq, &cb->cb_work);
> >  }
> >
> > @@ -1106,6 +1107,7 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> >  {
> >  	struct nfs4_client *clp = cb->cb_clp;
> >
> > +	trace_nfsd_cb_destroy(clp, cb);
> >  	nfsd41_cb_release_slot(cb);
> >  	if (cb->cb_ops && cb->cb_ops->release)
> >  		cb->cb_ops->release(cb);
> > @@ -1220,6 +1222,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> >  	goto out;
> >  need_restart:
> >  	if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) {
> > +		trace_nfsd_cb_restart(clp, cb);
> >  		task->tk_status = 0;
> >  		cb->cb_need_restart = true;
> 
> I think you want to call the tracepoint /after/ setting cb_need_restart here..

Maybe?

The trace point currently captures the value of cb_need_restart
before this logic overwrites it. Is that beneficial for
troubleshooting?


> >  	}
> > @@ -1333,11 +1336,14 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
> >  	struct nfsd4_conn *c;
> >  	int err;
> >
> > +	trace_nfsd_cb_bc_update(clp, cb);
> > +
> >  	/*
> >  	 * This is either an update, or the client dying; in either case,
> >  	 * kill the old client:
> >  	 */
> >  	if (clp->cl_cb_client) {
> > +		trace_nfsd_cb_bc_shutdown(clp, cb);
> >  		rpc_shutdown_client(clp->cl_cb_client);
> >  		clp->cl_cb_client = NULL;
> >  		put_cred(clp->cl_cb_cred);
> > @@ -1349,6 +1355,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
> >  	}
> >  	if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
> >  		return;
> > +
> 
> I'm in favor of this whitespace change, but did you mean to include it?

Yes, but it might be better off in one of the other patches.
Benjamin Coddington Jan. 26, 2024, 2:27 p.m. UTC | #3
On 26 Jan 2024, at 9:12, Chuck Lever wrote:

> On Thu, Jan 25, 2024 at 04:49:17PM -0500, Benjamin Coddington wrote:
>> On 25 Jan 2024, at 11:29, Chuck Lever wrote:
>>
>>> From: Chuck Lever <chuck.lever@oracle.com>
>>>
>>> Help observe the flow of callback operations.
>>>
>>> bc_shutdown() records exactly when the backchannel RPC client is
>>> destroyed and cl_cb_client is replaced with NULL.
>>>
>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>> ---
>>>  fs/nfsd/nfs4callback.c   |    7 +++++++
>>>  fs/nfsd/trace.h          |   42 ++++++++++++++++++++++++++++++++++++++++++
>>>  include/trace/misc/nfs.h |   34 ++++++++++++++++++++++++++++++++++
>>>  3 files changed, 83 insertions(+)
>>>
>>> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
>>> index 1c85426830b1..4d5a6370b92c 100644
>>> --- a/fs/nfsd/nfs4callback.c
>>> +++ b/fs/nfsd/nfs4callback.c
>>> @@ -887,6 +887,7 @@ static struct workqueue_struct *callback_wq;
>>>
>>>  static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
>>>  {
>>> +	trace_nfsd_cb_queue(cb->cb_clp, cb);
>>>  	return queue_work(callback_wq, &cb->cb_work);
>>>  }
>>>
>>> @@ -1106,6 +1107,7 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
>>>  {
>>>  	struct nfs4_client *clp = cb->cb_clp;
>>>
>>> +	trace_nfsd_cb_destroy(clp, cb);
>>>  	nfsd41_cb_release_slot(cb);
>>>  	if (cb->cb_ops && cb->cb_ops->release)
>>>  		cb->cb_ops->release(cb);
>>> @@ -1220,6 +1222,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
>>>  	goto out;
>>>  need_restart:
>>>  	if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) {
>>> +		trace_nfsd_cb_restart(clp, cb);
>>>  		task->tk_status = 0;
>>>  		cb->cb_need_restart = true;
>>
>> I think you want to call the tracepoint /after/ setting cb_need_restart here..
>
> Maybe?
>
> The trace point currently captures the value of cb_need_restart
> before this logic overwrites it. Is that beneficial for
> troubleshooting?

Got it, yes its better the way you have it - and the tracepoint name makes
clear we want to restart.

Ben
diff mbox series

Patch

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 1c85426830b1..4d5a6370b92c 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -887,6 +887,7 @@  static struct workqueue_struct *callback_wq;
 
 static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
 {
+	trace_nfsd_cb_queue(cb->cb_clp, cb);
 	return queue_work(callback_wq, &cb->cb_work);
 }
 
@@ -1106,6 +1107,7 @@  static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
 {
 	struct nfs4_client *clp = cb->cb_clp;
 
+	trace_nfsd_cb_destroy(clp, cb);
 	nfsd41_cb_release_slot(cb);
 	if (cb->cb_ops && cb->cb_ops->release)
 		cb->cb_ops->release(cb);
@@ -1220,6 +1222,7 @@  static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 	goto out;
 need_restart:
 	if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) {
+		trace_nfsd_cb_restart(clp, cb);
 		task->tk_status = 0;
 		cb->cb_need_restart = true;
 	}
@@ -1333,11 +1336,14 @@  static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
 	struct nfsd4_conn *c;
 	int err;
 
+	trace_nfsd_cb_bc_update(clp, cb);
+
 	/*
 	 * This is either an update, or the client dying; in either case,
 	 * kill the old client:
 	 */
 	if (clp->cl_cb_client) {
+		trace_nfsd_cb_bc_shutdown(clp, cb);
 		rpc_shutdown_client(clp->cl_cb_client);
 		clp->cl_cb_client = NULL;
 		put_cred(clp->cl_cb_cred);
@@ -1349,6 +1355,7 @@  static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
 	}
 	if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
 		return;
+
 	spin_lock(&clp->cl_lock);
 	/*
 	 * Only serialized callback code is allowed to clear these
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 6003af2bee33..9f9e58debc26 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1443,6 +1443,48 @@  TRACE_EVENT(nfsd_cb_setup_err,
 		__entry->error)
 );
 
+DECLARE_EVENT_CLASS(nfsd_cb_lifetime_class,
+	TP_PROTO(
+		const struct nfs4_client *clp,
+		const struct nfsd4_callback *cb
+	),
+	TP_ARGS(clp, cb),
+	TP_STRUCT__entry(
+		__field(u32, cl_boot)
+		__field(u32, cl_id)
+		__field(const void *, cb)
+		__field(bool, need_restart)
+		__sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
+	),
+	TP_fast_assign(
+		__entry->cl_boot = clp->cl_clientid.cl_boot;
+		__entry->cl_id = clp->cl_clientid.cl_id;
+		__entry->cb = cb;
+		__entry->need_restart = cb->cb_need_restart;
+		__assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+				  clp->cl_cb_conn.cb_addrlen)
+	),
+	TP_printk("addr=%pISpc client %08x:%08x cb=%p%s",
+		__get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
+		__entry->cb, __entry->need_restart ?
+			" (need restart)" : " (first try)"
+	)
+);
+
+#define DEFINE_NFSD_CB_LIFETIME_EVENT(name)		\
+DEFINE_EVENT(nfsd_cb_lifetime_class, nfsd_cb_##name,	\
+	TP_PROTO(					\
+		const struct nfs4_client *clp,		\
+		const struct nfsd4_callback *cb		\
+	),						\
+	TP_ARGS(clp, cb))
+
+DEFINE_NFSD_CB_LIFETIME_EVENT(queue);
+DEFINE_NFSD_CB_LIFETIME_EVENT(destroy);
+DEFINE_NFSD_CB_LIFETIME_EVENT(restart);
+DEFINE_NFSD_CB_LIFETIME_EVENT(bc_update);
+DEFINE_NFSD_CB_LIFETIME_EVENT(bc_shutdown);
+
 TRACE_EVENT(nfsd_cb_seq_status,
 	TP_PROTO(
 		const struct rpc_task *task,
diff --git a/include/trace/misc/nfs.h b/include/trace/misc/nfs.h
index 0d9d48dca38a..64ab5dac59ce 100644
--- a/include/trace/misc/nfs.h
+++ b/include/trace/misc/nfs.h
@@ -385,3 +385,37 @@  TRACE_DEFINE_ENUM(IOMODE_ANY);
 		{ SEQ4_STATUS_RESTART_RECLAIM_NEEDED,	"RESTART_RECLAIM_NEEDED" }, \
 		{ SEQ4_STATUS_CB_PATH_DOWN_SESSION,	"CB_PATH_DOWN_SESSION" }, \
 		{ SEQ4_STATUS_BACKCHANNEL_FAULT,	"BACKCHANNEL_FAULT" })
+
+TRACE_DEFINE_ENUM(OP_CB_GETATTR);
+TRACE_DEFINE_ENUM(OP_CB_RECALL);
+TRACE_DEFINE_ENUM(OP_CB_LAYOUTRECALL);
+TRACE_DEFINE_ENUM(OP_CB_NOTIFY);
+TRACE_DEFINE_ENUM(OP_CB_PUSH_DELEG);
+TRACE_DEFINE_ENUM(OP_CB_RECALL_ANY);
+TRACE_DEFINE_ENUM(OP_CB_RECALLABLE_OBJ_AVAIL);
+TRACE_DEFINE_ENUM(OP_CB_RECALL_SLOT);
+TRACE_DEFINE_ENUM(OP_CB_SEQUENCE);
+TRACE_DEFINE_ENUM(OP_CB_WANTS_CANCELLED);
+TRACE_DEFINE_ENUM(OP_CB_NOTIFY_LOCK);
+TRACE_DEFINE_ENUM(OP_CB_NOTIFY_DEVICEID);
+TRACE_DEFINE_ENUM(OP_CB_OFFLOAD);
+TRACE_DEFINE_ENUM(OP_CB_ILLEGAL);
+
+#define show_nfs4_cb_op(x) \
+	__print_symbolic(x, \
+		{ 0,				"CB_NULL" }, \
+		{ 1,				"CB_COMPOUND" }, \
+		{ OP_CB_GETATTR,		"CB_GETATTR" }, \
+		{ OP_CB_RECALL,			"CB_RECALL" }, \
+		{ OP_CB_LAYOUTRECALL,		"CB_LAYOUTRECALL" }, \
+		{ OP_CB_NOTIFY,			"CB_NOTIFY" }, \
+		{ OP_CB_PUSH_DELEG,		"CB_PUSH_DELEG" }, \
+		{ OP_CB_RECALL_ANY,		"CB_RECALL_ANY" }, \
+		{ OP_CB_RECALLABLE_OBJ_AVAIL,	"CB_RECALLABLE_OBJ_AVAIL" }, \
+		{ OP_CB_RECALL_SLOT,		"CB_RECALL_SLOT" }, \
+		{ OP_CB_SEQUENCE,		"CB_SEQUENCE" }, \
+		{ OP_CB_WANTS_CANCELLED,	"CB_WANTS_CANCELLED" }, \
+		{ OP_CB_NOTIFY_LOCK,		"CB_NOTIFY_LOCK" }, \
+		{ OP_CB_NOTIFY_DEVICEID,	"CB_NOTIFY_DEVICEID" }, \
+		{ OP_CB_OFFLOAD,		"CB_OFFLOAD" }, \
+		{ OP_CB_ILLEGAL,		"CB_ILLEGAL" })