diff mbox series

[net-next,3/3] net/smc: Introduce tracepoint for smcr link down

Message ID 20211101073912.60410-4-tonylu@linux.alibaba.com (mailing list archive)
State Not Applicable
Headers show
Series Tracepoints for SMC | expand

Commit Message

Tony Lu Nov. 1, 2021, 7:39 a.m. UTC
The SMC-R link down event is important for diagnosing link issues, so we
should track it, especially in single-NIC mode, where a link going down
means the upper-layer connection is shut down. The tracepoint lets us find
the direct link-down reason in time: in addition to the increased counter,
it records the location of the code that triggered the event.

Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
Reviewed-by: Wen Gu <guwen@linux.alibaba.com>
---
 net/smc/smc_core.c       |  9 +++++++--
 net/smc/smc_tracepoint.c |  1 +
 net/smc/smc_tracepoint.h | 30 ++++++++++++++++++++++++++++++
 3 files changed, 38 insertions(+), 2 deletions(-)

Comments

Karsten Graul Nov. 2, 2021, 9:30 a.m. UTC | #1
On 01/11/2021 08:39, Tony Lu wrote:
> +
> +	    TP_printk("lnk=%p lgr=%p state=%d dev=%s location=%p",
> +		      __entry->lnk, __entry->lgr,
> +		      __entry->state, __get_str(name),
> +		      __entry->location)

The location is printed as pointer (which might even be randomized?),
is it possible to print the function name of the caller, as described
here: https://stackoverflow.com/questions/4141324/function-caller-in-linux-kernel

  printk("Caller is %pS\n", __builtin_return_address(0));

Not sure if this is possible with the trace points, but it would be
easier to use. You plan to use a dump to find out about the function caller?
Tony Lu Nov. 3, 2021, 6:57 a.m. UTC | #2
On Tue, Nov 02, 2021 at 10:30:22AM +0100, Karsten Graul wrote:
> On 01/11/2021 08:39, Tony Lu wrote:
> > +
> > +	    TP_printk("lnk=%p lgr=%p state=%d dev=%s location=%p",
> > +		      __entry->lnk, __entry->lgr,
> > +		      __entry->state, __get_str(name),
> > +		      __entry->location)
> 
> The location is printed as pointer (which might even be randomized?),
> is it possible to print the function name of the caller, as described
> here: https://stackoverflow.com/questions/4141324/function-caller-in-linux-kernel
> 
>   printk("Caller is %pS\n", __builtin_return_address(0));
> 
> Not sure if this is possible with the trace points, but it would be
> easier to use. You plan to use a dump to find out about the function caller?

Yes, I want to find out what caused the SMC-R link down. In our test
environment, the link-down tracepoint helps me debug the root cause of
link termination without eBPF or systemtap.

Using "%pS" lets the trace log show the caller's function name directly,
without any additional translation.

  <idle>-0       [000] ..s.    69.087164: smcr_link_down: lnk=00000000dab41cdc lgr=000000007d5d8e24 state=0 rc=1 dev=mlx5_0 location=smc_wr_tx_tasklet_fn+0x5ef/0x6f0 [smc]

I will improve it in the next patch.

Cheers,
Tony Lu
diff mbox series

Patch

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 8e642f8f334f..49b8ba3bb683 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -34,6 +34,7 @@ 
 #include "smc_ism.h"
 #include "smc_netlink.h"
 #include "smc_stats.h"
+#include "smc_tracepoint.h"
 
 #define SMC_LGR_NUM_INCR		256
 #define SMC_LGR_FREE_DELAY_SERV		(600 * HZ)
@@ -1620,15 +1621,19 @@  static void smcr_link_down(struct smc_link *lnk)
 /* must be called under lgr->llc_conf_mutex lock */
 void smcr_link_down_cond(struct smc_link *lnk)
 {
-	if (smc_link_downing(&lnk->state))
+	if (smc_link_downing(&lnk->state)) {
+		trace_smcr_link_down(lnk, __builtin_return_address(0));
 		smcr_link_down(lnk);
+	}
 }
 
 /* will get the lgr->llc_conf_mutex lock */
 void smcr_link_down_cond_sched(struct smc_link *lnk)
 {
-	if (smc_link_downing(&lnk->state))
+	if (smc_link_downing(&lnk->state)) {
+		trace_smcr_link_down(lnk, __builtin_return_address(0));
 		schedule_work(&lnk->link_down_wrk);
+	}
 }
 
 void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport)
diff --git a/net/smc/smc_tracepoint.c b/net/smc/smc_tracepoint.c
index af031811ddb3..8d47ced5a492 100644
--- a/net/smc/smc_tracepoint.c
+++ b/net/smc/smc_tracepoint.c
@@ -6,3 +6,4 @@ 
 EXPORT_TRACEPOINT_SYMBOL(smc_switch_to_fallback);
 EXPORT_TRACEPOINT_SYMBOL(smc_tx_sendmsg);
 EXPORT_TRACEPOINT_SYMBOL(smc_rx_recvmsg);
+EXPORT_TRACEPOINT_SYMBOL(smcr_link_down);
diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h
index eced1546afae..b4c36795a928 100644
--- a/net/smc/smc_tracepoint.h
+++ b/net/smc/smc_tracepoint.h
@@ -75,6 +75,36 @@  DEFINE_EVENT(smc_msg_event, smc_rx_recvmsg,
 	     TP_ARGS(smc, len)
 );
 
+TRACE_EVENT(smcr_link_down,
+
+	    TP_PROTO(const struct smc_link *lnk, void *location),
+
+	    TP_ARGS(lnk, location),
+
+	    TP_STRUCT__entry(
+			     __field(const void *, lnk)
+			     __field(const void *, lgr)
+			     __field(int, state)
+			     __string(name, lnk->ibname)
+			     __field(void *, location)
+	    ),
+
+	    TP_fast_assign(
+			   const struct smc_link_group *lgr = lnk->lgr;
+
+			   __entry->lnk = lnk;
+			   __entry->lgr = lgr;
+			   __entry->state = lnk->state;
+			   __assign_str(name, lnk->ibname);
+			   __entry->location = location;
+	    ),
+
+	    TP_printk("lnk=%p lgr=%p state=%d dev=%s location=%p",
+		      __entry->lnk, __entry->lgr,
+		      __entry->state, __get_str(name),
+		      __entry->location)
+);
+
 #endif /* _TRACE_SMC_H */
 
 #undef TRACE_INCLUDE_PATH