diff mbox series

[iwl-net,v1,1/5] igc: Ensure the PTM cycle is reliably triggered

Message ID 20240807003032.10300-2-christopher.s.hall@intel.com (mailing list archive)
State Awaiting Upstream
Delegated to: Netdev Maintainers
Headers show
Series igc: PTM timeout fix | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 29 this patch: 29
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers fail 1 blamed authors not CCed: anthony.l.nguyen@intel.com; 6 maintainers not CCed: edumazet@google.com richardcochran@gmail.com kuba@kernel.org pabeni@redhat.com przemyslaw.kitszel@intel.com anthony.l.nguyen@intel.com
netdev/build_clang success Errors and warnings before: 31 this patch: 31
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 29 this patch: 29
netdev/checkpatch warning WARNING: line length of 84 exceeds 80 columns WARNING: please, no space before tabs
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 6 this patch: 6
netdev/source_inline success Was 0 now: 0

Commit Message

Hall, Christopher S Aug. 7, 2024, 12:30 a.m. UTC
From: Christopher S M Hall <christopher.s.hall@intel.com>

Writing to clear the PTM status 'valid' bit while the PTM cycle is
triggered results in unreliable PTM operation. To fix this, clear the
PTM 'trigger' and status after each PTM transaction.

The issue can be reproduced with the following:

$ sudo phc2sys -R 1000 -O 0 -i tsn0 -m

Note: 1000 Hz (-R 1000) is unrealistically large, but provides a way to
quickly reproduce the issue.

PHC2SYS exits with:

"ioctl PTP_OFFSET_PRECISE: Connection timed out" when the PTM transaction
  fails

Fixes: a90ec8483732 ("igc: Add support for PTP getcrosststamp()")
Signed-off-by: Christopher S M Hall <christopher.s.hall@intel.com>
Reviewed-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
---
 drivers/net/ethernet/intel/igc/igc_defines.h |  1 +
 drivers/net/ethernet/intel/igc/igc_ptp.c     | 70 ++++++++++++--------
 2 files changed, 42 insertions(+), 29 deletions(-)

Comments

Paul Menzel Aug. 7, 2024, 5:05 a.m. UTC | #1
Dear Christopher,


Thank you for the patch.

Am 07.08.24 um 02:30 schrieb christopher.s.hall@intel.com:
> From: Christopher S M Hall <christopher.s.hall@intel.com>
> 
> Writing to clear the PTM status 'valid' bit while the PTM cycle is
> triggered results in unreliable PTM operation. To fix this, clear the
> PTM 'trigger' and status after each PTM transaction.

I do not understand, why the *valid* bit is not needed anymore, and why 
it’s the trigger bit seems to do the same task. It’d be great if you 
elaborated.

Is that also documented in the dataheet?

> The issue can be reproduced with the following:
> 
> $ sudo phc2sys -R 1000 -O 0 -i tsn0 -m
> 
> Note: 1000 Hz (-R 1000) is unrealistically large, but provides a way to
> quickly reproduce the issue.
> 
> PHC2SYS exits with:
> 
> "ioctl PTP_OFFSET_PRECISE: Connection timed out" when the PTM transaction
>    fails
> 
> Fixes: a90ec8483732 ("igc: Add support for PTP getcrosststamp()")
> Signed-off-by: Christopher S M Hall <christopher.s.hall@intel.com>
> Reviewed-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
> ---
>   drivers/net/ethernet/intel/igc/igc_defines.h |  1 +
>   drivers/net/ethernet/intel/igc/igc_ptp.c     | 70 ++++++++++++--------
>   2 files changed, 42 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h
> index 511384f3ec5c..ec191d26c650 100644
> --- a/drivers/net/ethernet/intel/igc/igc_defines.h
> +++ b/drivers/net/ethernet/intel/igc/igc_defines.h
> @@ -583,6 +583,7 @@
>   #define IGC_PTM_STAT_T4M1_OVFL		BIT(3) /* T4 minus T1 overflow */
>   #define IGC_PTM_STAT_ADJUST_1ST		BIT(4) /* 1588 timer adjusted during 1st PTM cycle */
>   #define IGC_PTM_STAT_ADJUST_CYC		BIT(5) /* 1588 timer adjusted during non-1st PTM cycle */
> +#define IGC_PTM_STAT_ALL        		GENMASK(5, 0) /* Used to clear all status */
>   
>   /* PCIe PTM Cycle Control */
>   #define IGC_PTM_CYCLE_CTRL_CYC_TIME(msec)	((msec) & 0x3ff) /* PTM Cycle Time (msec) */
> diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c
> index 946edbad4302..00cc80d8d164 100644
> --- a/drivers/net/ethernet/intel/igc/igc_ptp.c
> +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c
> @@ -974,11 +974,38 @@ static void igc_ptm_log_error(struct igc_adapter *adapter, u32 ptm_stat)
>   	}
>   }
>   
> +static void igc_ptm_trigger(struct igc_hw *hw)
> +{
> +	u32 ctrl;
> +
> +	/* To "manually" start the PTM cycle we need to set the
> +	 * trigger (TRIG) bit
> +	 */
> +	ctrl = rd32(IGC_PTM_CTRL);
> +	ctrl |= IGC_PTM_CTRL_TRIG;
> +	wr32(IGC_PTM_CTRL, ctrl);
> +	/* Perform flush after write to CTRL register otherwise
> +	 * transaction may not start
> +	 */
> +	wrfl();
> +}
> +
> +static void igc_ptm_reset(struct igc_hw *hw)
> +{
> +	u32 ctrl;
> +
> +	ctrl = rd32(IGC_PTM_CTRL);
> +	ctrl &= ~IGC_PTM_CTRL_TRIG;
> +	wr32(IGC_PTM_CTRL, ctrl);
> +	/* Write to clear all status */
> +	wr32(IGC_PTM_STAT, IGC_PTM_STAT_ALL);
> +}
> +
>   static int igc_phc_get_syncdevicetime(ktime_t *device,
>   				      struct system_counterval_t *system,
>   				      void *ctx)
>   {
> -	u32 stat, t2_curr_h, t2_curr_l, ctrl;
> +	u32 stat, t2_curr_h, t2_curr_l;
>   	struct igc_adapter *adapter = ctx;
>   	struct igc_hw *hw = &adapter->hw;
>   	int err, count = 100;
> @@ -994,25 +1021,13 @@ static int igc_phc_get_syncdevicetime(ktime_t *device,
>   		 * are transitory. Repeating the process returns valid
>   		 * data eventually.
>   		 */
> -
> -		/* To "manually" start the PTM cycle we need to clear and
> -		 * then set again the TRIG bit.
> -		 */
> -		ctrl = rd32(IGC_PTM_CTRL);
> -		ctrl &= ~IGC_PTM_CTRL_TRIG;
> -		wr32(IGC_PTM_CTRL, ctrl);
> -		ctrl |= IGC_PTM_CTRL_TRIG;
> -		wr32(IGC_PTM_CTRL, ctrl);
> -
> -		/* The cycle only starts "for real" when software notifies
> -		 * that it has read the registers, this is done by setting
> -		 * VALID bit.
> -		 */
> -		wr32(IGC_PTM_STAT, IGC_PTM_STAT_VALID);
> +		igc_ptm_trigger(hw);
>   
>   		err = readx_poll_timeout(rd32, IGC_PTM_STAT, stat,
>   					 stat, IGC_PTM_STAT_SLEEP,
>   					 IGC_PTM_STAT_TIMEOUT);
> +		igc_ptm_reset(hw);
> +
>   		if (err < 0) {
>   			netdev_err(adapter->netdev, "Timeout reading IGC_PTM_STAT register\n");
>   			return err;
> @@ -1021,15 +1036,7 @@ static int igc_phc_get_syncdevicetime(ktime_t *device,
>   		if ((stat & IGC_PTM_STAT_VALID) == IGC_PTM_STAT_VALID)
>   			break;
>   
> -		if (stat & ~IGC_PTM_STAT_VALID) {
> -			/* An error occurred, log it. */
> -			igc_ptm_log_error(adapter, stat);
> -			/* The STAT register is write-1-to-clear (W1C),
> -			 * so write the previous error status to clear it.
> -			 */
> -			wr32(IGC_PTM_STAT, stat);
> -			continue;
> -		}
> +		igc_ptm_log_error(adapter, stat);

Could this refactoring be a separate commit?

>   	} while (--count);
>   
>   	if (!count) {
> @@ -1255,7 +1262,7 @@ void igc_ptp_stop(struct igc_adapter *adapter)
>   void igc_ptp_reset(struct igc_adapter *adapter)
>   {
>   	struct igc_hw *hw = &adapter->hw;
> -	u32 cycle_ctrl, ctrl;
> +	u32 cycle_ctrl, ctrl, stat;
>   	unsigned long flags;
>   	u32 timadj;
>   
> @@ -1290,14 +1297,19 @@ void igc_ptp_reset(struct igc_adapter *adapter)
>   		ctrl = IGC_PTM_CTRL_EN |
>   			IGC_PTM_CTRL_START_NOW |
>   			IGC_PTM_CTRL_SHRT_CYC(IGC_PTM_SHORT_CYC_DEFAULT) |
> -			IGC_PTM_CTRL_PTM_TO(IGC_PTM_TIMEOUT_DEFAULT) |
> -			IGC_PTM_CTRL_TRIG;
> +			IGC_PTM_CTRL_PTM_TO(IGC_PTM_TIMEOUT_DEFAULT);
>   
>   		wr32(IGC_PTM_CTRL, ctrl);
>   
>   		/* Force the first cycle to run. */
> -		wr32(IGC_PTM_STAT, IGC_PTM_STAT_VALID);
> +		igc_ptm_trigger(hw);
> +
> +		if (readx_poll_timeout_atomic(rd32, IGC_PTM_STAT, stat,
> +					      stat, IGC_PTM_STAT_SLEEP,
> +					      IGC_PTM_STAT_TIMEOUT))
> +			netdev_err(adapter->netdev, "Timeout reading IGC_PTM_STAT register\n");

I’d add the timeout value to the message.

>   
> +		igc_ptm_reset(hw);
>   		break;
>   	default:
>   		/* No work to do. */


Kind regards,

Paul
Corinna Vinschen Aug. 7, 2024, 4:15 p.m. UTC | #2
Hi Christopher,

On Aug  6 17:30, christopher.s.hall@intel.com wrote:
> From: Christopher S M Hall <christopher.s.hall@intel.com>
> 
> Writing to clear the PTM status 'valid' bit while the PTM cycle is
> triggered results in unreliable PTM operation. To fix this, clear the
> PTM 'trigger' and status after each PTM transaction.
> 
> The issue can be reproduced with the following:
> 
> $ sudo phc2sys -R 1000 -O 0 -i tsn0 -m
> 
> Note: 1000 Hz (-R 1000) is unrealistically large, but provides a way to
> quickly reproduce the issue.
> 
> PHC2SYS exits with:
> 
> "ioctl PTP_OFFSET_PRECISE: Connection timed out" when the PTM transaction
>   fails

It would be great to add the problems encountered with kdump to the
commit message as well, as discussed with Vinicius, wouldn't it?

If you need a description, I can provide one.


Thanks,
Corinna
Hall, Christopher S Aug. 7, 2024, 8:27 p.m. UTC | #3
Hi Corrina,

> > PHC2SYS exits with:
> >
> > "ioctl PTP_OFFSET_PRECISE: Connection timed out" when the PTM
> transaction
> >   fails
> 
> It would be great to add the problems encountered with kdump to the
> commit message as well, as discussed with Vinicius, wouldn't it?
> 
> If you need a description, I can provide one.

Does this patch fix the issue you observed? If it does, I am happy to
include your description of the problem it solves. A tested-by tag would
be appreciated as well.

Thanks,
Chris
Corinna Vinschen Aug. 8, 2024, 2:50 p.m. UTC | #4
Hi Christopher,

On Aug  7 20:27, Hall, Christopher S wrote:
> Hi Corrina,

s/rrin/rinn/ ;)

> > > PHC2SYS exits with:
> > >
> > > "ioctl PTP_OFFSET_PRECISE: Connection timed out" when the PTM
> > transaction
> > >   fails
> > 
> > It would be great to add the problems encountered with kdump to the
> > commit message as well, as discussed with Vinicius, wouldn't it?
> > 
> > If you need a description, I can provide one.
> 
> Does this patch fix the issue you observed?

Yes, it does.  

> If it does, I am happy to
> include your description of the problem it solves.

Is the following ok?

  This patch also fixes a hang in igc_probe() when loading the igc
  driver in the kdump kernel on systems supporting PTM.

  The igc driver running in the base kernel enables PTM trigger in
  igc_probe().  Therefore the driver is always in PTM trigger mode,
  except in brief periods when manually triggering a PTM cycle.

  When a crash occurs, the NIC is reset while PTM trigger is enabled.
  Due to a hardware problem, the NIC is subsequently in a bad busmaster
  state and doesn't handle register reads/writes.  When running
  igc_probe() in the kdump kernel, the first register access to a NIC
  register hangs driver probing and ultimately breaks kdump.

  With this patch, igc has PTM trigger disabled most of the time,
  and the trigger is only enabled for very brief (10 - 100 us) periods
  when manually triggering a PTM cycle.  Chances that a crash occurs
  during a PTM trigger are not 0, but extremly reduced.

> A tested-by tag would be appreciated as well.

Reviewed-by: Corinna Vinschen <vinschen@redhat.com>
Tested-by: Corinna Vinschen <vinschen@redhat.com> (kdump hang only)


Thanks,
Corinna
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h
index 511384f3ec5c..ec191d26c650 100644
--- a/drivers/net/ethernet/intel/igc/igc_defines.h
+++ b/drivers/net/ethernet/intel/igc/igc_defines.h
@@ -583,6 +583,7 @@ 
 #define IGC_PTM_STAT_T4M1_OVFL		BIT(3) /* T4 minus T1 overflow */
 #define IGC_PTM_STAT_ADJUST_1ST		BIT(4) /* 1588 timer adjusted during 1st PTM cycle */
 #define IGC_PTM_STAT_ADJUST_CYC		BIT(5) /* 1588 timer adjusted during non-1st PTM cycle */
+#define IGC_PTM_STAT_ALL        	GENMASK(5, 0) /* Used to clear all status */
 
 /* PCIe PTM Cycle Control */
 #define IGC_PTM_CYCLE_CTRL_CYC_TIME(msec)	((msec) & 0x3ff) /* PTM Cycle Time (msec) */
diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c
index 946edbad4302..00cc80d8d164 100644
--- a/drivers/net/ethernet/intel/igc/igc_ptp.c
+++ b/drivers/net/ethernet/intel/igc/igc_ptp.c
@@ -974,11 +974,38 @@  static void igc_ptm_log_error(struct igc_adapter *adapter, u32 ptm_stat)
 	}
 }
 
+static void igc_ptm_trigger(struct igc_hw *hw)
+{
+	u32 ctrl;
+
+	/* To "manually" start the PTM cycle we need to set the
+	 * trigger (TRIG) bit
+	 */
+	ctrl = rd32(IGC_PTM_CTRL);
+	ctrl |= IGC_PTM_CTRL_TRIG;
+	wr32(IGC_PTM_CTRL, ctrl);
+	/* Perform flush after write to CTRL register otherwise
+	 * transaction may not start
+	 */
+	wrfl();
+}
+
+static void igc_ptm_reset(struct igc_hw *hw)
+{
+	u32 ctrl;
+
+	ctrl = rd32(IGC_PTM_CTRL);
+	ctrl &= ~IGC_PTM_CTRL_TRIG;
+	wr32(IGC_PTM_CTRL, ctrl);
+	/* Write to clear all status */
+	wr32(IGC_PTM_STAT, IGC_PTM_STAT_ALL);
+}
+
 static int igc_phc_get_syncdevicetime(ktime_t *device,
 				      struct system_counterval_t *system,
 				      void *ctx)
 {
-	u32 stat, t2_curr_h, t2_curr_l, ctrl;
+	u32 stat, t2_curr_h, t2_curr_l;
 	struct igc_adapter *adapter = ctx;
 	struct igc_hw *hw = &adapter->hw;
 	int err, count = 100;
@@ -994,25 +1021,13 @@  static int igc_phc_get_syncdevicetime(ktime_t *device,
 		 * are transitory. Repeating the process returns valid
 		 * data eventually.
 		 */
-
-		/* To "manually" start the PTM cycle we need to clear and
-		 * then set again the TRIG bit.
-		 */
-		ctrl = rd32(IGC_PTM_CTRL);
-		ctrl &= ~IGC_PTM_CTRL_TRIG;
-		wr32(IGC_PTM_CTRL, ctrl);
-		ctrl |= IGC_PTM_CTRL_TRIG;
-		wr32(IGC_PTM_CTRL, ctrl);
-
-		/* The cycle only starts "for real" when software notifies
-		 * that it has read the registers, this is done by setting
-		 * VALID bit.
-		 */
-		wr32(IGC_PTM_STAT, IGC_PTM_STAT_VALID);
+		igc_ptm_trigger(hw);
 
 		err = readx_poll_timeout(rd32, IGC_PTM_STAT, stat,
 					 stat, IGC_PTM_STAT_SLEEP,
 					 IGC_PTM_STAT_TIMEOUT);
+		igc_ptm_reset(hw);
+
 		if (err < 0) {
 			netdev_err(adapter->netdev, "Timeout reading IGC_PTM_STAT register\n");
 			return err;
@@ -1021,15 +1036,7 @@  static int igc_phc_get_syncdevicetime(ktime_t *device,
 		if ((stat & IGC_PTM_STAT_VALID) == IGC_PTM_STAT_VALID)
 			break;
 
-		if (stat & ~IGC_PTM_STAT_VALID) {
-			/* An error occurred, log it. */
-			igc_ptm_log_error(adapter, stat);
-			/* The STAT register is write-1-to-clear (W1C),
-			 * so write the previous error status to clear it.
-			 */
-			wr32(IGC_PTM_STAT, stat);
-			continue;
-		}
+		igc_ptm_log_error(adapter, stat);
 	} while (--count);
 
 	if (!count) {
@@ -1255,7 +1262,7 @@  void igc_ptp_stop(struct igc_adapter *adapter)
 void igc_ptp_reset(struct igc_adapter *adapter)
 {
 	struct igc_hw *hw = &adapter->hw;
-	u32 cycle_ctrl, ctrl;
+	u32 cycle_ctrl, ctrl, stat;
 	unsigned long flags;
 	u32 timadj;
 
@@ -1290,14 +1297,19 @@  void igc_ptp_reset(struct igc_adapter *adapter)
 		ctrl = IGC_PTM_CTRL_EN |
 			IGC_PTM_CTRL_START_NOW |
 			IGC_PTM_CTRL_SHRT_CYC(IGC_PTM_SHORT_CYC_DEFAULT) |
-			IGC_PTM_CTRL_PTM_TO(IGC_PTM_TIMEOUT_DEFAULT) |
-			IGC_PTM_CTRL_TRIG;
+			IGC_PTM_CTRL_PTM_TO(IGC_PTM_TIMEOUT_DEFAULT);
 
 		wr32(IGC_PTM_CTRL, ctrl);
 
 		/* Force the first cycle to run. */
-		wr32(IGC_PTM_STAT, IGC_PTM_STAT_VALID);
+		igc_ptm_trigger(hw);
+
+		if (readx_poll_timeout_atomic(rd32, IGC_PTM_STAT, stat,
+					      stat, IGC_PTM_STAT_SLEEP,
+					      IGC_PTM_STAT_TIMEOUT))
+			netdev_err(adapter->netdev, "Timeout reading IGC_PTM_STAT register\n");
 
+		igc_ptm_reset(hw);
 		break;
 	default:
 		/* No work to do. */