diff mbox series

[net-next,v4,12/12] igc: Add support for Frame Preemption verification

Message ID 20210626003314.3159402-13-vinicius.gomes@intel.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series ethtool: Add support for frame preemption | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 7 maintainers not CCed: hawk@kernel.org daniel@iogearbox.net bpf@vger.kernel.org ast@kernel.org john.fastabend@gmail.com davem@davemloft.net jesse.brandeburg@intel.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/header_inline success Link

Commit Message

Vinicius Costa Gomes June 26, 2021, 12:33 a.m. UTC
Add support for sending/receiving Frame Preemption verification
frames.

The i225 hardware doesn't implement the process of verification
internally, this is left to the driver.

Add a simple implementation of the state machine defined in IEEE
802.3-2018, Section 99.4.7.

For now, the state machine is started manually by the user, when
enabling verification. Example:

$ ethtool --set-frame-preemption IFACE disable-verify off

The "verified" condition is set to true when the SMD-V frame is sent,
and the SMD-R frame is received. So, it only tracks the transmission
side. This seems to be what's expected from IEEE 802.3-2018.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 drivers/net/ethernet/intel/igc/igc.h         |  15 ++
 drivers/net/ethernet/intel/igc/igc_defines.h |  13 ++
 drivers/net/ethernet/intel/igc/igc_ethtool.c |  20 +-
 drivers/net/ethernet/intel/igc/igc_main.c    | 216 +++++++++++++++++++
 4 files changed, 261 insertions(+), 3 deletions(-)

Comments

Vladimir Oltean June 28, 2021, 9:59 a.m. UTC | #1
On Fri, Jun 25, 2021 at 05:33:14PM -0700, Vinicius Costa Gomes wrote:
> Add support for sending/receiving Frame Preemption verification
> frames.
> 
> The i225 hardware doesn't implement the process of verification
> internally, this is left to the driver.
> 
> Add a simple implementation of the state machine defined in IEEE
> 802.3-2018, Section 99.4.7.
> 
> For now, the state machine is started manually by the user, when
> enabling verification. Example:
> 
> $ ethtool --set-frame-preemption IFACE disable-verify off
> 
> The "verified" condition is set to true when the SMD-V frame is sent,
> and the SMD-R frame is received. So, it only tracks the transmission
> side. This seems to be what's expected from IEEE 802.3-2018.
> 
> Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
> ---
>  drivers/net/ethernet/intel/igc/igc.h         |  15 ++
>  drivers/net/ethernet/intel/igc/igc_defines.h |  13 ++
>  drivers/net/ethernet/intel/igc/igc_ethtool.c |  20 +-
>  drivers/net/ethernet/intel/igc/igc_main.c    | 216 +++++++++++++++++++
>  4 files changed, 261 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
> index 9b2ddcbf65fb..84234efed781 100644
> --- a/drivers/net/ethernet/intel/igc/igc.h
> +++ b/drivers/net/ethernet/intel/igc/igc.h
> @@ -122,6 +122,13 @@ struct igc_ring {
>  	struct xsk_buff_pool *xsk_pool;
>  } ____cacheline_internodealigned_in_smp;
>  
> +enum frame_preemption_state {
> +	FRAME_PREEMPTION_STATE_FAILED,
> +	FRAME_PREEMPTION_STATE_DONE,
> +	FRAME_PREEMPTION_STATE_START,
> +	FRAME_PREEMPTION_STATE_SENT,
> +};
> +
>  /* Board specific private data structure */
>  struct igc_adapter {
>  	struct net_device *netdev;
> @@ -240,6 +247,14 @@ struct igc_adapter {
>  		struct timespec64 start;
>  		struct timespec64 period;
>  	} perout[IGC_N_PEROUT];
> +
> +	struct delayed_work fp_verification_work;
> +	unsigned long fp_start;
> +	bool fp_received_smd_v;
> +	bool fp_received_smd_r;
> +	unsigned int fp_verify_cnt;
> +	enum frame_preemption_state fp_tx_state;
> +	bool fp_disable_verify;
>  };
>  
>  void igc_up(struct igc_adapter *adapter);
> diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h
> index a2ea057d8e6e..cf46f5d5a505 100644
> --- a/drivers/net/ethernet/intel/igc/igc_defines.h
> +++ b/drivers/net/ethernet/intel/igc/igc_defines.h
> @@ -268,6 +268,8 @@
>  #define IGC_TXD_DTYP_C		0x00000000 /* Context Descriptor */
>  #define IGC_TXD_POPTS_IXSM	0x01       /* Insert IP checksum */
>  #define IGC_TXD_POPTS_TXSM	0x02       /* Insert TCP/UDP checksum */
> +#define IGC_TXD_POPTS_SMD_V	0x10       /* Transmitted packet is a SMD-Verify */
> +#define IGC_TXD_POPTS_SMD_R	0x20       /* Transmitted packet is a SMD-Response */
>  #define IGC_TXD_CMD_EOP		0x01000000 /* End of Packet */
>  #define IGC_TXD_CMD_IC		0x04000000 /* Insert Checksum */
>  #define IGC_TXD_CMD_DEXT	0x20000000 /* Desc extension (0 = legacy) */
> @@ -327,9 +329,20 @@
>  
>  #define IGC_RXDEXT_STATERR_LB	0x00040000
>  
> +#define IGC_RXD_STAT_SMD_V	0x2000  /* Received packet is SMD-Verify packet */
> +#define IGC_RXD_STAT_SMD_R	0x4000  /* Received packet is SMD-Response packet */
> +

So the i225 gives you the ability to select from multiple
Start-of-mPacket-Delimiter values on a per-TX descriptor basis?
And this is in addition to configuring that TX ring as preemptable I
guess? Because I notice that you're sending on the TX ring affine to the
current CPU that the verification work item is running on (which you
don't check anywhere that it is configured as going to the pMAC or not).
And on RX, it always gives you the kind of SMD that the packet had
(including the classic SFD for express packets)?
Cool.

It would be nice if I could connect back to back an i225 board with an
NXP LS1028A to see if the verification state machines pass both ways (on
LS1028A it is 100% hardware based, we just enable/disable the feature
and we can monitor the state changes via an interrupt).

>  /* Advanced Receive Descriptor bit definitions */
>  #define IGC_RXDADV_STAT_TSIP	0x08000 /* timestamp in packet */
>  
> +#define IGC_RXDADV_STAT_SMD_TYPE_MASK	0x06000
> +#define IGC_RXDADV_STAT_SMD_TYPE_SHIFT	13
> +
> +#define IGC_SMD_TYPE_SFD		0x0
> +#define IGC_SMD_TYPE_SMD_V		0x1
> +#define IGC_SMD_TYPE_SMD_R		0x2
> +#define IGC_SMD_TYPE_COMPLETE		0x3
> +
>  #define IGC_RXDEXT_STATERR_L4E		0x20000000
>  #define IGC_RXDEXT_STATERR_IPE		0x40000000
>  #define IGC_RXDEXT_STATERR_RXE		0x80000000
> diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
> index 84d5afe92154..f52a7be3af66 100644
> --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
> +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
> @@ -1649,6 +1649,8 @@ static int igc_ethtool_get_preempt(struct net_device *netdev,
>  
>  	fpcmd->enabled = adapter->frame_preemption_active;
>  	fpcmd->add_frag_size = adapter->add_frag_size;
> +	fpcmd->verified = adapter->fp_tx_state == FRAME_PREEMPTION_STATE_DONE;
> +	fpcmd->disable_verify = adapter->fp_disable_verify;
>  
>  	return 0;
>  }
> @@ -1664,10 +1666,22 @@ static int igc_ethtool_set_preempt(struct net_device *netdev,
>  		return -EINVAL;
>  	}
>  
> -	adapter->frame_preemption_active = fpcmd->enabled;
> -	adapter->add_frag_size = fpcmd->add_frag_size;
> +	if (!fpcmd->disable_verify && adapter->fp_disable_verify) {
> +		adapter->fp_tx_state = FRAME_PREEMPTION_STATE_START;
> +		schedule_delayed_work(&adapter->fp_verification_work, msecs_to_jiffies(10));

Not sure how much you'd like to tune this, but the spec has a
configurable verifyTime between 1 ms and 128 ms. You chose the default
value, so we should be ok for now.

> +	}
>  
> -	return igc_tsn_offload_apply(adapter);
> +	adapter->fp_disable_verify = fpcmd->disable_verify;
> +
> +	if (adapter->frame_preemption_active != fpcmd->enabled ||
> +	    adapter->add_frag_size != fpcmd->add_frag_size) {
> +		adapter->frame_preemption_active = fpcmd->enabled;
> +		adapter->add_frag_size = fpcmd->add_frag_size;
> +
> +		return igc_tsn_offload_apply(adapter);
> +	}
> +
> +	return 0;
>  }
>  
>  static int igc_ethtool_begin(struct net_device *netdev)
> diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
> index 20dac04a02f2..ed55bd13e4a1 100644
> --- a/drivers/net/ethernet/intel/igc/igc_main.c
> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
> @@ -28,6 +28,11 @@
>  #define IGC_XDP_TX		BIT(1)
>  #define IGC_XDP_REDIRECT	BIT(2)
>  
> +#define IGC_FP_TIMEOUT msecs_to_jiffies(100)
> +#define IGC_MAX_VERIFY_CNT 3
> +
> +#define IGC_FP_SMD_FRAME_SIZE 60
> +
>  static int debug = -1;
>  
>  MODULE_AUTHOR("Intel Corporation, <linux.nics@intel.com>");
> @@ -2169,6 +2174,79 @@ static int igc_xdp_init_tx_descriptor(struct igc_ring *ring,
>  	return 0;
>  }
>  
> +static int igc_fp_init_smd_frame(struct igc_ring *ring, struct igc_tx_buffer *buffer,
> +				 struct sk_buff *skb)
> +{
> +	dma_addr_t dma;
> +	unsigned int size;
> +
> +	size = skb_headlen(skb);
> +
> +	dma = dma_map_single(ring->dev, skb->data, size, DMA_TO_DEVICE);
> +	if (dma_mapping_error(ring->dev, dma)) {
> +		netdev_err_once(ring->netdev, "Failed to map DMA for TX\n");
> +		return -ENOMEM;
> +	}
> +
> +	buffer->skb = skb;
> +	buffer->protocol = 0;
> +	buffer->bytecount = skb->len;
> +	buffer->gso_segs = 1;
> +	buffer->time_stamp = jiffies;
> +	dma_unmap_len_set(buffer, len, skb->len);
> +	dma_unmap_addr_set(buffer, dma, dma);
> +
> +	return 0;
> +}
> +
> +static int igc_fp_init_tx_descriptor(struct igc_ring *ring,
> +				     struct sk_buff *skb, int type)
> +{
> +	struct igc_tx_buffer *buffer;
> +	union igc_adv_tx_desc *desc;
> +	u32 cmd_type, olinfo_status;
> +	int err;
> +
> +	if (!igc_desc_unused(ring))
> +		return -EBUSY;
> +
> +	buffer = &ring->tx_buffer_info[ring->next_to_use];
> +	err = igc_fp_init_smd_frame(ring, buffer, skb);
> +	if (err)
> +		return err;
> +
> +	cmd_type = IGC_ADVTXD_DTYP_DATA | IGC_ADVTXD_DCMD_DEXT |
> +		   IGC_ADVTXD_DCMD_IFCS | IGC_TXD_DCMD |
> +		   buffer->bytecount;
> +	olinfo_status = buffer->bytecount << IGC_ADVTXD_PAYLEN_SHIFT;
> +
> +	switch (type) {
> +	case IGC_SMD_TYPE_SMD_V:
> +		olinfo_status |= (IGC_TXD_POPTS_SMD_V << 8);
> +		break;
> +	case IGC_SMD_TYPE_SMD_R:
> +		olinfo_status |= (IGC_TXD_POPTS_SMD_R << 8);
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	desc = IGC_TX_DESC(ring, ring->next_to_use);
> +	desc->read.cmd_type_len = cpu_to_le32(cmd_type);
> +	desc->read.olinfo_status = cpu_to_le32(olinfo_status);
> +	desc->read.buffer_addr = cpu_to_le64(dma_unmap_addr(buffer, dma));
> +
> +	netdev_tx_sent_queue(txring_txq(ring), skb->len);
> +
> +	buffer->next_to_watch = desc;
> +
> +	ring->next_to_use++;
> +	if (ring->next_to_use == ring->count)
> +		ring->next_to_use = 0;
> +
> +	return 0;
> +}
> +
>  static struct igc_ring *igc_xdp_get_tx_ring(struct igc_adapter *adapter,
>  					    int cpu)
>  {
> @@ -2299,6 +2377,19 @@ static void igc_update_rx_stats(struct igc_q_vector *q_vector,
>  	q_vector->rx.total_bytes += bytes;
>  }
>  
> +static int igc_rx_desc_smd_type(union igc_adv_rx_desc *rx_desc)
> +{
> +	u32 status = le32_to_cpu(rx_desc->wb.upper.status_error);
> +
> +	return (status & IGC_RXDADV_STAT_SMD_TYPE_MASK)
> +		>> IGC_RXDADV_STAT_SMD_TYPE_SHIFT;
> +}
> +
> +static bool igc_check_smd_frame(struct igc_rx_buffer *rx_buffer, unsigned int size)
> +{
> +	return size == 60;

You should probably also verify that the contents is 60 octets of zeroes (sans the mCRC)?

> +}
> +
>  static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
>  {
>  	unsigned int total_bytes = 0, total_packets = 0;
> @@ -2315,6 +2406,7 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
>  		ktime_t timestamp = 0;
>  		struct xdp_buff xdp;
>  		int pkt_offset = 0;
> +		int smd_type;
>  		void *pktbuf;
>  
>  		/* return some buffers to hardware, one at a time is too slow */
> @@ -2346,6 +2438,22 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
>  			size -= IGC_TS_HDR_LEN;
>  		}
>  
> +		smd_type = igc_rx_desc_smd_type(rx_desc);
> +
> +		if (smd_type == IGC_SMD_TYPE_SMD_V || smd_type == IGC_SMD_TYPE_SMD_R) {

I guess the performance people will love you for this change. You should
probably guard it by an "if (unlikely(disableVerify == false))" condition.

> +			if (igc_check_smd_frame(rx_buffer, size)) {
> +				adapter->fp_received_smd_v = smd_type == IGC_SMD_TYPE_SMD_V;
> +				adapter->fp_received_smd_r = smd_type == IGC_SMD_TYPE_SMD_R;
> +				schedule_delayed_work(&adapter->fp_verification_work, 0);
> +			}
> +
> +			/* Advance the ring next-to-clean */
> +			igc_is_non_eop(rx_ring, rx_desc);
> +
> +			cleaned_count++;
> +			continue;
> +		}
> +
>  		if (!skb) {
>  			xdp_init_buff(&xdp, truesize, &rx_ring->xdp_rxq);
>  			xdp_prepare_buff(&xdp, pktbuf - igc_rx_offset(rx_ring),
> @@ -5607,6 +5715,107 @@ static int igc_tsn_enable_qbv_scheduling(struct igc_adapter *adapter,
>  	return igc_tsn_offload_apply(adapter);
>  }
>  
> +/* I225 doesn't send the SMD frames automatically, we need to handle
> + * them ourselves.
> + */
> +static int igc_xmit_smd_frame(struct igc_adapter *adapter, int type)
> +{
> +	int cpu = smp_processor_id();
> +	struct netdev_queue *nq;
> +	struct igc_ring *ring;
> +	struct sk_buff *skb;
> +	void *data;
> +	int err;
> +
> +	if (!netif_running(adapter->netdev))
> +		return -ENOTCONN;
> +
> +	/* FIXME: rename this function to something less specific, as
> +	 * it can be used outside XDP.
> +	 */
> +	ring = igc_xdp_get_tx_ring(adapter, cpu);
> +	nq = txring_txq(ring);
> +
> +	skb = alloc_skb(IGC_FP_SMD_FRAME_SIZE, GFP_KERNEL);
> +	if (!skb)
> +		return -ENOMEM;
> +
> +	data = skb_put(skb, IGC_FP_SMD_FRAME_SIZE);
> +	memset(data, 0, IGC_FP_SMD_FRAME_SIZE);
> +
> +	__netif_tx_lock(nq, cpu);
> +
> +	err = igc_fp_init_tx_descriptor(ring, skb, type);
> +
> +	igc_flush_tx_descriptors(ring);
> +
> +	__netif_tx_unlock(nq);
> +
> +	return err;
> +}
> +
> +static void igc_fp_verification_work(struct work_struct *work)
> +{
> +	struct delayed_work *dwork = to_delayed_work(work);
> +	struct igc_adapter *adapter;
> +	int err;
> +
> +	adapter = container_of(dwork, struct igc_adapter, fp_verification_work);
> +
> +	if (adapter->fp_disable_verify)
> +		goto done;
> +
> +	switch (adapter->fp_tx_state) {
> +	case FRAME_PREEMPTION_STATE_START:
> +		adapter->fp_received_smd_r = false;
> +		err = igc_xmit_smd_frame(adapter, IGC_SMD_TYPE_SMD_V);
> +		if (err < 0)
> +			netdev_err(adapter->netdev, "Error sending SMD-V frame\n");

On TX error should you really advance to the STATE_SENT?

> +
> +		adapter->fp_tx_state = FRAME_PREEMPTION_STATE_SENT;
> +		adapter->fp_start = jiffies;
> +		schedule_delayed_work(&adapter->fp_verification_work, IGC_FP_TIMEOUT);
> +		break;
> +
> +	case FRAME_PREEMPTION_STATE_SENT:
> +		if (adapter->fp_received_smd_r) {
> +			adapter->fp_tx_state = FRAME_PREEMPTION_STATE_DONE;
> +			adapter->fp_received_smd_r = false;
> +			break;
> +		}
> +
> +		if (time_is_before_jiffies(adapter->fp_start + IGC_FP_TIMEOUT)) {
> +			adapter->fp_verify_cnt++;
> +			netdev_warn(adapter->netdev, "Timeout waiting for SMD-R frame\n");
> +
> +			if (adapter->fp_verify_cnt > IGC_MAX_VERIFY_CNT) {
> +				adapter->fp_verify_cnt = 0;
> +				adapter->fp_tx_state = FRAME_PREEMPTION_STATE_FAILED;
> +				netdev_err(adapter->netdev,
> +					   "Exceeded number of attempts for frame preemption verification\n");
> +			} else {
> +				adapter->fp_tx_state = FRAME_PREEMPTION_STATE_START;
> +			}
> +			schedule_delayed_work(&adapter->fp_verification_work, IGC_FP_TIMEOUT);
> +		}
> +
> +		break;
> +
> +	case FRAME_PREEMPTION_STATE_FAILED:
> +	case FRAME_PREEMPTION_STATE_DONE:
> +		break;
> +	}
> +
> +done:
> +	if (adapter->fp_received_smd_v) {
> +		err = igc_xmit_smd_frame(adapter, IGC_SMD_TYPE_SMD_R);
> +		if (err < 0)
> +			netdev_err(adapter->netdev, "Error sending SMD-R frame\n");
> +
> +		adapter->fp_received_smd_v = false;
> +	}
> +}
> +
>  static int igc_setup_tc(struct net_device *dev, enum tc_setup_type type,
>  			void *type_data)
>  {
> @@ -6023,6 +6232,7 @@ static int igc_probe(struct pci_dev *pdev,
>  
>  	INIT_WORK(&adapter->reset_task, igc_reset_task);
>  	INIT_WORK(&adapter->watchdog_task, igc_watchdog_task);
> +	INIT_DELAYED_WORK(&adapter->fp_verification_work, igc_fp_verification_work);
>  
>  	/* Initialize link properties that are user-changeable */
>  	adapter->fc_autoneg = true;
> @@ -6044,6 +6254,12 @@ static int igc_probe(struct pci_dev *pdev,
>  
>  	igc_ptp_init(adapter);
>  
> +	/* FIXME: This sets the default to not do the verification
> +	 * automatically, when we have support in multiple
> +	 * controllers, this default can be changed.
> +	 */
> +	adapter->fp_disable_verify = true;
> +

Hmmmmm. So we need to instruct our users to explicitly enable
verification in their ethtool-based scripts, since the default values
will vary wildly from one vendor to another. On LS1028A I see no reason
why verification would be disabled by default.

>  	/* reset the hardware with the new settings */
>  	igc_reset(adapter);
>  
> -- 
> 2.32.0
>
Vinicius Costa Gomes April 12, 2022, 12:13 a.m. UTC | #2
Vladimir Oltean <vladimir.oltean@nxp.com> writes:

> On Fri, Jun 25, 2021 at 05:33:14PM -0700, Vinicius Costa Gomes wrote:
>> Add support for sending/receiving Frame Preemption verification
>> frames.
>> 
>> The i225 hardware doesn't implement the process of verification
>> internally, this is left to the driver.
>> 
>> Add a simple implementation of the state machine defined in IEEE
>> 802.3-2018, Section 99.4.7.
>> 
>> For now, the state machine is started manually by the user, when
>> enabling verification. Example:
>> 
>> $ ethtool --set-frame-preemption IFACE disable-verify off
>> 
>> The "verified" condition is set to true when the SMD-V frame is sent,
>> and the SMD-R frame is received. So, it only tracks the transmission
>> side. This seems to be what's expected from IEEE 802.3-2018.
>> 
>> Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
>> ---
>>  drivers/net/ethernet/intel/igc/igc.h         |  15 ++
>>  drivers/net/ethernet/intel/igc/igc_defines.h |  13 ++
>>  drivers/net/ethernet/intel/igc/igc_ethtool.c |  20 +-
>>  drivers/net/ethernet/intel/igc/igc_main.c    | 216 +++++++++++++++++++
>>  4 files changed, 261 insertions(+), 3 deletions(-)
>> 
>> diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
>> index 9b2ddcbf65fb..84234efed781 100644
>> --- a/drivers/net/ethernet/intel/igc/igc.h
>> +++ b/drivers/net/ethernet/intel/igc/igc.h
>> @@ -122,6 +122,13 @@ struct igc_ring {
>>  	struct xsk_buff_pool *xsk_pool;
>>  } ____cacheline_internodealigned_in_smp;
>>  
>> +enum frame_preemption_state {
>> +	FRAME_PREEMPTION_STATE_FAILED,
>> +	FRAME_PREEMPTION_STATE_DONE,
>> +	FRAME_PREEMPTION_STATE_START,
>> +	FRAME_PREEMPTION_STATE_SENT,
>> +};
>> +
>>  /* Board specific private data structure */
>>  struct igc_adapter {
>>  	struct net_device *netdev;
>> @@ -240,6 +247,14 @@ struct igc_adapter {
>>  		struct timespec64 start;
>>  		struct timespec64 period;
>>  	} perout[IGC_N_PEROUT];
>> +
>> +	struct delayed_work fp_verification_work;
>> +	unsigned long fp_start;
>> +	bool fp_received_smd_v;
>> +	bool fp_received_smd_r;
>> +	unsigned int fp_verify_cnt;
>> +	enum frame_preemption_state fp_tx_state;
>> +	bool fp_disable_verify;
>>  };
>>  
>>  void igc_up(struct igc_adapter *adapter);
>> diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h
>> index a2ea057d8e6e..cf46f5d5a505 100644
>> --- a/drivers/net/ethernet/intel/igc/igc_defines.h
>> +++ b/drivers/net/ethernet/intel/igc/igc_defines.h
>> @@ -268,6 +268,8 @@
>>  #define IGC_TXD_DTYP_C		0x00000000 /* Context Descriptor */
>>  #define IGC_TXD_POPTS_IXSM	0x01       /* Insert IP checksum */
>>  #define IGC_TXD_POPTS_TXSM	0x02       /* Insert TCP/UDP checksum */
>> +#define IGC_TXD_POPTS_SMD_V	0x10       /* Transmitted packet is a SMD-Verify */
>> +#define IGC_TXD_POPTS_SMD_R	0x20       /* Transmitted packet is a SMD-Response */
>>  #define IGC_TXD_CMD_EOP		0x01000000 /* End of Packet */
>>  #define IGC_TXD_CMD_IC		0x04000000 /* Insert Checksum */
>>  #define IGC_TXD_CMD_DEXT	0x20000000 /* Desc extension (0 = legacy) */
>> @@ -327,9 +329,20 @@
>>  
>>  #define IGC_RXDEXT_STATERR_LB	0x00040000
>>  
>> +#define IGC_RXD_STAT_SMD_V	0x2000  /* Received packet is SMD-Verify packet */
>> +#define IGC_RXD_STAT_SMD_R	0x4000  /* Received packet is SMD-Response packet */
>> +
>
> So the i225 gives you the ability to select from multiple
> Start-of-mPacket-Delimiter values on a per-TX descriptor basis?
> And this is in addition to configuring that TX ring as preemptable I
> guess? Because I notice that you're sending on the TX ring affine to the
> current CPU that the verification work item is running on (which you
> don't check anywhere that it is configured as going to the pMAC or
> not).

Yeah, talking to the hardware folks, those descriptors are handled
differently by the hardware.

> And on RX, it always gives you the kind of SMD that the packet had
> (including the classic SFD for express packets)?
> Cool.

I would use another word, but yeah :-)

>
> It would be nice if I could connect back to back an i225 board with an
> NXP LS1028A to see if the verification state machines pass both ways (on
> LS1028A it is 100% hardware based, we just enable/disable the feature
> and we can monitor the state changes via an interrupt).
>

My life would be easier if that were the case here.

>>  /* Advanced Receive Descriptor bit definitions */
>>  #define IGC_RXDADV_STAT_TSIP	0x08000 /* timestamp in packet */
>>  
>> +#define IGC_RXDADV_STAT_SMD_TYPE_MASK	0x06000
>> +#define IGC_RXDADV_STAT_SMD_TYPE_SHIFT	13
>> +
>> +#define IGC_SMD_TYPE_SFD		0x0
>> +#define IGC_SMD_TYPE_SMD_V		0x1
>> +#define IGC_SMD_TYPE_SMD_R		0x2
>> +#define IGC_SMD_TYPE_COMPLETE		0x3
>> +
>>  #define IGC_RXDEXT_STATERR_L4E		0x20000000
>>  #define IGC_RXDEXT_STATERR_IPE		0x40000000
>>  #define IGC_RXDEXT_STATERR_RXE		0x80000000
>> diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
>> index 84d5afe92154..f52a7be3af66 100644
>> --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
>> +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
>> @@ -1649,6 +1649,8 @@ static int igc_ethtool_get_preempt(struct net_device *netdev,
>>  
>>  	fpcmd->enabled = adapter->frame_preemption_active;
>>  	fpcmd->add_frag_size = adapter->add_frag_size;
>> +	fpcmd->verified = adapter->fp_tx_state == FRAME_PREEMPTION_STATE_DONE;
>> +	fpcmd->disable_verify = adapter->fp_disable_verify;
>>  
>>  	return 0;
>>  }
>> @@ -1664,10 +1666,22 @@ static int igc_ethtool_set_preempt(struct net_device *netdev,
>>  		return -EINVAL;
>>  	}
>>  
>> -	adapter->frame_preemption_active = fpcmd->enabled;
>> -	adapter->add_frag_size = fpcmd->add_frag_size;
>> +	if (!fpcmd->disable_verify && adapter->fp_disable_verify) {
>> +		adapter->fp_tx_state = FRAME_PREEMPTION_STATE_START;
>> +		schedule_delayed_work(&adapter->fp_verification_work, msecs_to_jiffies(10));
>
> Not sure how much you'd like to tune this, but the spec has a
> configurable verifyTime between 1 ms and 128 ms. You chose the default
> value, so we should be ok for now.

We can add a configurable for that later, via ethtool for example.

>
>> +	}
>>  
>> -	return igc_tsn_offload_apply(adapter);
>> +	adapter->fp_disable_verify = fpcmd->disable_verify;
>> +
>> +	if (adapter->frame_preemption_active != fpcmd->enabled ||
>> +	    adapter->add_frag_size != fpcmd->add_frag_size) {
>> +		adapter->frame_preemption_active = fpcmd->enabled;
>> +		adapter->add_frag_size = fpcmd->add_frag_size;
>> +
>> +		return igc_tsn_offload_apply(adapter);
>> +	}
>> +
>> +	return 0;
>>  }
>>  
>>  static int igc_ethtool_begin(struct net_device *netdev)
>> diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
>> index 20dac04a02f2..ed55bd13e4a1 100644
>> --- a/drivers/net/ethernet/intel/igc/igc_main.c
>> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
>> @@ -28,6 +28,11 @@
>>  #define IGC_XDP_TX		BIT(1)
>>  #define IGC_XDP_REDIRECT	BIT(2)
>>  
>> +#define IGC_FP_TIMEOUT msecs_to_jiffies(100)
>> +#define IGC_MAX_VERIFY_CNT 3
>> +
>> +#define IGC_FP_SMD_FRAME_SIZE 60
>> +
>>  static int debug = -1;
>>  
>>  MODULE_AUTHOR("Intel Corporation, <linux.nics@intel.com>");
>> @@ -2169,6 +2174,79 @@ static int igc_xdp_init_tx_descriptor(struct igc_ring *ring,
>>  	return 0;
>>  }
>>  
>> +static int igc_fp_init_smd_frame(struct igc_ring *ring, struct igc_tx_buffer *buffer,
>> +				 struct sk_buff *skb)
>> +{
>> +	dma_addr_t dma;
>> +	unsigned int size;
>> +
>> +	size = skb_headlen(skb);
>> +
>> +	dma = dma_map_single(ring->dev, skb->data, size, DMA_TO_DEVICE);
>> +	if (dma_mapping_error(ring->dev, dma)) {
>> +		netdev_err_once(ring->netdev, "Failed to map DMA for TX\n");
>> +		return -ENOMEM;
>> +	}
>> +
>> +	buffer->skb = skb;
>> +	buffer->protocol = 0;
>> +	buffer->bytecount = skb->len;
>> +	buffer->gso_segs = 1;
>> +	buffer->time_stamp = jiffies;
>> +	dma_unmap_len_set(buffer, len, skb->len);
>> +	dma_unmap_addr_set(buffer, dma, dma);
>> +
>> +	return 0;
>> +}
>> +
>> +static int igc_fp_init_tx_descriptor(struct igc_ring *ring,
>> +				     struct sk_buff *skb, int type)
>> +{
>> +	struct igc_tx_buffer *buffer;
>> +	union igc_adv_tx_desc *desc;
>> +	u32 cmd_type, olinfo_status;
>> +	int err;
>> +
>> +	if (!igc_desc_unused(ring))
>> +		return -EBUSY;
>> +
>> +	buffer = &ring->tx_buffer_info[ring->next_to_use];
>> +	err = igc_fp_init_smd_frame(ring, buffer, skb);
>> +	if (err)
>> +		return err;
>> +
>> +	cmd_type = IGC_ADVTXD_DTYP_DATA | IGC_ADVTXD_DCMD_DEXT |
>> +		   IGC_ADVTXD_DCMD_IFCS | IGC_TXD_DCMD |
>> +		   buffer->bytecount;
>> +	olinfo_status = buffer->bytecount << IGC_ADVTXD_PAYLEN_SHIFT;
>> +
>> +	switch (type) {
>> +	case IGC_SMD_TYPE_SMD_V:
>> +		olinfo_status |= (IGC_TXD_POPTS_SMD_V << 8);
>> +		break;
>> +	case IGC_SMD_TYPE_SMD_R:
>> +		olinfo_status |= (IGC_TXD_POPTS_SMD_R << 8);
>> +		break;
>> +	default:
>> +		return -EINVAL;
>> +	}
>> +
>> +	desc = IGC_TX_DESC(ring, ring->next_to_use);
>> +	desc->read.cmd_type_len = cpu_to_le32(cmd_type);
>> +	desc->read.olinfo_status = cpu_to_le32(olinfo_status);
>> +	desc->read.buffer_addr = cpu_to_le64(dma_unmap_addr(buffer, dma));
>> +
>> +	netdev_tx_sent_queue(txring_txq(ring), skb->len);
>> +
>> +	buffer->next_to_watch = desc;
>> +
>> +	ring->next_to_use++;
>> +	if (ring->next_to_use == ring->count)
>> +		ring->next_to_use = 0;
>> +
>> +	return 0;
>> +}
>> +
>>  static struct igc_ring *igc_xdp_get_tx_ring(struct igc_adapter *adapter,
>>  					    int cpu)
>>  {
>> @@ -2299,6 +2377,19 @@ static void igc_update_rx_stats(struct igc_q_vector *q_vector,
>>  	q_vector->rx.total_bytes += bytes;
>>  }
>>  
>> +static int igc_rx_desc_smd_type(union igc_adv_rx_desc *rx_desc)
>> +{
>> +	u32 status = le32_to_cpu(rx_desc->wb.upper.status_error);
>> +
>> +	return (status & IGC_RXDADV_STAT_SMD_TYPE_MASK)
>> +		>> IGC_RXDADV_STAT_SMD_TYPE_SHIFT;
>> +}
>> +
>> +static bool igc_check_smd_frame(struct igc_rx_buffer *rx_buffer, unsigned int size)
>> +{
>> +	return size == 60;
>
> You should probably also verify that the contents is 60 octets of zeroes (sans the mCRC)?
>

Yeah, I will add some checks for that.

>> +}
>> +
>>  static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
>>  {
>>  	unsigned int total_bytes = 0, total_packets = 0;
>> @@ -2315,6 +2406,7 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
>>  		ktime_t timestamp = 0;
>>  		struct xdp_buff xdp;
>>  		int pkt_offset = 0;
>> +		int smd_type;
>>  		void *pktbuf;
>>  
>>  		/* return some buffers to hardware, one at a time is too slow */
>> @@ -2346,6 +2438,22 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
>>  			size -= IGC_TS_HDR_LEN;
>>  		}
>>  
>> +		smd_type = igc_rx_desc_smd_type(rx_desc);
>> +
>> +		if (smd_type == IGC_SMD_TYPE_SMD_V || smd_type == IGC_SMD_TYPE_SMD_R) {
>
> I guess the performance people will love you for this change. You should
> probably guard it by an "if (unlikely(disableVerify == false))" condition.
>

Will add the unlikely().

>> +			if (igc_check_smd_frame(rx_buffer, size)) {
>> +				adapter->fp_received_smd_v = smd_type == IGC_SMD_TYPE_SMD_V;
>> +				adapter->fp_received_smd_r = smd_type == IGC_SMD_TYPE_SMD_R;
>> +				schedule_delayed_work(&adapter->fp_verification_work, 0);
>> +			}
>> +
>> +			/* Advance the ring next-to-clean */
>> +			igc_is_non_eop(rx_ring, rx_desc);
>> +
>> +			cleaned_count++;
>> +			continue;
>> +		}
>> +
>>  		if (!skb) {
>>  			xdp_init_buff(&xdp, truesize, &rx_ring->xdp_rxq);
>>  			xdp_prepare_buff(&xdp, pktbuf - igc_rx_offset(rx_ring),
>> @@ -5607,6 +5715,107 @@ static int igc_tsn_enable_qbv_scheduling(struct igc_adapter *adapter,
>>  	return igc_tsn_offload_apply(adapter);
>>  }
>>  
>> +/* I225 doesn't send the SMD frames automatically, we need to handle
>> + * them ourselves.
>> + */
>> +static int igc_xmit_smd_frame(struct igc_adapter *adapter, int type)
>> +{
>> +	int cpu = smp_processor_id();
>> +	struct netdev_queue *nq;
>> +	struct igc_ring *ring;
>> +	struct sk_buff *skb;
>> +	void *data;
>> +	int err;
>> +
>> +	if (!netif_running(adapter->netdev))
>> +		return -ENOTCONN;
>> +
>> +	/* FIXME: rename this function to something less specific, as
>> +	 * it can be used outside XDP.
>> +	 */
>> +	ring = igc_xdp_get_tx_ring(adapter, cpu);
>> +	nq = txring_txq(ring);
>> +
>> +	skb = alloc_skb(IGC_FP_SMD_FRAME_SIZE, GFP_KERNEL);
>> +	if (!skb)
>> +		return -ENOMEM;
>> +
>> +	data = skb_put(skb, IGC_FP_SMD_FRAME_SIZE);
>> +	memset(data, 0, IGC_FP_SMD_FRAME_SIZE);
>> +
>> +	__netif_tx_lock(nq, cpu);
>> +
>> +	err = igc_fp_init_tx_descriptor(ring, skb, type);
>> +
>> +	igc_flush_tx_descriptors(ring);
>> +
>> +	__netif_tx_unlock(nq);
>> +
>> +	return err;
>> +}
>> +
>> +static void igc_fp_verification_work(struct work_struct *work)
>> +{
>> +	struct delayed_work *dwork = to_delayed_work(work);
>> +	struct igc_adapter *adapter;
>> +	int err;
>> +
>> +	adapter = container_of(dwork, struct igc_adapter, fp_verification_work);
>> +
>> +	if (adapter->fp_disable_verify)
>> +		goto done;
>> +
>> +	switch (adapter->fp_tx_state) {
>> +	case FRAME_PREEMPTION_STATE_START:
>> +		adapter->fp_received_smd_r = false;
>> +		err = igc_xmit_smd_frame(adapter, IGC_SMD_TYPE_SMD_V);
>> +		if (err < 0)
>> +			netdev_err(adapter->netdev, "Error sending SMD-V frame\n");
>
> On TX error should you really advance to the STATE_SENT?
>

We tried to send a SMD-V frame and it failed, the error was probably
transient (unable to allocate memory) and it's going to be retried later.

>> +
>> +		adapter->fp_tx_state = FRAME_PREEMPTION_STATE_SENT;
>> +		adapter->fp_start = jiffies;
>> +		schedule_delayed_work(&adapter->fp_verification_work, IGC_FP_TIMEOUT);
>> +		break;
>> +
>> +	case FRAME_PREEMPTION_STATE_SENT:
>> +		if (adapter->fp_received_smd_r) {
>> +			adapter->fp_tx_state = FRAME_PREEMPTION_STATE_DONE;
>> +			adapter->fp_received_smd_r = false;
>> +			break;
>> +		}
>> +
>> +		if (time_is_before_jiffies(adapter->fp_start + IGC_FP_TIMEOUT)) {
>> +			adapter->fp_verify_cnt++;
>> +			netdev_warn(adapter->netdev, "Timeout waiting for SMD-R frame\n");
>> +
>> +			if (adapter->fp_verify_cnt > IGC_MAX_VERIFY_CNT) {
>> +				adapter->fp_verify_cnt = 0;
>> +				adapter->fp_tx_state = FRAME_PREEMPTION_STATE_FAILED;
>> +				netdev_err(adapter->netdev,
>> +					   "Exceeded number of attempts for frame preemption verification\n");
>> +			} else {
>> +				adapter->fp_tx_state = FRAME_PREEMPTION_STATE_START;
>> +			}
>> +			schedule_delayed_work(&adapter->fp_verification_work, IGC_FP_TIMEOUT);
>> +		}
>> +
>> +		break;
>> +
>> +	case FRAME_PREEMPTION_STATE_FAILED:
>> +	case FRAME_PREEMPTION_STATE_DONE:
>> +		break;
>> +	}
>> +
>> +done:
>> +	if (adapter->fp_received_smd_v) {
>> +		err = igc_xmit_smd_frame(adapter, IGC_SMD_TYPE_SMD_R);
>> +		if (err < 0)
>> +			netdev_err(adapter->netdev, "Error sending SMD-R frame\n");
>> +
>> +		adapter->fp_received_smd_v = false;
>> +	}
>> +}
>> +
>>  static int igc_setup_tc(struct net_device *dev, enum tc_setup_type type,
>>  			void *type_data)
>>  {
>> @@ -6023,6 +6232,7 @@ static int igc_probe(struct pci_dev *pdev,
>>  
>>  	INIT_WORK(&adapter->reset_task, igc_reset_task);
>>  	INIT_WORK(&adapter->watchdog_task, igc_watchdog_task);
>> +	INIT_DELAYED_WORK(&adapter->fp_verification_work, igc_fp_verification_work);
>>  
>>  	/* Initialize link properties that are user-changeable */
>>  	adapter->fc_autoneg = true;
>> @@ -6044,6 +6254,12 @@ static int igc_probe(struct pci_dev *pdev,
>>  
>>  	igc_ptp_init(adapter);
>>  
>> +	/* FIXME: This sets the default to not do the verification
>> +	 * automatically, when we have support in multiple
>> +	 * controllers, this default can be changed.
>> +	 */
>> +	adapter->fp_disable_verify = true;
>> +
>
> Hmmmmm. So we need to instruct our users to explicitly enable
> verification in their ethtool-based scripts, since the default values
> will vary wildly from one vendor to another. On LS1028A I see no reason
> why verification would be disabled by default.
>

Reading 99.4.3 (IEEE 802.3-2018) again, that "Verification may be disabled"
seems to imply that it should be enabled by default.

I will change this.

>>  	/* reset the hardware with the new settings */
>>  	igc_reset(adapter);
>>  
>> -- 
>> 2.32.0
>>
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
index 9b2ddcbf65fb..84234efed781 100644
--- a/drivers/net/ethernet/intel/igc/igc.h
+++ b/drivers/net/ethernet/intel/igc/igc.h
@@ -122,6 +122,13 @@  struct igc_ring {
 	struct xsk_buff_pool *xsk_pool;
 } ____cacheline_internodealigned_in_smp;
 
+enum frame_preemption_state {
+	FRAME_PREEMPTION_STATE_FAILED,
+	FRAME_PREEMPTION_STATE_DONE,
+	FRAME_PREEMPTION_STATE_START,
+	FRAME_PREEMPTION_STATE_SENT,
+};
+
 /* Board specific private data structure */
 struct igc_adapter {
 	struct net_device *netdev;
@@ -240,6 +247,14 @@  struct igc_adapter {
 		struct timespec64 start;
 		struct timespec64 period;
 	} perout[IGC_N_PEROUT];
+
+	struct delayed_work fp_verification_work;
+	unsigned long fp_start;
+	bool fp_received_smd_v;
+	bool fp_received_smd_r;
+	unsigned int fp_verify_cnt;
+	enum frame_preemption_state fp_tx_state;
+	bool fp_disable_verify;
 };
 
 void igc_up(struct igc_adapter *adapter);
diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h
index a2ea057d8e6e..cf46f5d5a505 100644
--- a/drivers/net/ethernet/intel/igc/igc_defines.h
+++ b/drivers/net/ethernet/intel/igc/igc_defines.h
@@ -268,6 +268,8 @@ 
 #define IGC_TXD_DTYP_C		0x00000000 /* Context Descriptor */
 #define IGC_TXD_POPTS_IXSM	0x01       /* Insert IP checksum */
 #define IGC_TXD_POPTS_TXSM	0x02       /* Insert TCP/UDP checksum */
+#define IGC_TXD_POPTS_SMD_V	0x10       /* Transmitted packet is a SMD-Verify */
+#define IGC_TXD_POPTS_SMD_R	0x20       /* Transmitted packet is a SMD-Response */
 #define IGC_TXD_CMD_EOP		0x01000000 /* End of Packet */
 #define IGC_TXD_CMD_IC		0x04000000 /* Insert Checksum */
 #define IGC_TXD_CMD_DEXT	0x20000000 /* Desc extension (0 = legacy) */
@@ -327,9 +329,20 @@ 
 
 #define IGC_RXDEXT_STATERR_LB	0x00040000
 
+#define IGC_RXD_STAT_SMD_V	0x2000  /* Received packet is SMD-Verify packet */
+#define IGC_RXD_STAT_SMD_R	0x4000  /* Received packet is SMD-Response packet */
+
 /* Advanced Receive Descriptor bit definitions */
 #define IGC_RXDADV_STAT_TSIP	0x08000 /* timestamp in packet */
 
+#define IGC_RXDADV_STAT_SMD_TYPE_MASK	0x06000
+#define IGC_RXDADV_STAT_SMD_TYPE_SHIFT	13
+
+#define IGC_SMD_TYPE_SFD		0x0
+#define IGC_SMD_TYPE_SMD_V		0x1
+#define IGC_SMD_TYPE_SMD_R		0x2
+#define IGC_SMD_TYPE_COMPLETE		0x3
+
 #define IGC_RXDEXT_STATERR_L4E		0x20000000
 #define IGC_RXDEXT_STATERR_IPE		0x40000000
 #define IGC_RXDEXT_STATERR_RXE		0x80000000
diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
index 84d5afe92154..f52a7be3af66 100644
--- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
+++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
@@ -1649,6 +1649,8 @@  static int igc_ethtool_get_preempt(struct net_device *netdev,
 
 	fpcmd->enabled = adapter->frame_preemption_active;
 	fpcmd->add_frag_size = adapter->add_frag_size;
+	fpcmd->verified = adapter->fp_tx_state == FRAME_PREEMPTION_STATE_DONE;
+	fpcmd->disable_verify = adapter->fp_disable_verify;
 
 	return 0;
 }
@@ -1664,10 +1666,22 @@  static int igc_ethtool_set_preempt(struct net_device *netdev,
 		return -EINVAL;
 	}
 
-	adapter->frame_preemption_active = fpcmd->enabled;
-	adapter->add_frag_size = fpcmd->add_frag_size;
+	if (!fpcmd->disable_verify && adapter->fp_disable_verify) {
+		adapter->fp_tx_state = FRAME_PREEMPTION_STATE_START;
+		schedule_delayed_work(&adapter->fp_verification_work, msecs_to_jiffies(10));
+	}
 
-	return igc_tsn_offload_apply(adapter);
+	adapter->fp_disable_verify = fpcmd->disable_verify;
+
+	if (adapter->frame_preemption_active != fpcmd->enabled ||
+	    adapter->add_frag_size != fpcmd->add_frag_size) {
+		adapter->frame_preemption_active = fpcmd->enabled;
+		adapter->add_frag_size = fpcmd->add_frag_size;
+
+		return igc_tsn_offload_apply(adapter);
+	}
+
+	return 0;
 }
 
 static int igc_ethtool_begin(struct net_device *netdev)
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 20dac04a02f2..ed55bd13e4a1 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -28,6 +28,11 @@ 
 #define IGC_XDP_TX		BIT(1)
 #define IGC_XDP_REDIRECT	BIT(2)
 
+#define IGC_FP_TIMEOUT msecs_to_jiffies(100)
+#define IGC_MAX_VERIFY_CNT 3
+
+#define IGC_FP_SMD_FRAME_SIZE 60
+
 static int debug = -1;
 
 MODULE_AUTHOR("Intel Corporation, <linux.nics@intel.com>");
@@ -2169,6 +2174,79 @@  static int igc_xdp_init_tx_descriptor(struct igc_ring *ring,
 	return 0;
 }
 
+static int igc_fp_init_smd_frame(struct igc_ring *ring, struct igc_tx_buffer *buffer,
+				 struct sk_buff *skb)
+{
+	dma_addr_t dma;
+	unsigned int size;
+
+	size = skb_headlen(skb);
+
+	dma = dma_map_single(ring->dev, skb->data, size, DMA_TO_DEVICE);
+	if (dma_mapping_error(ring->dev, dma)) {
+		netdev_err_once(ring->netdev, "Failed to map DMA for TX\n");
+		return -ENOMEM;
+	}
+
+	buffer->skb = skb;
+	buffer->protocol = 0;
+	buffer->bytecount = skb->len;
+	buffer->gso_segs = 1;
+	buffer->time_stamp = jiffies;
+	dma_unmap_len_set(buffer, len, skb->len);
+	dma_unmap_addr_set(buffer, dma, dma);
+
+	return 0;
+}
+
+static int igc_fp_init_tx_descriptor(struct igc_ring *ring,
+				     struct sk_buff *skb, int type)
+{
+	struct igc_tx_buffer *buffer;
+	union igc_adv_tx_desc *desc;
+	u32 cmd_type, olinfo_status;
+	int err;
+
+	if (!igc_desc_unused(ring))
+		return -EBUSY;
+
+	buffer = &ring->tx_buffer_info[ring->next_to_use];
+	err = igc_fp_init_smd_frame(ring, buffer, skb);
+	if (err)
+		return err;
+
+	cmd_type = IGC_ADVTXD_DTYP_DATA | IGC_ADVTXD_DCMD_DEXT |
+		   IGC_ADVTXD_DCMD_IFCS | IGC_TXD_DCMD |
+		   buffer->bytecount;
+	olinfo_status = buffer->bytecount << IGC_ADVTXD_PAYLEN_SHIFT;
+
+	switch (type) {
+	case IGC_SMD_TYPE_SMD_V:
+		olinfo_status |= (IGC_TXD_POPTS_SMD_V << 8);
+		break;
+	case IGC_SMD_TYPE_SMD_R:
+		olinfo_status |= (IGC_TXD_POPTS_SMD_R << 8);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	desc = IGC_TX_DESC(ring, ring->next_to_use);
+	desc->read.cmd_type_len = cpu_to_le32(cmd_type);
+	desc->read.olinfo_status = cpu_to_le32(olinfo_status);
+	desc->read.buffer_addr = cpu_to_le64(dma_unmap_addr(buffer, dma));
+
+	netdev_tx_sent_queue(txring_txq(ring), skb->len);
+
+	buffer->next_to_watch = desc;
+
+	ring->next_to_use++;
+	if (ring->next_to_use == ring->count)
+		ring->next_to_use = 0;
+
+	return 0;
+}
+
 static struct igc_ring *igc_xdp_get_tx_ring(struct igc_adapter *adapter,
 					    int cpu)
 {
@@ -2299,6 +2377,19 @@  static void igc_update_rx_stats(struct igc_q_vector *q_vector,
 	q_vector->rx.total_bytes += bytes;
 }
 
+static int igc_rx_desc_smd_type(union igc_adv_rx_desc *rx_desc)
+{
+	u32 status = le32_to_cpu(rx_desc->wb.upper.status_error);
+
+	return (status & IGC_RXDADV_STAT_SMD_TYPE_MASK)
+		>> IGC_RXDADV_STAT_SMD_TYPE_SHIFT;
+}
+
+static bool igc_check_smd_frame(struct igc_rx_buffer *rx_buffer, unsigned int size)
+{
+	return size == 60;
+}
+
 static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
 {
 	unsigned int total_bytes = 0, total_packets = 0;
@@ -2315,6 +2406,7 @@  static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
 		ktime_t timestamp = 0;
 		struct xdp_buff xdp;
 		int pkt_offset = 0;
+		int smd_type;
 		void *pktbuf;
 
 		/* return some buffers to hardware, one at a time is too slow */
@@ -2346,6 +2438,22 @@  static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
 			size -= IGC_TS_HDR_LEN;
 		}
 
+		smd_type = igc_rx_desc_smd_type(rx_desc);
+
+		if (smd_type == IGC_SMD_TYPE_SMD_V || smd_type == IGC_SMD_TYPE_SMD_R) {
+			if (igc_check_smd_frame(rx_buffer, size)) {
+				adapter->fp_received_smd_v = smd_type == IGC_SMD_TYPE_SMD_V;
+				adapter->fp_received_smd_r = smd_type == IGC_SMD_TYPE_SMD_R;
+				schedule_delayed_work(&adapter->fp_verification_work, 0);
+			}
+
+			/* Advance the ring next-to-clean */
+			igc_is_non_eop(rx_ring, rx_desc);
+
+			cleaned_count++;
+			continue;
+		}
+
 		if (!skb) {
 			xdp_init_buff(&xdp, truesize, &rx_ring->xdp_rxq);
 			xdp_prepare_buff(&xdp, pktbuf - igc_rx_offset(rx_ring),
@@ -5607,6 +5715,107 @@  static int igc_tsn_enable_qbv_scheduling(struct igc_adapter *adapter,
 	return igc_tsn_offload_apply(adapter);
 }
 
+/* I225 doesn't send the SMD frames automatically, we need to handle
+ * them ourselves.
+ */
+static int igc_xmit_smd_frame(struct igc_adapter *adapter, int type)
+{
+	int cpu = smp_processor_id();
+	struct netdev_queue *nq;
+	struct igc_ring *ring;
+	struct sk_buff *skb;
+	void *data;
+	int err;
+
+	if (!netif_running(adapter->netdev))
+		return -ENOTCONN;
+
+	/* FIXME: rename this function to something less specific, as
+	 * it can be used outside XDP.
+	 */
+	ring = igc_xdp_get_tx_ring(adapter, cpu);
+	nq = txring_txq(ring);
+
+	skb = alloc_skb(IGC_FP_SMD_FRAME_SIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	data = skb_put(skb, IGC_FP_SMD_FRAME_SIZE);
+	memset(data, 0, IGC_FP_SMD_FRAME_SIZE);
+
+	__netif_tx_lock(nq, cpu);
+
+	err = igc_fp_init_tx_descriptor(ring, skb, type);
+
+	igc_flush_tx_descriptors(ring);
+
+	__netif_tx_unlock(nq);
+
+	return err;
+}
+
+static void igc_fp_verification_work(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct igc_adapter *adapter;
+	int err;
+
+	adapter = container_of(dwork, struct igc_adapter, fp_verification_work);
+
+	if (adapter->fp_disable_verify)
+		goto done;
+
+	switch (adapter->fp_tx_state) {
+	case FRAME_PREEMPTION_STATE_START:
+		adapter->fp_received_smd_r = false;
+		err = igc_xmit_smd_frame(adapter, IGC_SMD_TYPE_SMD_V);
+		if (err < 0)
+			netdev_err(adapter->netdev, "Error sending SMD-V frame\n");
+
+		adapter->fp_tx_state = FRAME_PREEMPTION_STATE_SENT;
+		adapter->fp_start = jiffies;
+		schedule_delayed_work(&adapter->fp_verification_work, IGC_FP_TIMEOUT);
+		break;
+
+	case FRAME_PREEMPTION_STATE_SENT:
+		if (adapter->fp_received_smd_r) {
+			adapter->fp_tx_state = FRAME_PREEMPTION_STATE_DONE;
+			adapter->fp_received_smd_r = false;
+			break;
+		}
+
+		if (time_is_before_jiffies(adapter->fp_start + IGC_FP_TIMEOUT)) {
+			adapter->fp_verify_cnt++;
+			netdev_warn(adapter->netdev, "Timeout waiting for SMD-R frame\n");
+
+			if (adapter->fp_verify_cnt > IGC_MAX_VERIFY_CNT) {
+				adapter->fp_verify_cnt = 0;
+				adapter->fp_tx_state = FRAME_PREEMPTION_STATE_FAILED;
+				netdev_err(adapter->netdev,
+					   "Exceeded number of attempts for frame preemption verification\n");
+			} else {
+				adapter->fp_tx_state = FRAME_PREEMPTION_STATE_START;
+			}
+			schedule_delayed_work(&adapter->fp_verification_work, IGC_FP_TIMEOUT);
+		}
+
+		break;
+
+	case FRAME_PREEMPTION_STATE_FAILED:
+	case FRAME_PREEMPTION_STATE_DONE:
+		break;
+	}
+
+done:
+	if (adapter->fp_received_smd_v) {
+		err = igc_xmit_smd_frame(adapter, IGC_SMD_TYPE_SMD_R);
+		if (err < 0)
+			netdev_err(adapter->netdev, "Error sending SMD-R frame\n");
+
+		adapter->fp_received_smd_v = false;
+	}
+}
+
 static int igc_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			void *type_data)
 {
@@ -6023,6 +6232,7 @@  static int igc_probe(struct pci_dev *pdev,
 
 	INIT_WORK(&adapter->reset_task, igc_reset_task);
 	INIT_WORK(&adapter->watchdog_task, igc_watchdog_task);
+	INIT_DELAYED_WORK(&adapter->fp_verification_work, igc_fp_verification_work);
 
 	/* Initialize link properties that are user-changeable */
 	adapter->fc_autoneg = true;
@@ -6044,6 +6254,12 @@  static int igc_probe(struct pci_dev *pdev,
 
 	igc_ptp_init(adapter);
 
+	/* FIXME: This sets the default to not do the verification
+	 * automatically, when we have support in multiple
+	 * controllers, this default can be changed.
+	 */
+	adapter->fp_disable_verify = true;
+
 	/* reset the hardware with the new settings */
 	igc_reset(adapter);