
[RFC,1/1] cxl/pci: Support Global Persistent Flush (GPF)

Message ID 20241220164337.204900-2-dave@stgolabs.net
State New
Series cxl/pci: Support Global Persistent Flush (GPF)

Commit Message

Davidlohr Bueso Dec. 20, 2024, 4:43 p.m. UTC
Add support for GPF flows. The CXL specification around this is found
to be a bit too involved from the driver side, and while this should
really all be handled by the hardware, this patch takes things with a
grain of salt.

Timeout detection is based on dirty Shutdown semantics. The driver
will mark it as dirty, expecting that the device clear it upon a
successful GPF event. The admin may consult the device Health and
check the dirty shutdown counter to see if there was a problem
with data integrity.

Timeout arming is done throughout the decode hierarchy, upon device
probing and hot-remove. These timeouts can be over-specified,
particularly T1. Set the max timeout to 20 seconds for T1, which is
the NMI watchdog default for lockup detection. For T2, the policy
is to use the largest device T2 available in the hierarchy.
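
For reference, the GPF DVSEC encodes each timeout as a base multiplied
by a decimal scale (1 us at scale 0, up to 10 s at scale 7). A worked
example using the helper added by this patch (values are illustrative):

	/* base=7, scale=7 encodes 7 * 10 s = 70 s ... */
	tmo = gpf_timeout_us(7, GPF_TIMEOUT_SCALE_MAX);
	/* ... which the helper clamps to the 20 s ceiling */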

Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
---
 Documentation/driver-api/cxl/maturity-map.rst |   2 +-
 drivers/cxl/core/mbox.c                       |  17 +++
 drivers/cxl/core/pci.c                        | 104 ++++++++++++++++++
 drivers/cxl/core/port.c                       |  68 ++++++++++++
 drivers/cxl/cxl.h                             |   3 +
 drivers/cxl/cxlmem.h                          |  24 ++++
 drivers/cxl/cxlpci.h                          |  64 +++++++++++
 drivers/cxl/pci.c                             |  97 ++++++++++++++++
 8 files changed, 378 insertions(+), 1 deletion(-)

Comments

Ira Weiny Dec. 20, 2024, 6:21 p.m. UTC | #1
Davidlohr Bueso wrote:
> Add support for GPF flows. The CXL specification around this is found
> to be a bit too involved from the driver side, and while this should
> really all be handled by the hardware, this patch takes things with a
> grain of salt.
> 
> Timeout detection is based on dirty Shutdown semantics. The driver
> will mark it as dirty, expecting that the device clear it upon a
> successful GPF event. The admin may consult the device Health and
> check the dirty shutdown counter to see if there was a problem
> with data integrity.
> 
> Timeout arming is done throughout the decode hierarchy, upon device
> probing and hot-remove. These timeouts can be over-specified,
> particularly T1. Set the max timeout to 20 seconds for T1, which is
> the NMI watchdog default for lockup detection. For T2, the policy
> is to use the largest device T2 available in the hierarchy.
> 
> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>

[snip]

>  int cxl_set_timestamp(struct cxl_memdev_state *mds)
>  {
>  	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 9d58ab9d33c5..9b1e110817f2 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -1054,3 +1054,107 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c)
>  
>  	return 0;
>  }
> +
> +int cxl_pci_update_gpf_port(struct pci_dev *pdev,
> +			    struct cxl_memdev *cxlmd, bool remove)
> +{
> +	u16 ctrl;
> +	int port_t1_base, port_t1_scale;
> +	int port_t2_base, port_t2_scale;
> +	unsigned long device_tmo, port_tmo;
> +	int rc, dvsec;
> +	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
> +
> +	dvsec = pci_find_dvsec_capability(
> +		pdev, PCI_VENDOR_ID_CXL, CXL_DVSEC_PORT_GPF);
> +	if (!dvsec) {
> +		dev_warn(&pdev->dev,
> +			 "GPF Port DVSEC not present\n");
> +		return -EINVAL;
> +	}
> +
> +	/* check for t1 */
> +	rc = pci_read_config_word(
> +		pdev,
> +		dvsec + CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET,
> +		&ctrl);
> +	if (rc)
> +		return rc;
> +
> +	port_t1_base = FIELD_GET(CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK,
> +				 ctrl);
> +	port_t1_scale = FIELD_GET(CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK,
> +				  ctrl);
> +	if (port_t1_scale > GPF_TIMEOUT_SCALE_MAX) {
> +		dev_warn(&pdev->dev, "GPF: invalid port phase 1 timeout\n");
> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * Set max timeout such that vendors will optimize GPF flow to
> +	 * avoid the implied worst-case scenario delays.
> +	 */
> +	device_tmo = gpf_timeout_us(7, GPF_TIMEOUT_SCALE_MAX);

What is '7' here?  Seems like another define would be good.

> +	port_tmo = gpf_timeout_us(port_t1_base, port_t1_scale);
> +
> +	dev_dbg(&pdev->dev, "Port GPF phase 1 timeout: %lu us\n", port_tmo);
> +
> +	if ((remove && device_tmo != port_tmo) || device_tmo > port_tmo) {
> +		/* update the timeout in DVSEC */
> +		ctrl = FIELD_PREP(CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK,
> +				   7);

And use it here...

> +		ctrl |= FIELD_PREP(CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK,
> +				   GPF_TIMEOUT_SCALE_MAX);
> +		rc = pci_write_config_word(
> +			pdev,
> +			dvsec + CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET,
> +			ctrl);
> +		if (rc)
> +			return rc;
> +
> +		dev_dbg(&pdev->dev,
> +			"new GPF Port phase 1 timeout: %lu us\n", device_tmo);
> +	}
> +
> +	/* check for t2 */
> +	rc = pci_read_config_word(
> +		pdev,
> +		dvsec + CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET,
> +		&ctrl);
> +	if (rc)
> +		return rc;
> +
> +	port_t2_base = FIELD_GET(CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK,
> +			    ctrl);
> +	port_t2_scale = FIELD_GET(CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK,
> +			     ctrl);
> +	if (port_t2_scale > GPF_TIMEOUT_SCALE_MAX) {
> +		dev_warn(&pdev->dev, "GPF: invalid port phase 2 timeout\n");
> +		return -EINVAL;
> +	}
> +
> +	device_tmo = gpf_timeout_us(mds->gpf.t2_base, mds->gpf.t2_scale);
> +	port_tmo = gpf_timeout_us(port_t2_base, port_t2_scale);
> +
> +	dev_dbg(&pdev->dev, "Port GPF phase 2 timeout: %lu us\n", port_tmo);
> +
> +	if ((remove && device_tmo != port_tmo) || device_tmo > port_tmo) {
> +		/* update the timeout in DVSEC */
> +		ctrl = FIELD_PREP(CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK,
> +				   mds->gpf.t2_base);
> +		ctrl |= FIELD_PREP(CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK,
> +				   mds->gpf.t2_scale);
> +		rc = pci_write_config_word(
> +			pdev,
> +			dvsec + CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET,
> +			ctrl);
> +		if (rc)
> +			return rc;
> +
> +		dev_dbg(&pdev->dev,
> +			"new GPF Port phase 2 timeout: %lu us\n", device_tmo);
> +	}
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_pci_update_gpf_port, "CXL");
> diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
> index 78a5c2c25982..0bd09669af68 100644
> --- a/drivers/cxl/core/port.c
> +++ b/drivers/cxl/core/port.c
> @@ -1370,6 +1370,73 @@ static struct cxl_port *find_cxl_port_at(struct cxl_port *parent_port,
>  	return port;
>  }
>  
> +static void delete_update_gpf(struct cxl_memdev *cxlmd)
> +{
> +	struct cxl_port *port = cxlmd->endpoint;
> +	struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
> +	struct cxl_memdev *max_cxlmd = NULL;
> +	struct cxl_memdev_state *mds;
> +	struct cxl_ep *ep;
> +	unsigned long index;
> +
> +	/* first calculate the new max T2 timeout */
> +	xa_for_each(&parent_port->endpoints, index, ep) {
> +		struct cxl_memdev *this_cxlmd;
> +		struct cxl_memdev_state *max_mds;
> +
> +		this_cxlmd = to_cxl_memdev(ep->ep);
> +		if (cxlmd == this_cxlmd) /* ignore self */
> +			continue;
> +		if (!max_cxlmd) {
> +			max_cxlmd = this_cxlmd;
> +			continue;
> +		}
> +
> +		mds = to_cxl_memdev_state(this_cxlmd->cxlds);
> +		max_mds = to_cxl_memdev_state(max_cxlmd->cxlds);
> +
> +		if (gpf_timeout_us(mds->gpf.t2_base, mds->gpf.t2_scale) >
> +		    gpf_timeout_us(max_mds->gpf.t2_base, max_mds->gpf.t2_scale))
> +			max_cxlmd = this_cxlmd;
> +	}
> +
> +	if (!max_cxlmd) /* no other devices */
> +		goto clean_shutdown;
> +
> +	while (1) {
> +		struct cxl_dport *dport;
> +
> +		parent_port = to_cxl_port(port->dev.parent);
> +		xa_for_each(&parent_port->dports, index, dport) {

Does this set the dports for all ports on a switch?  Is that the
intention?

> +			if (!dev_is_pci(dport->dport_dev))
> +				continue;
> +
> +			cxl_pci_update_gpf_port(to_pci_dev(dport->dport_dev),
> +						max_cxlmd, true);
> +		}
> +
> +		if (is_cxl_root(parent_port))
> +			break;
> +
> +		port = parent_port;
> +	}
> +
> +clean_shutdown:
> +	/*
> +	 * Device can still dirty the shutdown upon detecting any
> +	 * failed internal flush.
> +	 */
> +	if (resource_size(&cxlmd->cxlds->pmem_res)) {
> +		mds = to_cxl_memdev_state(cxlmd->cxlds);
> +
> +		if (mds->dirtied_shutdown) {
> +			if (cxl_set_shutdown_state(mds, false))
> +				dev_warn(&cxlmd->dev,
> +					 "could not clean Shutdown state");
> +		}
> +	}
> +}
> +
>  /*
>   * All users of grandparent() are using it to walk PCIe-like switch port
>   * hierarchy. A PCIe switch is comprised of a bridge device representing the
> @@ -1400,6 +1467,7 @@ static void delete_endpoint(void *data)
>  	struct device *host = endpoint_host(endpoint);
>  
>  	scoped_guard(device, host) {
> +		delete_update_gpf(cxlmd);
>  		if (host->driver && !endpoint->dead) {
>  			devm_release_action(host, cxl_unlink_parent_dport, endpoint);
>  			devm_release_action(host, cxl_unlink_uport, endpoint);

[snip]

> @@ -129,4 +140,57 @@ void read_cdat_data(struct cxl_port *port);
>  void cxl_cor_error_detected(struct pci_dev *pdev);
>  pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>  				    pci_channel_state_t state);
> +
> +#define GPF_TIMEOUT_SCALE_MAX 7 /* 10 seconds */
> +
> +static inline unsigned long gpf_timeout_us(int base, int scale)
> +{
> +	unsigned long tmo, others;
> +
> +	switch (scale) {
> +	case 0: /* 1 us */
> +		tmo = 1;
> +		break;
> +	case 1: /* 10 us */
> +		tmo = 10UL;
> +		break;
> +	case 2: /* 100 us */
> +		tmo = 100UL;
> +		break;
> +	case 3: /* 1 ms */
> +		tmo = 1000UL;
> +		break;
> +	case 4: /* 10 ms */
> +		tmo = 10000UL;
> +		break;
> +	case 5: /* 100 ms */
> +		tmo = 100000UL;
> +		break;
> +	case 6: /* 1 s */
> +		tmo = 1000000UL;
> +		break;
> +	case GPF_TIMEOUT_SCALE_MAX:
> +		tmo = 10000000UL;
> +		break;
> +	default:
> +		tmo = 0;
> +		break;
> +	}
> +
> +	tmo *= base;
> +	/*
> +	 * The spec is overly involved. Do not account for any ad-hoc
> +	 * host delays, i.e. propagation delay, host-side processing
> +	 * delays, and any other host/system-specific delays.
> +	 */
> +	others = 0;
> +	tmo += others;
> +
> +	/*
> +	 * Limit max timeout to 20 seconds (per phase), which is
> +	 * already the default for the nmi watchdog.
> +	 */
> +	return min(20000000UL, tmo);
> +}
> +
>  #endif /* __CXL_PCI_H__ */
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 6d94ff4a4f1a..37d5616b6fc8 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -900,6 +900,101 @@ static struct attribute_group cxl_rcd_group = {
>  };
>  __ATTRIBUTE_GROUPS(cxl_rcd);
>  
> +
> +static int cxl_gpf_setup(struct pci_dev *pdev)
> +{
> +	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
> +	struct cxl_memdev *cxlmd = cxlds->cxlmd;
> +	struct cxl_port *port;
> +	int rc, gpf_dvsec;
> +	u16 duration;
> +	u32 power;
> +	int device_t2_base, device_t2_scale;
> +
> +	/* get the timeouts for phase 2, given by the hardware */
> +	gpf_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
> +					      CXL_DVSEC_DEVICE_GPF);
> +	if (!gpf_dvsec) {
> +		dev_warn(&pdev->dev,
> +			 "GPF Device DVSEC not present\n");
> +		return -EINVAL;
> +	}
> +
> +	rc = pci_read_config_word(
> +		pdev,
> +		gpf_dvsec + CXL_DVSEC_DEVICE_GPF_PHASE_2_DURATION_OFFSET,
> +		&duration);
> +	if (rc)
> +		return rc;
> +
> +	device_t2_base = FIELD_GET(CXL_DVSEC_DEVICE_GPF_PHASE_2_TIME_BASE_MASK,
> +			    duration);
> +	device_t2_scale = FIELD_GET(CXL_DVSEC_DEVICE_GPF_PHASE_2_TIME_SCALE_MASK,
> +			     duration);
> +	if (device_t2_scale > GPF_TIMEOUT_SCALE_MAX) {
> +		dev_warn(&pdev->dev, "GPF: invalid device timeout\n");
> +		return -EINVAL;
> +	}
> +
> +	/* cache device GPF timeout and power consumption for phase 2 */
> +	mds->gpf.t2_base = device_t2_base;
> +	mds->gpf.t2_scale = device_t2_scale;
> +
> +	rc = pci_read_config_dword(
> +		pdev,
> +		gpf_dvsec + CXL_DVSEC_DEVICE_GPF_PHASE_2_POWER_OFFSET,
> +		&power);
> +	if (rc)
> +		return rc;
> +
> +	mds->gpf.power_mwatts = power;
> +
> +	dev_dbg(&pdev->dev, "Device GPF timeout: %lu us (power needed: %dmW)\n",
> +		gpf_timeout_us(device_t2_base, device_t2_scale),
> +		mds->gpf.power_mwatts);
> +
> +	/* iterate up the hierarchy updating max port timeouts where necessary */
> +	port = cxlmd->endpoint;
> +	while (1) {
> +		struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
> +		struct cxl_dport *dport;
> +		unsigned long index;
> +
> +		device_lock(&parent_port->dev);
> +		xa_for_each(&parent_port->dports, index, dport) {
> +			if (!dev_is_pci(dport->dport_dev))
> +				continue;
> +
> +			cxl_pci_update_gpf_port(to_pci_dev(dport->dport_dev),
> +						cxlmd, false);
> +		}
> +		device_unlock(&parent_port->dev);
> +
> +		if (is_cxl_root(parent_port))
> +			break;
> +
> +		port = parent_port;
> +	}
> +
> +	/*
> +	 * Set dirty shutdown now, with the expectation that the device
> +	 * clear it upon a successful GPF flow. The exception to this
> +	 * is upon Viral detection, per CXL 3.2 section 12.4.2.
> +	 *
> +	 * XXX: For non-fail scenarios, this is cleared by the driver
> +	 * at hot-unplug. But, what about reboot/shutdown case? Is this
> +	 * done by hw if no data integrity failure is detected?

Does this path execute on reboot/shutdown as part of delete_endpoint()?

Ira

> +	 */
> +	if (resource_size(&cxlds->pmem_res)) {
> +		rc = cxl_set_shutdown_state(mds, true);
> +		if (!rc)
> +			mds->dirtied_shutdown = true;
> +	}
> +
> +	return rc;
> +}
> +

[snip]
Davidlohr Bueso Dec. 21, 2024, 4:14 a.m. UTC | #2
On Fri, 20 Dec 2024, Ira Weiny wrote:

>Davidlohr Bueso wrote:
>> +	/*
>> +	 * Set max timeout such that vendors will optimize GPF flow to
>> +	 * avoid the implied worst-case scenario delays.
>> +	 */
>> +	device_tmo = gpf_timeout_us(7, GPF_TIMEOUT_SCALE_MAX);
>
>What is '7' here?  Seems like another define would be good.

This is the max for both scale and base components of the timeout,
and we actually have:

+#define GPF_TIMEOUT_SCALE_MAX 7 /* 10 seconds */

... which could be renamed to GPF_TIMEOUT_MAX and used for the
cases you mention.

Alternatively, since we limit timeouts to 20 secs, we could
just set that here and not rely on gpf_timeout_us() to do it.
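
Roughly the below (untested sketch, the define name is just the one
floated above):

#define GPF_TIMEOUT_MAX 7	/* max base and scale components (10 s) */

	device_tmo = gpf_timeout_us(GPF_TIMEOUT_MAX, GPF_TIMEOUT_MAX);
	...
	ctrl = FIELD_PREP(CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK,
			  GPF_TIMEOUT_MAX);
	ctrl |= FIELD_PREP(CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK,
			   GPF_TIMEOUT_MAX);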

...

>> +	while (1) {
>> +		struct cxl_dport *dport;
>> +
>> +		parent_port = to_cxl_port(port->dev.parent);
>> +		xa_for_each(&parent_port->dports, index, dport) {
>
>Does this set the dports for all ports on a switch?  Is that the
>intention?

Correct. For full output of a CXL Switch example, please see (iii) from
the tests shown in:

     https://lore.kernel.org/linux-cxl/20241205082101.1193038-1-dave@stgolabs.net/

...

>> +	/*
>> +	 * Set dirty shutdown now, with the expectation that the device
>> +	 * clear it upon a successful GPF flow. The exception to this
>> +	 * is upon Viral detection, per CXL 3.2 section 12.4.2.
>> +	 *
>> +	 * XXX: For non-fail scenarios, this is cleared by the driver
>> +	 * at hot-unplug. But, what about reboot/shutdown case? Is this
>> +	 * done by hw if no data integrity failure is detected?
>
>Does this path execute on reboot/shutdown as part of delete_endpoint()?

That was my expectation, but it did not; this path only executes for
hot-remove. This is my main concern about this patch.
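
For illustration only, one (untested) direction for the reboot case
would be a .shutdown hook in the cxl_pci driver; whether clearing the
state there is the right policy depends on when (or if) the platform
runs GPF on a clean power-down, so this is just a sketch of where such
a hook could live:

static void cxl_pci_shutdown(struct pci_dev *pdev)
{
	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);

	/* mirror the hot-remove case: leave a clean Shutdown state */
	if (mds->dirtied_shutdown && !cxl_set_shutdown_state(mds, false))
		mds->dirtied_shutdown = false;
}

...and wiring it into the driver's struct pci_driver via .shutdown.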

Thanks,
Davidlohr

Patch

diff --git a/Documentation/driver-api/cxl/maturity-map.rst b/Documentation/driver-api/cxl/maturity-map.rst
index df8e2ac2a320..99dd2c841e69 100644
--- a/Documentation/driver-api/cxl/maturity-map.rst
+++ b/Documentation/driver-api/cxl/maturity-map.rst
@@ -130,7 +130,7 @@  Mailbox commands
 * [0] Switch CCI
 * [3] Timestamp
 * [1] PMEM labels
-* [0] PMEM GPF / Dirty Shutdown
+* [1] PMEM GPF / Dirty Shutdown
 * [0] Scan Media
 
 PMU
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 548564c770c0..c82cf1547027 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -1308,6 +1308,23 @@  int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, "CXL");
 
+int cxl_set_shutdown_state(struct cxl_memdev_state *mds, bool dirty)
+{
+	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+	struct cxl_mbox_cmd mbox_cmd;
+	struct cxl_mbox_set_shutdown_state in;
+
+	in.state = dirty;
+	mbox_cmd = (struct cxl_mbox_cmd) {
+		.opcode = CXL_MBOX_OP_SET_SHUTDOWN_STATE,
+		.size_in = sizeof(in),
+		.payload_in = &in,
+	};
+
+	return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_set_shutdown_state, "CXL");
+
 int cxl_set_timestamp(struct cxl_memdev_state *mds)
 {
 	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 9d58ab9d33c5..9b1e110817f2 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -1054,3 +1054,107 @@  int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c)
 
 	return 0;
 }
+
+int cxl_pci_update_gpf_port(struct pci_dev *pdev,
+			    struct cxl_memdev *cxlmd, bool remove)
+{
+	u16 ctrl;
+	int port_t1_base, port_t1_scale;
+	int port_t2_base, port_t2_scale;
+	unsigned long device_tmo, port_tmo;
+	int rc, dvsec;
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+
+	dvsec = pci_find_dvsec_capability(
+		pdev, PCI_VENDOR_ID_CXL, CXL_DVSEC_PORT_GPF);
+	if (!dvsec) {
+		dev_warn(&pdev->dev,
+			 "GPF Port DVSEC not present\n");
+		return -EINVAL;
+	}
+
+	/* check for t1 */
+	rc = pci_read_config_word(
+		pdev,
+		dvsec + CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET,
+		&ctrl);
+	if (rc)
+		return rc;
+
+	port_t1_base = FIELD_GET(CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK,
+				 ctrl);
+	port_t1_scale = FIELD_GET(CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK,
+				  ctrl);
+	if (port_t1_scale > GPF_TIMEOUT_SCALE_MAX) {
+		dev_warn(&pdev->dev, "GPF: invalid port phase 1 timeout\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Set max timeout such that vendors will optimize GPF flow to
+	 * avoid the implied worst-case scenario delays.
+	 */
+	device_tmo = gpf_timeout_us(7, GPF_TIMEOUT_SCALE_MAX);
+	port_tmo = gpf_timeout_us(port_t1_base, port_t1_scale);
+
+	dev_dbg(&pdev->dev, "Port GPF phase 1 timeout: %lu us\n", port_tmo);
+
+	if ((remove && device_tmo != port_tmo) || device_tmo > port_tmo) {
+		/* update the timeout in DVSEC */
+		ctrl = FIELD_PREP(CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK,
+				   7);
+		ctrl |= FIELD_PREP(CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK,
+				   GPF_TIMEOUT_SCALE_MAX);
+		rc = pci_write_config_word(
+			pdev,
+			dvsec + CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET,
+			ctrl);
+		if (rc)
+			return rc;
+
+		dev_dbg(&pdev->dev,
+			"new GPF Port phase 1 timeout: %lu us\n", device_tmo);
+	}
+
+	/* check for t2 */
+	rc = pci_read_config_word(
+		pdev,
+		dvsec + CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET,
+		&ctrl);
+	if (rc)
+		return rc;
+
+	port_t2_base = FIELD_GET(CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK,
+			    ctrl);
+	port_t2_scale = FIELD_GET(CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK,
+			     ctrl);
+	if (port_t2_scale > GPF_TIMEOUT_SCALE_MAX) {
+		dev_warn(&pdev->dev, "GPF: invalid port phase 2 timeout\n");
+		return -EINVAL;
+	}
+
+	device_tmo = gpf_timeout_us(mds->gpf.t2_base, mds->gpf.t2_scale);
+	port_tmo = gpf_timeout_us(port_t2_base, port_t2_scale);
+
+	dev_dbg(&pdev->dev, "Port GPF phase 2 timeout: %lu us\n", port_tmo);
+
+	if ((remove && device_tmo != port_tmo) || device_tmo > port_tmo) {
+		/* update the timeout in DVSEC */
+		ctrl = FIELD_PREP(CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK,
+				   mds->gpf.t2_base);
+		ctrl |= FIELD_PREP(CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK,
+				   mds->gpf.t2_scale);
+		rc = pci_write_config_word(
+			pdev,
+			dvsec + CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET,
+			ctrl);
+		if (rc)
+			return rc;
+
+		dev_dbg(&pdev->dev,
+			"new GPF Port phase 2 timeout: %lu us\n", device_tmo);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_pci_update_gpf_port, "CXL");
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 78a5c2c25982..0bd09669af68 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -1370,6 +1370,73 @@  static struct cxl_port *find_cxl_port_at(struct cxl_port *parent_port,
 	return port;
 }
 
+static void delete_update_gpf(struct cxl_memdev *cxlmd)
+{
+	struct cxl_port *port = cxlmd->endpoint;
+	struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
+	struct cxl_memdev *max_cxlmd = NULL;
+	struct cxl_memdev_state *mds;
+	struct cxl_ep *ep;
+	unsigned long index;
+
+	/* first calculate the new max T2 timeout */
+	xa_for_each(&parent_port->endpoints, index, ep) {
+		struct cxl_memdev *this_cxlmd;
+		struct cxl_memdev_state *max_mds;
+
+		this_cxlmd = to_cxl_memdev(ep->ep);
+		if (cxlmd == this_cxlmd) /* ignore self */
+			continue;
+		if (!max_cxlmd) {
+			max_cxlmd = this_cxlmd;
+			continue;
+		}
+
+		mds = to_cxl_memdev_state(this_cxlmd->cxlds);
+		max_mds = to_cxl_memdev_state(max_cxlmd->cxlds);
+
+		if (gpf_timeout_us(mds->gpf.t2_base, mds->gpf.t2_scale) >
+		    gpf_timeout_us(max_mds->gpf.t2_base, max_mds->gpf.t2_scale))
+			max_cxlmd = this_cxlmd;
+	}
+
+	if (!max_cxlmd) /* no other devices */
+		goto clean_shutdown;
+
+	while (1) {
+		struct cxl_dport *dport;
+
+		parent_port = to_cxl_port(port->dev.parent);
+		xa_for_each(&parent_port->dports, index, dport) {
+			if (!dev_is_pci(dport->dport_dev))
+				continue;
+
+			cxl_pci_update_gpf_port(to_pci_dev(dport->dport_dev),
+						max_cxlmd, true);
+		}
+
+		if (is_cxl_root(parent_port))
+			break;
+
+		port = parent_port;
+	}
+
+clean_shutdown:
+	/*
+	 * Device can still dirty the shutdown upon detecting any
+	 * failed internal flush.
+	 */
+	if (resource_size(&cxlmd->cxlds->pmem_res)) {
+		mds = to_cxl_memdev_state(cxlmd->cxlds);
+
+		if (mds->dirtied_shutdown) {
+			if (cxl_set_shutdown_state(mds, false))
+				dev_warn(&cxlmd->dev,
+					 "could not clean Shutdown state");
+		}
+	}
+}
+
 /*
  * All users of grandparent() are using it to walk PCIe-like switch port
  * hierarchy. A PCIe switch is comprised of a bridge device representing the
@@ -1400,6 +1467,7 @@  static void delete_endpoint(void *data)
 	struct device *host = endpoint_host(endpoint);
 
 	scoped_guard(device, host) {
+		delete_update_gpf(cxlmd);
 		if (host->driver && !endpoint->dead) {
 			devm_release_action(host, cxl_unlink_parent_dport, endpoint);
 			devm_release_action(host, cxl_unlink_uport, endpoint);
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index f6015f24ad38..a83f8ed8d80b 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -911,6 +911,9 @@  void cxl_coordinates_combine(struct access_coordinate *out,
 
 bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port);
 
+int cxl_pci_update_gpf_port(struct pci_dev *pdev,
+			    struct cxl_memdev *mds, bool remove);
+
 /*
  * Unit test builds overrides this to __weak, find the 'strong' version
  * of these symbols in tools/testing/cxl/.
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 2a25d1957ddb..670cf4f91c6a 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -450,6 +450,21 @@  static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
 	return dev_get_drvdata(cxl_mbox->host);
 }
 
+/**
+ * struct cxl_gpf_dev - Device GPF information
+ *
+ * Cached Device Global Persistent Flush data (Phase 2).
+ *
+ * @t2_base: Base component of the timeout.
+ * @t2_scale: Scale component of the timeout.
+ * @power_mwatts: Required power from the device, in milliwatts.
+ */
+struct cxl_gpf_dev {
+	u16 t2_base;
+	u16 t2_scale;
+	u32 power_mwatts;
+};
+
 /**
  * struct cxl_memdev_state - Generic Type-3 Memory Device Class driver data
  *
@@ -477,6 +492,7 @@  static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
  * @poison: poison driver state info
  * @security: security driver state info
  * @fw: firmware upload / activation state
+ * @gpf: cached Device GPF information
  *
  * See CXL 3.0 8.2.9.8.2 Capacity Configuration and Label Storage for
  * details on capacity parameters.
@@ -503,6 +519,9 @@  struct cxl_memdev_state {
 	struct cxl_poison_state poison;
 	struct cxl_security_state security;
 	struct cxl_fw_state fw;
+
+	struct cxl_gpf_dev gpf;
+	bool dirtied_shutdown;
 };
 
 static inline struct cxl_memdev_state *
@@ -693,6 +712,10 @@  struct cxl_mbox_set_partition_info {
 
 #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
 
+struct cxl_mbox_set_shutdown_state {
+	u8 state;
+} __packed;
+
 /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
 struct cxl_mbox_set_timestamp_in {
 	__le64 timestamp;
@@ -829,6 +852,7 @@  void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
 			    enum cxl_event_log_type type,
 			    enum cxl_event_type event_type,
 			    const uuid_t *uuid, union cxl_event *evt);
+int cxl_set_shutdown_state(struct cxl_memdev_state *mds, bool dirty);
 int cxl_set_timestamp(struct cxl_memdev_state *mds);
 int cxl_poison_state_init(struct cxl_memdev_state *mds);
 int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 4da07727ab9c..fe7d92ef538f 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -40,9 +40,20 @@ 
 
 /* CXL 2.0 8.1.6: GPF DVSEC for CXL Port */
 #define CXL_DVSEC_PORT_GPF					4
+#define   CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET		0x0C
+#define     CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK		GENMASK(3, 0)
+#define	    CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK		GENMASK(11, 8)
+#define   CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET		0xE
+#define     CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK		GENMASK(3, 0)
+#define	    CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK		GENMASK(11, 8)
 
 /* CXL 2.0 8.1.7: GPF DVSEC for CXL Device */
 #define CXL_DVSEC_DEVICE_GPF					5
+#define   CXL_DVSEC_DEVICE_GPF_PHASE_2_DURATION_OFFSET		0xA
+#define     CXL_DVSEC_DEVICE_GPF_PHASE_2_TIME_BASE_MASK		GENMASK(3, 0)
+#define	    CXL_DVSEC_DEVICE_GPF_PHASE_2_TIME_SCALE_MASK	GENMASK(11, 8)
+#define   CXL_DVSEC_DEVICE_GPF_PHASE_2_POWER_OFFSET		0xC
+#define	    CXL_DVSEC_DEVICE_GPF_PHASE_2_ACTIVE_POWER_MASK	GENMASK(31, 0)
 
 /* CXL 2.0 8.1.8: PCIe DVSEC for Flex Bus Port */
 #define CXL_DVSEC_PCIE_FLEXBUS_PORT				7
@@ -129,4 +140,57 @@  void read_cdat_data(struct cxl_port *port);
 void cxl_cor_error_detected(struct pci_dev *pdev);
 pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
 				    pci_channel_state_t state);
+
+#define GPF_TIMEOUT_SCALE_MAX 7 /* 10 seconds */
+
+static inline unsigned long gpf_timeout_us(int base, int scale)
+{
+	unsigned long tmo, others;
+
+	switch (scale) {
+	case 0: /* 1 us */
+		tmo = 1;
+		break;
+	case 1: /* 10 us */
+		tmo = 10UL;
+		break;
+	case 2: /* 100 us */
+		tmo = 100UL;
+		break;
+	case 3: /* 1 ms */
+		tmo = 1000UL;
+		break;
+	case 4: /* 10 ms */
+		tmo = 10000UL;
+		break;
+	case 5: /* 100 ms */
+		tmo = 100000UL;
+		break;
+	case 6: /* 1 s */
+		tmo = 1000000UL;
+		break;
+	case GPF_TIMEOUT_SCALE_MAX:
+		tmo = 10000000UL;
+		break;
+	default:
+		tmo = 0;
+		break;
+	}
+
+	tmo *= base;
+	/*
+	 * The spec is overly involved. Do not account for any ad-hoc
+	 * host delays, i.e. propagation delay, host-side processing
+	 * delays, and any other host/system-specific delays.
+	 */
+	others = 0;
+	tmo += others;
+
+	/*
+	 * Limit max timeout to 20 seconds (per phase), which is
+	 * already the default for the nmi watchdog.
+	 */
+	return min(20000000UL, tmo);
+}
+
 #endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 6d94ff4a4f1a..37d5616b6fc8 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -900,6 +900,101 @@  static struct attribute_group cxl_rcd_group = {
 };
 __ATTRIBUTE_GROUPS(cxl_rcd);
 
+
+static int cxl_gpf_setup(struct pci_dev *pdev)
+{
+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+	struct cxl_memdev *cxlmd = cxlds->cxlmd;
+	struct cxl_port *port;
+	int rc, gpf_dvsec;
+	u16 duration;
+	u32 power;
+	int device_t2_base, device_t2_scale;
+
+	/* get the timeouts for phase 2, given by the hardware */
+	gpf_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
+					      CXL_DVSEC_DEVICE_GPF);
+	if (!gpf_dvsec) {
+		dev_warn(&pdev->dev,
+			 "GPF Device DVSEC not present\n");
+		return -EINVAL;
+	}
+
+	rc = pci_read_config_word(
+		pdev,
+		gpf_dvsec + CXL_DVSEC_DEVICE_GPF_PHASE_2_DURATION_OFFSET,
+		&duration);
+	if (rc)
+		return rc;
+
+	device_t2_base = FIELD_GET(CXL_DVSEC_DEVICE_GPF_PHASE_2_TIME_BASE_MASK,
+			    duration);
+	device_t2_scale = FIELD_GET(CXL_DVSEC_DEVICE_GPF_PHASE_2_TIME_SCALE_MASK,
+			     duration);
+	if (device_t2_scale > GPF_TIMEOUT_SCALE_MAX) {
+		dev_warn(&pdev->dev, "GPF: invalid device timeout\n");
+		return -EINVAL;
+	}
+
+	/* cache device GPF timeout and power consumption for phase 2 */
+	mds->gpf.t2_base = device_t2_base;
+	mds->gpf.t2_scale = device_t2_scale;
+
+	rc = pci_read_config_dword(
+		pdev,
+		gpf_dvsec + CXL_DVSEC_DEVICE_GPF_PHASE_2_POWER_OFFSET,
+		&power);
+	if (rc)
+		return rc;
+
+	mds->gpf.power_mwatts = power;
+
+	dev_dbg(&pdev->dev, "Device GPF timeout: %lu us (power needed: %dmW)\n",
+		gpf_timeout_us(device_t2_base, device_t2_scale),
+		mds->gpf.power_mwatts);
+
+	/* iterate up the hierarchy updating max port timeouts where necessary */
+	port = cxlmd->endpoint;
+	while (1) {
+		struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
+		struct cxl_dport *dport;
+		unsigned long index;
+
+		device_lock(&parent_port->dev);
+		xa_for_each(&parent_port->dports, index, dport) {
+			if (!dev_is_pci(dport->dport_dev))
+				continue;
+
+			cxl_pci_update_gpf_port(to_pci_dev(dport->dport_dev),
+						cxlmd, false);
+		}
+		device_unlock(&parent_port->dev);
+
+		if (is_cxl_root(parent_port))
+			break;
+
+		port = parent_port;
+	}
+
+	/*
+	 * Set dirty shutdown now, with the expectation that the device
+	 * clear it upon a successful GPF flow. The exception to this
+	 * is upon Viral detection, per CXL 3.2 section 12.4.2.
+	 *
+	 * XXX: For non-fail scenarios, this is cleared by the driver
+	 * at hot-unplug. But, what about reboot/shutdown case? Is this
+	 * done by hw if no data integrity failure is detected?
+	 */
+	if (resource_size(&cxlds->pmem_res)) {
+		rc = cxl_set_shutdown_state(mds, true);
+		if (!rc)
+			mds->dirtied_shutdown = true;
+	}
+
+	return rc;
+}
+
 static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct pci_host_bridge *host_bridge = pci_find_host_bridge(pdev->bus);
@@ -1038,6 +1133,8 @@  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (cxl_pci_ras_unmask(pdev))
 		dev_dbg(&pdev->dev, "No RAS reporting unmasked\n");
 
+	cxl_gpf_setup(pdev);
+
 	pci_save_state(pdev);
 
 	return rc;