diff mbox series

[v2,1/1] cxl/mem: Fix no cxl_nvd during pmem region auto-assembing

Message ID 20240605101524.2244180-1-ming4.li@intel.com
State Superseded
Headers show
Series [v2,1/1] cxl/mem: Fix no cxl_nvd during pmem region auto-assembing | expand

Commit Message

Li, Ming4 June 5, 2024, 10:15 a.m. UTC
When CXL subsystem is auto-assembling a pmem region during cxl
endpoint port probing, always hit below calltrack.

 BUG: kernel NULL pointer dereference, address: 0000000000000078
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 RIP: 0010:cxl_pmem_region_probe+0x22e/0x360 [cxl_pmem]
 Call Trace:
  <TASK>
  ? __die+0x24/0x70
  ? page_fault_oops+0x82/0x160
  ? do_user_addr_fault+0x65/0x6b0
  ? exc_page_fault+0x7d/0x170
  ? asm_exc_page_fault+0x26/0x30
  ? cxl_pmem_region_probe+0x22e/0x360 [cxl_pmem]
  ? cxl_pmem_region_probe+0x1ac/0x360 [cxl_pmem]
  cxl_bus_probe+0x1b/0x60 [cxl_core]
  really_probe+0x173/0x410
  ? __pfx___device_attach_driver+0x10/0x10
  __driver_probe_device+0x80/0x170
  driver_probe_device+0x1e/0x90
  __device_attach_driver+0x90/0x120
  bus_for_each_drv+0x84/0xe0
  __device_attach+0xbc/0x1f0
  bus_probe_device+0x90/0xa0
  device_add+0x51c/0x710
  devm_cxl_add_pmem_region+0x1b5/0x380 [cxl_core]
  cxl_bus_probe+0x1b/0x60 [cxl_core]

Because the cxl_nvd of the memdev is necessary during pmem region
probing, but the cxl_nvd can be registered only after endpoint port
probing done, that is a collision dependency, so adjust the sequence
between cxl_nvd registration and adding endpoint port to guarantee there
is a cxl_nvd in memdev during the pmem region auto-assembling.

Fixes: f17b558d6663 ("cxl/pmem: Refactor nvdimm device registration, delete the workqueue")
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Li Ming <ming4.li@intel.com>
---

v2:
- fix wrong function name in the description of
cxl_find_nvdimm_bridge()(lkp)


 drivers/cxl/core/pmem.c   | 15 ++++++++++-----
 drivers/cxl/core/region.c |  2 +-
 drivers/cxl/cxl.h         |  4 ++--
 drivers/cxl/mem.c         | 17 +++++++++--------
 4 files changed, 22 insertions(+), 16 deletions(-)

Comments

Dan Williams June 17, 2024, 11:50 p.m. UTC | #1
Li Ming wrote:
> When CXL subsystem is auto-assembling a pmem region during cxl
> endpoint port probing, always hit below calltrack.
> 
>  BUG: kernel NULL pointer dereference, address: 0000000000000078
>  #PF: supervisor read access in kernel mode
>  #PF: error_code(0x0000) - not-present page
>  RIP: 0010:cxl_pmem_region_probe+0x22e/0x360 [cxl_pmem]
>  Call Trace:
>   <TASK>
>   ? __die+0x24/0x70
>   ? page_fault_oops+0x82/0x160
>   ? do_user_addr_fault+0x65/0x6b0
>   ? exc_page_fault+0x7d/0x170
>   ? asm_exc_page_fault+0x26/0x30
>   ? cxl_pmem_region_probe+0x22e/0x360 [cxl_pmem]
>   ? cxl_pmem_region_probe+0x1ac/0x360 [cxl_pmem]
>   cxl_bus_probe+0x1b/0x60 [cxl_core]
>   really_probe+0x173/0x410
>   ? __pfx___device_attach_driver+0x10/0x10
>   __driver_probe_device+0x80/0x170
>   driver_probe_device+0x1e/0x90
>   __device_attach_driver+0x90/0x120
>   bus_for_each_drv+0x84/0xe0
>   __device_attach+0xbc/0x1f0
>   bus_probe_device+0x90/0xa0
>   device_add+0x51c/0x710
>   devm_cxl_add_pmem_region+0x1b5/0x380 [cxl_core]
>   cxl_bus_probe+0x1b/0x60 [cxl_core]
> 
> Because the cxl_nvd of the memdev is necessary during pmem region
> probing, but the cxl_nvd can be registered only after endpoint port
> probing done, that is a collision dependency, so adjust the sequence
> between cxl_nvd registration and adding endpoint port to guarantee there
> is a cxl_nvd in memdev during the pmem region auto-assembling.
> 
> Fixes: f17b558d6663 ("cxl/pmem: Refactor nvdimm device registration, delete the workqueue")
> Suggested-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Li Ming <ming4.li@intel.com>

Looks good to me:

Reviewed-by: Dan Williams <dan.j.williams@intel.com>
diff mbox series

Patch

diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c
index e69625a8d6a1..78c6f4672fcf 100644
--- a/drivers/cxl/core/pmem.c
+++ b/drivers/cxl/core/pmem.c
@@ -62,10 +62,14 @@  static int match_nvdimm_bridge(struct device *dev, void *data)
 	return is_cxl_nvdimm_bridge(dev);
 }
 
-struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_memdev *cxlmd)
+/**
+ * cxl_find_nvdimm_bridge() - find a bridge device relative to a port
+ * @port: any descendant port of an nvdimm-bridge associated
+ *        root-cxl-port
+ */
+struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_port *port)
 {
-	struct cxl_root *cxl_root __free(put_cxl_root) =
-		find_cxl_root(cxlmd->endpoint);
+	struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port);
 	struct device *dev;
 
 	if (!cxl_root)
@@ -242,18 +246,19 @@  static void cxlmd_release_nvdimm(void *_cxlmd)
 
 /**
  * devm_cxl_add_nvdimm() - add a bridge between a cxl_memdev and an nvdimm
+ * @port: parent port for the (to be added) @cxlmd endpoint port
  * @cxlmd: cxl_memdev instance that will perform LIBNVDIMM operations
  *
  * Return: 0 on success negative error code on failure.
  */
-int devm_cxl_add_nvdimm(struct cxl_memdev *cxlmd)
+int devm_cxl_add_nvdimm(struct cxl_port *port, struct cxl_memdev *cxlmd)
 {
 	struct cxl_nvdimm_bridge *cxl_nvb;
 	struct cxl_nvdimm *cxl_nvd;
 	struct device *dev;
 	int rc;
 
-	cxl_nvb = cxl_find_nvdimm_bridge(cxlmd);
+	cxl_nvb = cxl_find_nvdimm_bridge(port);
 	if (!cxl_nvb)
 		return -ENODEV;
 
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 3c2b6144be23..f0cafc7ffb45 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -2847,7 +2847,7 @@  static int cxl_pmem_region_alloc(struct cxl_region *cxlr)
 		 * bridge for one device is the same for all.
 		 */
 		if (i == 0) {
-			cxl_nvb = cxl_find_nvdimm_bridge(cxlmd);
+			cxl_nvb = cxl_find_nvdimm_bridge(cxlmd->endpoint);
 			if (!cxl_nvb)
 				return -ENODEV;
 			cxlr->cxl_nvb = cxl_nvb;
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 603c0120cff8..e8fca6c6952b 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -855,8 +855,8 @@  struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host,
 struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev);
 bool is_cxl_nvdimm(struct device *dev);
 bool is_cxl_nvdimm_bridge(struct device *dev);
-int devm_cxl_add_nvdimm(struct cxl_memdev *cxlmd);
-struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_memdev *cxlmd);
+int devm_cxl_add_nvdimm(struct cxl_port *port, struct cxl_memdev *cxlmd);
+struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_port *port);
 
 #ifdef CONFIG_CXL_REGION
 bool is_cxl_pmem_region(struct device *dev);
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index 0c79d9ce877c..2f1b49bfe162 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -152,6 +152,15 @@  static int cxl_mem_probe(struct device *dev)
 		return -ENXIO;
 	}
 
+	if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM)) {
+		rc = devm_cxl_add_nvdimm(parent_port, cxlmd);
+		if (rc) {
+			if (rc == -ENODEV)
+				dev_info(dev, "PMEM disabled by platform\n");
+			return rc;
+		}
+	}
+
 	if (dport->rch)
 		endpoint_parent = parent_port->uport_dev;
 	else
@@ -174,14 +183,6 @@  static int cxl_mem_probe(struct device *dev)
 	if (rc)
 		return rc;
 
-	if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM)) {
-		rc = devm_cxl_add_nvdimm(cxlmd);
-		if (rc == -ENODEV)
-			dev_info(dev, "PMEM disabled by platform\n");
-		else
-			return rc;
-	}
-
 	/*
 	 * The kernel may be operating out of CXL memory on this device,
 	 * there is no spec defined way to determine whether this device