diff mbox series

[v2,7/8] cxl/port: Introduce cxl_port objects

Message ID 161728748083.2474040.753623311074560290.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State Superseded
Delegated to: Bjorn Helgaas
Headers show
Series CXL Port Enumeration | expand

Commit Message

Dan Williams April 1, 2021, 2:31 p.m. UTC
Once the cxl_root is established then other ports in the hierarchy can
be attached. The cxl_port object, unlike cxl_root that is associated
with host bridges, is associated with PCIE Root Ports or PCIE Switch
Ports. Add cxl_port instances for all PCIE Root Ports in an ACPI0016
host bridge. The cxl_port instances for PCIE Switch Ports are not
included here as those are to be modeled as another service device
registered on the pcie_port_bus_type.

A sample sysfs topology for a single-host-bridge with
single-PCIE/CXL-root-port:

/sys/bus/cxl/devices/root0
├── address_space0
│   ├── devtype
│   ├── end
│   ├── start
│   ├── supports_ram
│   ├── supports_type2
│   ├── supports_type3
│   └── uevent
├── address_space1
│   ├── devtype
│   ├── end
│   ├── start
│   ├── supports_pmem
│   ├── supports_type2
│   ├── supports_type3
│   └── uevent
├── devtype
├── port1
│   ├── devtype
│   ├── host -> ../../../../LNXSYSTM:00/LNXSYBUS:00/ACPI0016:00
│   ├── port2
│   │   ├── devtype
│   │   ├── host -> ../../../../../pci0000:34/0000:34:00.0
│   │   ├── subsystem -> ../../../../../../bus/cxl
│   │   ├── target_id
│   │   └── uevent
│   ├── subsystem -> ../../../../../bus/cxl
│   ├── target_id
│   └── uevent
├── subsystem -> ../../../../bus/cxl
├── target_id
└── uevent

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/acpi.c |   99 +++++++++++++++++++++++++++++++++++++++++++
 drivers/cxl/core.c |  121 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/cxl/cxl.h  |    5 ++
 3 files changed, 224 insertions(+), 1 deletion(-)

Comments

Jonathan Cameron April 6, 2021, 5:44 p.m. UTC | #1
On Thu, 1 Apr 2021 07:31:20 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> Once the cxl_root is established then other ports in the hierarchy can
> be attached. The cxl_port object, unlike cxl_root that is associated
> with host bridges, is associated with PCIE Root Ports or PCIE Switch
> Ports. Add cxl_port instances for all PCIE Root Ports in an ACPI0016
> host bridge. The cxl_port instances for PCIE Switch Ports are not
> included here as those are to be modeled as another service device
> registered on the pcie_port_bus_type.

It would be good to give a brief description of what port2 represents vs. port1.

> 
> A sample sysfs topology for a single-host-bridge with
> single-PCIE/CXL-root-port:
> 
> /sys/bus/cxl/devices/root0
> ├── address_space0
> │   ├── devtype
> │   ├── end
> │   ├── start
> │   ├── supports_ram
> │   ├── supports_type2
> │   ├── supports_type3
> │   └── uevent
> ├── address_space1
> │   ├── devtype
> │   ├── end
> │   ├── start
> │   ├── supports_pmem
> │   ├── supports_type2
> │   ├── supports_type3
> │   └── uevent
> ├── devtype
> ├── port1
> │   ├── devtype
> │   ├── host -> ../../../../LNXSYSTM:00/LNXSYBUS:00/ACPI0016:00
> │   ├── port2
> │   │   ├── devtype
> │   │   ├── host -> ../../../../../pci0000:34/0000:34:00.0
> │   │   ├── subsystem -> ../../../../../../bus/cxl
> │   │   ├── target_id
> │   │   └── uevent
> │   ├── subsystem -> ../../../../../bus/cxl
> │   ├── target_id
> │   └── uevent
> ├── subsystem -> ../../../../bus/cxl
> ├── target_id
> └── uevent
> 
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  drivers/cxl/acpi.c |   99 +++++++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/core.c |  121 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/cxl.h  |    5 ++
>  3 files changed, 224 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
> index d54c2d5de730..bc2a35ae880b 100644
> --- a/drivers/cxl/acpi.c
> +++ b/drivers/cxl/acpi.c
> @@ -5,18 +5,117 @@
>  #include <linux/device.h>
>  #include <linux/kernel.h>
>  #include <linux/acpi.h>
> +#include <linux/pci.h>
>  #include "cxl.h"
>  
> +static int match_ACPI0016(struct device *dev, const void *host)
> +{
> +	struct acpi_device *adev = to_acpi_device(dev);
> +	const char *hid = acpi_device_hid(adev);
> +
> +	return strcmp(hid, "ACPI0016") == 0;
> +}
> +
> +struct cxl_walk_context {
> +	struct device *dev;
> +	struct pci_bus *root;
> +	struct cxl_port *port;
> +	int error;
> +	int count;
> +};
> +
> +static int match_add_root_ports(struct pci_dev *pdev, void *data)
> +{
> +	struct cxl_walk_context *ctx = data;
> +	struct pci_bus *root_bus = ctx->root;
> +	struct cxl_port *port = ctx->port;
> +	int type = pci_pcie_type(pdev);
> +	struct device *dev = ctx->dev;
> +	resource_size_t cxl_regs_phys;
> +	int target_id = ctx->count;
> +
> +	if (pdev->bus != root_bus)
> +		return 0;
> +	if (!pci_is_pcie(pdev))
> +		return 0;
> +	if (type != PCI_EXP_TYPE_ROOT_PORT)
> +		return 0;
> +
> +	ctx->count++;
> +
> +	/* TODO walk DVSEC to find component register base */
> +	cxl_regs_phys = -1;
> +
> +	port = devm_cxl_add_port(dev, port, &pdev->dev, target_id,
> +				 cxl_regs_phys);
> +	if (IS_ERR(port)) {
> +		ctx->error = PTR_ERR(port);
> +		return ctx->error;
> +	}
> +
> +	dev_dbg(dev, "%s: register: %s\n", dev_name(&pdev->dev),
> +		dev_name(&port->dev));
> +
> +	return 0;
> +}
> +
> +/*
> + * A host bridge may contain one or more root ports.  Register each port
> + * as a child of the cxl_root.
> + */
> +static int cxl_acpi_register_ports(struct device *dev, struct acpi_device *root,
> +				   struct cxl_port *port, int idx)
> +{
> +	struct acpi_pci_root *pci_root = acpi_pci_find_root(root->handle);
> +	struct cxl_walk_context ctx;
> +
> +	if (!pci_root)
> +		return -ENXIO;
> +
> +	/* TODO: fold in CEDT.CHBS retrieval */
> +	port = devm_cxl_add_port(dev, port, &root->dev, idx, ~0ULL);
> +	if (IS_ERR(port))
> +		return PTR_ERR(port);
> +	dev_dbg(dev, "%s: register: %s\n", dev_name(&root->dev),
> +		dev_name(&port->dev));
> +
> +	ctx = (struct cxl_walk_context) {
> +		.dev = dev,
> +		.root = pci_root->bus,
> +		.port = port,
> +	};
> +	pci_walk_bus(pci_root->bus, match_add_root_ports, &ctx);
> +
> +	if (ctx.count == 0)
> +		return -ENODEV;
> +	return ctx.error;
> +}
> +
>  static int cxl_acpi_probe(struct platform_device *pdev)
>  {
>  	struct device *dev = &pdev->dev;
> +	struct acpi_device *adev = ACPI_COMPANION(dev);
> +	struct device *bridge = NULL;
>  	struct cxl_root *cxl_root;
> +	int rc, i = 0;
>  
>  	cxl_root = devm_cxl_add_root(dev, NULL, 0);
>  	if (IS_ERR(cxl_root))
>  		return PTR_ERR(cxl_root);
>  	dev_dbg(dev, "register: %s\n", dev_name(&cxl_root->port.dev));
>  
> +	while (true) {
> +		bridge = bus_find_device(adev->dev.bus, bridge, dev,
> +					 match_ACPI0016);
> +		if (!bridge)
> +			break;
> +
> +		rc = cxl_acpi_register_ports(dev, to_acpi_device(bridge),
> +					     &cxl_root->port, i++);
> +		if (rc)
> +			return rc;
> +	}
> +
>  	return 0;
>  }
>  
> diff --git a/drivers/cxl/core.c b/drivers/cxl/core.c
> index 46c3b2588d2f..65cd704581bc 100644
> --- a/drivers/cxl/core.c
> +++ b/drivers/cxl/core.c
> @@ -148,6 +148,15 @@ static void cxl_root_release(struct device *dev)
>  	kfree(cxl_root);
>  }
>  
> +static void cxl_port_release(struct device *dev)
> +{
> +	struct cxl_port *port = to_cxl_port(dev);
> +
> +	ida_free(&cxl_port_ida, port->id);
> +	put_device(port->port_host);
> +	kfree(port);
> +}
> +
>  static ssize_t target_id_show(struct device *dev, struct device_attribute *attr,
>  			      char *buf)
>  {
> @@ -178,6 +187,12 @@ static const struct device_type cxl_root_type = {
>  	.groups = cxl_port_attribute_groups,
>  };
>  
> +static const struct device_type cxl_port_type = {
> +	.name = "cxl_port",
> +	.release = cxl_port_release,
> +	.groups = cxl_port_attribute_groups,
> +};
> +
>  struct cxl_root *to_cxl_root(struct device *dev)
>  {
>  	if (dev_WARN_ONCE(dev, dev->type != &cxl_root_type,
> @@ -188,7 +203,9 @@ struct cxl_root *to_cxl_root(struct device *dev)
>  
>  struct cxl_port *to_cxl_port(struct device *dev)
>  {
> -	if (dev_WARN_ONCE(dev, dev->type != &cxl_root_type,
> +	if (dev_WARN_ONCE(dev,
> +			  dev->type != &cxl_root_type &&
> +			  dev->type != &cxl_port_type,
>  			  "not a cxl_port device\n"))
>  		return NULL;
>  	return container_of(dev, struct cxl_port, dev);
> @@ -360,6 +377,108 @@ struct cxl_root *devm_cxl_add_root(struct device *host,
>  }
>  EXPORT_SYMBOL_GPL(devm_cxl_add_root);
>  
> +static void cxl_unlink_port(void *_port)
> +{
> +	struct cxl_port *port = _port;
> +
> +	sysfs_remove_link(&port->dev.kobj, "host");
> +}
> +
> +static int devm_cxl_link_port(struct device *dev, struct cxl_port *port)
> +{
> +	int rc;
> +
> +	rc = sysfs_create_link(&port->dev.kobj, &port->port_host->kobj, "host");
> +	if (rc)
> +		return rc;
> +	return devm_add_action_or_reset(dev, cxl_unlink_port, port);
> +}
> +
> +static struct cxl_port *cxl_port_alloc(struct cxl_port *parent_port,
> +				       struct device *port_dev, int target_id,
> +				       resource_size_t component_regs_phys)
> +{
> +	struct cxl_port *port;
> +	struct device *dev;
> +	int rc;
> +
> +	if (!port_dev)
> +		return ERR_PTR(-EINVAL);
> +
> +	port = kzalloc(sizeof(*port), GFP_KERNEL);
> +	if (!port)
> +		return ERR_PTR(-ENOMEM);
> +
> +	rc = ida_alloc(&cxl_port_ida, GFP_KERNEL);
> +	if (rc < 0)
> +		goto err;
> +
> +	port->id = rc;
> +	port->target_id = target_id;
> +	port->port_host = get_device(port_dev);
> +	port->component_regs_phys = component_regs_phys;
> +
> +	dev = &port->dev;
> +	device_initialize(dev);
> +	device_set_pm_not_required(dev);
> +	dev->parent = &parent_port->dev;
> +	dev->bus = &cxl_bus_type;
> +	dev->type = &cxl_port_type;
> +
> +	return port;
> +
> +err:
> +	kfree(port);
> +	return ERR_PTR(rc);
> +}
> +
> +/**
> + * devm_cxl_add_port() - add a cxl_port to the topology
> + * @host: devm context / discovery agent
> + * @parent_port: immediate ancestor towards cxl_root
> + * @port_host: PCI or platform-firmware device hosting this port
> + * @target_id: ordinal id relative to other siblings under @parent_port
> + * @component_regs_phys: CXL component register base address
> + */
> +struct cxl_port *devm_cxl_add_port(struct device *host,
> +				   struct cxl_port *parent_port,
> +				   struct device *port_host, int target_id,
> +				   resource_size_t component_regs_phys)
> +{
> +	struct cxl_port *port;
> +	struct device *dev;
> +	int rc;
> +
> +	port = cxl_port_alloc(parent_port, port_host, target_id,
> +			      component_regs_phys);
> +	if (IS_ERR(port))
> +		return port;
> +
> +	dev = &port->dev;
> +	rc = dev_set_name(dev, "port%d", port->id);
> +	if (rc)
> +		goto err;
> +
> +	rc = device_add(dev);
> +	if (rc)
> +		goto err;
> +
> +	rc = devm_add_action_or_reset(host, unregister_dev, dev);
> +	if (rc)
> +		return ERR_PTR(rc);
> +
> +	rc = devm_cxl_link_port(host, port);
> +	if (rc)
> +		return ERR_PTR(rc);
> +
> +	return port;
> +
> +err:
> +	put_device(dev);
> +	return ERR_PTR(rc);
> +}
> +EXPORT_SYMBOL_GPL(devm_cxl_add_port);
> +
>  /*
>   * cxl_setup_device_regs() - Detect CXL Device register blocks
>   * @dev: Host device of the @base mapping
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 559f8343fee4..0211f44c95a2 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -134,5 +134,10 @@ struct cxl_address_space_dev *to_cxl_address_space(struct device *dev);
>  struct cxl_root *devm_cxl_add_root(struct device *parent,
>  				   struct cxl_address_space *cxl_space,
>  				   int nr_spaces);
> +struct cxl_port *devm_cxl_add_port(struct device *host,
> +				   struct cxl_port *parent_port,
> +				   struct device *port_host, int target_id,
> +				   resource_size_t component_regs_phys);
> +
>  extern struct bus_type cxl_bus_type;
>  #endif /* __CXL_H__ */
>
Bjorn Helgaas April 8, 2021, 10:42 p.m. UTC | #2
[+cc Greg, Rafael, Matthew: device model questions]

Hi Dan,

On Thu, Apr 01, 2021 at 07:31:20AM -0700, Dan Williams wrote:
> Once the cxl_root is established then other ports in the hierarchy can
> be attached. The cxl_port object, unlike cxl_root that is associated
> with host bridges, is associated with PCIE Root Ports or PCIE Switch
> Ports. Add cxl_port instances for all PCIE Root Ports in an ACPI0016
> host bridge. 

I'm not a device model expert, but I'm not sure about adding a new
/sys/bus/cxl/devices hierarchy.  I'm under the impression that CXL
devices will be enumerated by the PCI core as PCIe devices.  Doesn't
that mean we will have one struct device in the pci_dev, and another
one in the cxl_port?  That seems like an issue to me.  More below.

> The cxl_port instances for PCIE Switch Ports are not
> included here as those are to be modeled as another service device
> registered on the pcie_port_bus_type.

I'm hesitant about the idea of adding more uses of pcie_port_bus_type.
I really dislike portdrv because it makes a parallel hierarchy:

  /sys/bus/pci
  /sys/bus/pci_express

for things that really should not be different.  There's a struct
device in pci_dev, and potentially several pcie_devices, each with
another struct device.  We make these pcie_device things for AER, DPC,
hotplug, etc.  E.g.,

  /sys/bus/pci/devices/0000:00:1c.0
  /sys/bus/pci_express/devices/0000:00:1c.0:pcie002  # AER
  /sys/bus/pci_express/devices/0000:00:1c.0:pcie010  # BW notification

These are all the same PCI device.  AER is a PCI capability.
Bandwidth notification is just a feature of all Downstream Ports.  I
think it makes zero sense to have extra struct devices for them.  From
a device point of view (enumeration, power management, VM assignment),
we can't manage them separately from the underlying PCI device.  For
example, we have three separate "power/" directories, but obviously
there's only one point of control (00:1c.0):

  /sys/devices/pci0000:00/0000:00:1c.0/power/
  /sys/devices/pci0000:00/0000:00:1c.0/0000:00:1c.0:pcie002/power/
  /sys/devices/pci0000:00/0000:00:1c.0/0000:00:1c.0:pcie010/power/

> A sample sysfs topology for a single-host-bridge with
> single-PCIE/CXL-root-port:
> 
> /sys/bus/cxl/devices/root0
> ├── address_space0
> │   ├── devtype
> │   ├── end
> │   ├── start
> │   ├── supports_ram
> │   ├── supports_type2
> │   ├── supports_type3
> │   └── uevent
> ├── address_space1
> │   ├── devtype
> │   ├── end
> │   ├── start
> │   ├── supports_pmem
> │   ├── supports_type2
> │   ├── supports_type3
> │   └── uevent
> ├── devtype
> ├── port1
> │   ├── devtype
> │   ├── host -> ../../../../LNXSYSTM:00/LNXSYBUS:00/ACPI0016:00
> │   ├── port2
> │   │   ├── devtype
> │   │   ├── host -> ../../../../../pci0000:34/0000:34:00.0
> │   │   ├── subsystem -> ../../../../../../bus/cxl
> │   │   ├── target_id
> │   │   └── uevent
> │   ├── subsystem -> ../../../../../bus/cxl
> │   ├── target_id
> │   └── uevent
> ├── subsystem -> ../../../../bus/cxl
> ├── target_id
> └── uevent
> 
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  drivers/cxl/acpi.c |   99 +++++++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/core.c |  121 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/cxl.h  |    5 ++
>  3 files changed, 224 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
> index d54c2d5de730..bc2a35ae880b 100644
> --- a/drivers/cxl/acpi.c
> +++ b/drivers/cxl/acpi.c
> @@ -5,18 +5,117 @@
>  #include <linux/device.h>
>  #include <linux/kernel.h>
>  #include <linux/acpi.h>
> +#include <linux/pci.h>
>  #include "cxl.h"
>  
> +static int match_ACPI0016(struct device *dev, const void *host)
> +{
> +	struct acpi_device *adev = to_acpi_device(dev);
> +	const char *hid = acpi_device_hid(adev);
> +
> +	return strcmp(hid, "ACPI0016") == 0;
> +}
> +
> +struct cxl_walk_context {
> +	struct device *dev;
> +	struct pci_bus *root;
> +	struct cxl_port *port;
> +	int error;
> +	int count;
> +};
> +
> +static int match_add_root_ports(struct pci_dev *pdev, void *data)
> +{
> +	struct cxl_walk_context *ctx = data;
> +	struct pci_bus *root_bus = ctx->root;
> +	struct cxl_port *port = ctx->port;
> +	int type = pci_pcie_type(pdev);
> +	struct device *dev = ctx->dev;
> +	resource_size_t cxl_regs_phys;
> +	int target_id = ctx->count;
> +
> +	if (pdev->bus != root_bus)
> +		return 0;
> +	if (!pci_is_pcie(pdev))
> +		return 0;
> +	if (type != PCI_EXP_TYPE_ROOT_PORT)
> +		return 0;
> +
> +	ctx->count++;
> +
> +	/* TODO walk DVSEC to find component register base */
> +	cxl_regs_phys = -1;
> +
> +	port = devm_cxl_add_port(dev, port, &pdev->dev, target_id,
> +				 cxl_regs_phys);
> +	if (IS_ERR(port)) {
> +		ctx->error = PTR_ERR(port);
> +		return ctx->error;
> +	}
> +
> +	dev_dbg(dev, "%s: register: %s\n", dev_name(&pdev->dev),
> +		dev_name(&port->dev));
> +
> +	return 0;
> +}
> +
> +/*
> + * A host bridge may contain one or more root ports.  Register each port
> + * as a child of the cxl_root.
> + */
> +static int cxl_acpi_register_ports(struct device *dev, struct acpi_device *root,
> +				   struct cxl_port *port, int idx)
> +{
> +	struct acpi_pci_root *pci_root = acpi_pci_find_root(root->handle);
> +	struct cxl_walk_context ctx;
> +
> +	if (!pci_root)
> +		return -ENXIO;
> +
> +	/* TODO: fold in CEDT.CHBS retrieval */
> +	port = devm_cxl_add_port(dev, port, &root->dev, idx, ~0ULL);
> +	if (IS_ERR(port))
> +		return PTR_ERR(port);
> +	dev_dbg(dev, "%s: register: %s\n", dev_name(&root->dev),
> +		dev_name(&port->dev));
> +
> +	ctx = (struct cxl_walk_context) {
> +		.dev = dev,
> +		.root = pci_root->bus,
> +		.port = port,
> +	};
> +	pci_walk_bus(pci_root->bus, match_add_root_ports, &ctx);
> +
> +	if (ctx.count == 0)
> +		return -ENODEV;
> +	return ctx.error;
> +}
> +
>  static int cxl_acpi_probe(struct platform_device *pdev)
>  {
>  	struct device *dev = &pdev->dev;
> +	struct acpi_device *adev = ACPI_COMPANION(dev);
> +	struct device *bridge = NULL;
>  	struct cxl_root *cxl_root;
> +	int rc, i = 0;
>  
>  	cxl_root = devm_cxl_add_root(dev, NULL, 0);
>  	if (IS_ERR(cxl_root))
>  		return PTR_ERR(cxl_root);
>  	dev_dbg(dev, "register: %s\n", dev_name(&cxl_root->port.dev));
>  
> +	while (true) {
> +		bridge = bus_find_device(adev->dev.bus, bridge, dev,
> +					 match_ACPI0016);
> +		if (!bridge)
> +			break;
> +
> +		rc = cxl_acpi_register_ports(dev, to_acpi_device(bridge),
> +					     &cxl_root->port, i++);
> +		if (rc)
> +			return rc;
> +	}
> +
>  	return 0;
>  }
>  
> diff --git a/drivers/cxl/core.c b/drivers/cxl/core.c
> index 46c3b2588d2f..65cd704581bc 100644
> --- a/drivers/cxl/core.c
> +++ b/drivers/cxl/core.c
> @@ -148,6 +148,15 @@ static void cxl_root_release(struct device *dev)
>  	kfree(cxl_root);
>  }
>  
> +static void cxl_port_release(struct device *dev)
> +{
> +	struct cxl_port *port = to_cxl_port(dev);
> +
> +	ida_free(&cxl_port_ida, port->id);
> +	put_device(port->port_host);
> +	kfree(port);
> +}
> +
>  static ssize_t target_id_show(struct device *dev, struct device_attribute *attr,
>  			      char *buf)
>  {
> @@ -178,6 +187,12 @@ static const struct device_type cxl_root_type = {
>  	.groups = cxl_port_attribute_groups,
>  };
>  
> +static const struct device_type cxl_port_type = {
> +	.name = "cxl_port",
> +	.release = cxl_port_release,
> +	.groups = cxl_port_attribute_groups,
> +};
> +
>  struct cxl_root *to_cxl_root(struct device *dev)
>  {
>  	if (dev_WARN_ONCE(dev, dev->type != &cxl_root_type,
> @@ -188,7 +203,9 @@ struct cxl_root *to_cxl_root(struct device *dev)
>  
>  struct cxl_port *to_cxl_port(struct device *dev)
>  {
> -	if (dev_WARN_ONCE(dev, dev->type != &cxl_root_type,
> +	if (dev_WARN_ONCE(dev,
> +			  dev->type != &cxl_root_type &&
> +			  dev->type != &cxl_port_type,
>  			  "not a cxl_port device\n"))
>  		return NULL;
>  	return container_of(dev, struct cxl_port, dev);
> @@ -360,6 +377,108 @@ struct cxl_root *devm_cxl_add_root(struct device *host,
>  }
>  EXPORT_SYMBOL_GPL(devm_cxl_add_root);
>  
> +static void cxl_unlink_port(void *_port)
> +{
> +	struct cxl_port *port = _port;
> +
> +	sysfs_remove_link(&port->dev.kobj, "host");
> +}
> +
> +static int devm_cxl_link_port(struct device *dev, struct cxl_port *port)
> +{
> +	int rc;
> +
> +	rc = sysfs_create_link(&port->dev.kobj, &port->port_host->kobj, "host");
> +	if (rc)
> +		return rc;
> +	return devm_add_action_or_reset(dev, cxl_unlink_port, port);
> +}
> +
> +static struct cxl_port *cxl_port_alloc(struct cxl_port *parent_port,
> +				       struct device *port_dev, int target_id,
> +				       resource_size_t component_regs_phys)
> +{
> +	struct cxl_port *port;
> +	struct device *dev;
> +	int rc;
> +
> +	if (!port_dev)
> +		return ERR_PTR(-EINVAL);
> +
> +	port = kzalloc(sizeof(*port), GFP_KERNEL);
> +	if (!port)
> +		return ERR_PTR(-ENOMEM);
> +
> +	rc = ida_alloc(&cxl_port_ida, GFP_KERNEL);
> +	if (rc < 0)
> +		goto err;
> +
> +	port->id = rc;
> +	port->target_id = target_id;
> +	port->port_host = get_device(port_dev);
> +	port->component_regs_phys = component_regs_phys;
> +
> +	dev = &port->dev;
> +	device_initialize(dev);
> +	device_set_pm_not_required(dev);
> +	dev->parent = &parent_port->dev;
> +	dev->bus = &cxl_bus_type;
> +	dev->type = &cxl_port_type;
> +
> +	return port;
> +
> +err:
> +	kfree(port);
> +	return ERR_PTR(rc);
> +}
> +
> +/**
> + * devm_cxl_add_port() - add a cxl_port to the topology
> + * @host: devm context / discovery agent
> + * @parent_port: immediate ancestor towards cxl_root
> + * @port_host: PCI or platform-firmware device hosting this port
> + * @target_id: ordinal id relative to other siblings under @parent_port
> + * @component_regs_phys: CXL component register base address
> + */
> +struct cxl_port *devm_cxl_add_port(struct device *host,
> +				   struct cxl_port *parent_port,
> +				   struct device *port_host, int target_id,
> +				   resource_size_t component_regs_phys)
> +{
> +	struct cxl_port *port;
> +	struct device *dev;
> +	int rc;
> +
> +	port = cxl_port_alloc(parent_port, port_host, target_id,
> +			      component_regs_phys);
> +	if (IS_ERR(port))
> +		return port;
> +
> +	dev = &port->dev;
> +	rc = dev_set_name(dev, "port%d", port->id);
> +	if (rc)
> +		goto err;
> +
> +	rc = device_add(dev);
> +	if (rc)
> +		goto err;
> +
> +	rc = devm_add_action_or_reset(host, unregister_dev, dev);
> +	if (rc)
> +		return ERR_PTR(rc);
> +
> +	rc = devm_cxl_link_port(host, port);
> +	if (rc)
> +		return ERR_PTR(rc);
> +
> +	return port;
> +
> +err:
> +	put_device(dev);
> +	return ERR_PTR(rc);
> +}
> +EXPORT_SYMBOL_GPL(devm_cxl_add_port);
> +
>  /*
>   * cxl_setup_device_regs() - Detect CXL Device register blocks
>   * @dev: Host device of the @base mapping
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 559f8343fee4..0211f44c95a2 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -134,5 +134,10 @@ struct cxl_address_space_dev *to_cxl_address_space(struct device *dev);
>  struct cxl_root *devm_cxl_add_root(struct device *parent,
>  				   struct cxl_address_space *cxl_space,
>  				   int nr_spaces);
> +struct cxl_port *devm_cxl_add_port(struct device *host,
> +				   struct cxl_port *parent_port,
> +				   struct device *port_host, int target_id,
> +				   resource_size_t component_regs_phys);
> +
>  extern struct bus_type cxl_bus_type;
>  #endif /* __CXL_H__ */
>
Dan Williams April 9, 2021, 2:13 a.m. UTC | #3
Hi Bjorn, thanks for taking a look.


On Thu, Apr 8, 2021 at 3:42 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> [+cc Greg, Rafael, Matthew: device model questions]
>
> Hi Dan,
>
> On Thu, Apr 01, 2021 at 07:31:20AM -0700, Dan Williams wrote:
> > Once the cxl_root is established then other ports in the hierarchy can
> > be attached. The cxl_port object, unlike cxl_root that is associated
> > with host bridges, is associated with PCIE Root Ports or PCIE Switch
> > Ports. Add cxl_port instances for all PCIE Root Ports in an ACPI0016
> > host bridge.
>
> I'm not a device model expert, but I'm not sure about adding a new
> /sys/bus/cxl/devices hierarchy.  I'm under the impression that CXL
> devices will be enumerated by the PCI core as PCIe devices.

Yes, PCIe is involved, but mostly only for the CXL.io slow path
(configuration and provisioning via mailbox) when we're talking about
memory expander devices (CXL calls these Type-3). So-called "Type-3"
support is the primary driver of this infrastructure.

You might be thinking of CXL accelerator devices that will look like
plain PCIe devices that happen to participate in the CPU cache
hierarchy (CXL calls these Type-1). There will also be accelerator
devices that want to share coherent memory with the system (CXL calls
these Type-2).

The infrastructure being proposed here is primarily for the memory
expander (Type-3) device case where the PCI sysfs hierarchy is wholly
unsuited for modeling it. A single CXL memory region device may span
multiple endpoints, switches, and host bridges. It poses similar
stress to an OS device model as RAID where there is a driver for the
component contributors to an upper level device / driver that exposes
the RAID Volume (CXL memory region interleave set). The CXL memory
decode space (HDM: Host Managed Device Memory) is independent of the
PCIe MMIO BAR space.

That's where the /sys/bus/cxl hierarchy is needed, to manage the HDM
space across the CXL topology in a way that is foreign to PCIE (HDM
Decoder hierarchy).

> Doesn't
> that mean we will have one struct device in the pci_dev, and another
> one in the cxl_port?

Yes, that is the proposal.

> That seems like an issue to me.  More below.

hmm...

>
> > The cxl_port instances for PCIE Switch Ports are not
> > included here as those are to be modeled as another service device
> > registered on the pcie_port_bus_type.
>
> I'm hesitant about the idea of adding more uses of pcie_port_bus_type.
> I really dislike portdrv because it makes a parallel hierarchy:
>
>   /sys/bus/pci
>   /sys/bus/pci_express
>
> for things that really should not be different.  There's a struct
> device in pci_dev, and potentially several pcie_devices, each with
> another struct device.  We make these pcie_device things for AER, DPC,
> hotplug, etc.  E.g.,
>
>   /sys/bus/pci/devices/0000:00:1c.0
>   /sys/bus/pci_express/devices/0000:00:1c.0:pcie002  # AER
>   /sys/bus/pci_express/devices/0000:00:1c.0:pcie010  # BW notification
>
> These are all the same PCI device.  AER is a PCI capability.
> Bandwidth notification is just a feature of all Downstream Ports.  I
> think it makes zero sense to have extra struct devices for them.  From
> a device point of view (enumeration, power management, VM assignment),
> we can't manage them separately from the underlying PCI device.  For
> example, we have three separate "power/" directories, but obviously
> there's only one point of control (00:1c.0):
>
>   /sys/devices/pci0000:00/0000:00:1c.0/power/
>   /sys/devices/pci0000:00/0000:00:1c.0/0000:00:1c.0:pcie002/power/
>   /sys/devices/pci0000:00/0000:00:1c.0/0000:00:1c.0:pcie010/power/

The superfluous power/ issue can be cleaned up with
device_set_pm_not_required().

What other problems does this pose? In other areas, this ability to
subdivide a device's functionality into sub-drivers is a useful
organizing principle, so much so that several device-writer teams came
together to create the auxiliary-bus for the purpose of allowing
sub-drivers to be carved off for independent functionality, similar to
the portdrv organization.

That said, I'm open to CXL switch support *not* building on the
portdrv model, but I'm not yet on the same page with your concern.
Dan Williams April 13, 2021, 5:18 p.m. UTC | #4
On Thu, Apr 8, 2021 at 7:13 PM Dan Williams <dan.j.williams@intel.com> wrote:
>
> Hi Bjorn, thanks for taking a look.
>
>
> On Thu, Apr 8, 2021 at 3:42 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
> >
> > [+cc Greg, Rafael, Matthew: device model questions]
> >
> > Hi Dan,
> >
> > On Thu, Apr 01, 2021 at 07:31:20AM -0700, Dan Williams wrote:
> > > Once the cxl_root is established then other ports in the hierarchy can
> > > be attached. The cxl_port object, unlike cxl_root that is associated
> > > with host bridges, is associated with PCIE Root Ports or PCIE Switch
> > > Ports. Add cxl_port instances for all PCIE Root Ports in an ACPI0016
> > > host bridge.
> >
> > I'm not a device model expert, but I'm not sure about adding a new
> > /sys/bus/cxl/devices hierarchy.  I'm under the impression that CXL
> > devices will be enumerated by the PCI core as PCIe devices.
>
> Yes, PCIe is involved, but mostly only for the CXL.io slow path
> (configuration and provisioning via mailbox) when we're talking about
> memory expander devices (CXL calls these Type-3). So-called "Type-3"
> support is the primary driver of this infrastructure.
>
> You might be thinking of CXL accelerator devices that will look like
> plain PCIe devices that happen to participate in the CPU cache
> hierarchy (CXL calls these Type-1). There will also be accelerator
> devices that want to share coherent memory with the system (CXL calls
> these Type-2).
>
> The infrastructure being proposed here is primarily for the memory
> expander (Type-3) device case where the PCI sysfs hierarchy is wholly
> unsuited for modeling it. A single CXL memory region device may span
> multiple endpoints, switches, and host bridges. It poses similar
> stress to an OS device model as RAID where there is a driver for the
> component contributors to an upper level device / driver that exposes
> the RAID Volume (CXL memory region interleave set). The CXL memory
> decode space (HDM: Host Managed Device Memory) is independent of the
> PCIe MMIO BAR space.
>
> That's where the /sys/bus/cxl hierarchy is needed, to manage the HDM
> space across the CXL topology in a way that is foreign to PCIE (HDM
> Decoder hierarchy).
>
> > Doesn't
> > that mean we will have one struct device in the pci_dev, and another
> > one in the cxl_port?
>
> Yes, that is the proposal.
>
> > That seems like an issue to me.  More below.
>
> hmm...
>
> >
> > > The cxl_port instances for PCIE Switch Ports are not
> > > included here as those are to be modeled as another service device
> > > registered on the pcie_port_bus_type.
> >
> > I'm hesitant about the idea of adding more uses of pcie_port_bus_type.
> > I really dislike portdrv because it makes a parallel hierarchy:
> >
> >   /sys/bus/pci
> >   /sys/bus/pci_express
> >
> > for things that really should not be different.  There's a struct
> > device in pci_dev, and potentially several pcie_devices, each with
> > another struct device.  We make these pcie_device things for AER, DPC,
> > hotplug, etc.  E.g.,
> >
> >   /sys/bus/pci/devices/0000:00:1c.0
> >   /sys/bus/pci_express/devices/0000:00:1c.0:pcie002  # AER
> >   /sys/bus/pci_express/devices/0000:00:1c.0:pcie010  # BW notification
> >
> > These are all the same PCI device.  AER is a PCI capability.
> > Bandwidth notification is just a feature of all Downstream Ports.  I
> > think it makes zero sense to have extra struct devices for them.  From
> > a device point of view (enumeration, power management, VM assignment),
> > we can't manage them separately from the underlying PCI device.  For
> > example, we have three separate "power/" directories, but obviously
> > there's only one point of control (00:1c.0):
> >
> >   /sys/devices/pci0000:00/0000:00:1c.0/power/
> >   /sys/devices/pci0000:00/0000:00:1c.0/0000:00:1c.0:pcie002/power/
> >   /sys/devices/pci0000:00/0000:00:1c.0/0000:00:1c.0:pcie010/power/
>
> The superfluous power/ issue can be cleaned up with
> device_set_pm_not_required().
>
> What are the other problems this poses, because in other areas this
> ability to subdivide a device's functionality into sub-drivers is a
> useful organization principle? So much so that several device writer
> teams came together to create the auxiliary-bus for the purpose of
> allowing sub-drivers to be carved off for independent functionality
> similar to the portdrv organization.
>

Bjorn, any further thoughts on this?

This port architecture question is on the critical path for the next
phase of CXL development (targeting v5.14, not v5.13).
Bjorn Helgaas April 14, 2021, 1:14 a.m. UTC | #5
On Thu, Apr 08, 2021 at 07:13:38PM -0700, Dan Williams wrote:
> Hi Bjorn, thanks for taking a look.
> 
> On Thu, Apr 8, 2021 at 3:42 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
> >
> > [+cc Greg, Rafael, Matthew: device model questions]
> >
> > Hi Dan,
> >
> > On Thu, Apr 01, 2021 at 07:31:20AM -0700, Dan Williams wrote:
> > > Once the cxl_root is established then other ports in the hierarchy can
> > > be attached. The cxl_port object, unlike cxl_root that is associated
> > > with host bridges, is associated with PCIE Root Ports or PCIE Switch
> > > Ports. Add cxl_port instances for all PCIE Root Ports in an ACPI0016
> > > host bridge.

Incidentally, "PCIe" is the abbreviation used in the PCIe specs, so I
try to use that instead of "PCIE" in drivers/pci/.

> > I'm not a device model expert, but I'm not sure about adding a new
> > /sys/bus/cxl/devices hierarchy.  I'm under the impression that CXL
> > devices will be enumerated by the PCI core as PCIe devices.
> 
> Yes, PCIe is involved, but mostly only for the CXL.io slow path
> (configuration and provisioning via mailbox) when we're talking about
> memory expander devices (CXL calls these Type-3). So-called "Type-3"
> support is the primary driver of this infrastructure.
>
> You might be thinking of CXL accelerator devices that will look like
> plain PCIe devices that happen to participate in the CPU cache
> hierarchy (CXL calls these Type-1). There will also be accelerator
> devices that want to share coherent memory with the system (CXL calls
> these Type-2).

IIUC all these CXL devices will be enumerated by the PCI core.  They
seem to have regular PCI BARs (separate from the HDM stuff), so the
PCI core will presumably manage address allocation for them.  It looks
like Function Level Reset and hotplug are supposed to use the regular
PCIe code.  I guess this will all be visible via lspci just like
regular PCI devices, right?

> The infrastructure being proposed here is primarily for the memory
> expander (Type-3) device case where the PCI sysfs hierarchy is wholly
> unsuited for modeling it. A single CXL memory region device may span
> multiple endpoints, switches, and host bridges. It poses similar
> stress to an OS device model as RAID where there is a driver for the
> component contributors to an upper level device / driver that exposes
> the RAID Volume (CXL memory region interleave set). The CXL memory
> decode space (HDM: Host Managed Device Memory) is independent of the
> PCIe MMIO BAR space.

It looks like you add a cxl_port for each ACPI0016 device and every
PCIe Root Port below it.  So I guess the upper level spanning is at a
higher level than cxl_port?

> That's where the /sys/bus/cxl hierarchy is needed, to manage the HDM
> space across the CXL topology in a way that is foreign to PCIE (HDM
> Decoder hierarchy).

When we do FLR on the PCIe device, what happens to these CXL clients?
Do they care?  Are they notified?  Do they need to do anything before
or after the FLR?

What about hotplug?  Spec says it leverages PCIe hotplug, but it looks
like maybe this all requires ACPI hotplug (acpiphp) for adding
ACPI0017 devices and notifying of hot remove requests?  If it uses
PCIe native hotplug (pciehp), what connects the CXL side to the PCI
side?

I guess the HDM address space management is entirely outside the scope
of PCI -- the address space is not described by the CXL host bridge
_CRS and not described by CXL endpoint BARs?  Where *is* it described
and who manages and allocates it?  I guess any transaction routing
through the CXL fabric for HDM space is also completely outside the
scope of PCI -- we don't need to worry about managing PCI-to-PCI
bridge windows, for instance?

Is there a cxl_register_driver() or something?  I assume there will be
drivers that need to manage CXL devices?  Or will they use
pci_register_driver() and search for a CXL capability?

> > Doesn't that mean we will have one struct device in the pci_dev,
> > and another one in the cxl_port?
> 
> Yes, that is the proposal.

> The superfluous power/ issue can be cleaned up with
> device_set_pm_not_required().

Thanks, we might be able to use that for portdrv.  I added it to my
list to investigate.

> What are the other problems this poses, because in other areas this
> ability to subdivide a device's functionality into sub-drivers is a
> useful organization principle?

Well, I'm thinking about things like enumeration, hotplug, reset,
resource management (BARs, bridge windows, etc), interrupts, power
management (suspend, resume, etc), and error reporting.  These are all
things that PCIe defines on a per-Function basis and seem kind of hard
to cleanly subdivide.

> So much so that several device writer teams came together to create
> the auxiliary-bus for the purpose of allowing sub-drivers to be
> carved off for independent functionality similar to the portdrv
> organization.

Is "auxiliary-bus" a specific thing?  I'm not familiar with it but
again I'd like to read up on it in case it has ideas we could
leverage.

Sub-drivers *is* an issue for PCI in general, although mostly I think
it tends to be historical devices where people made the design mistake
of putting several unrelated pieces of functionality in the same PCI
function, so I don't think PCI has good infrastructure for doing that.

Bjorn
Dan Williams April 15, 2021, 5:21 a.m. UTC | #6
On Tue, Apr 13, 2021 at 6:15 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Thu, Apr 08, 2021 at 07:13:38PM -0700, Dan Williams wrote:
> > Hi Bjorn, thanks for taking a look.
> >
> > On Thu, Apr 8, 2021 at 3:42 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > >
> > > [+cc Greg, Rafael, Matthew: device model questions]
> > >
> > > Hi Dan,
> > >
> > > On Thu, Apr 01, 2021 at 07:31:20AM -0700, Dan Williams wrote:
> > > > Once the cxl_root is established then other ports in the hierarchy can
> > > > be attached. The cxl_port object, unlike cxl_root that is associated
> > > > with host bridges, is associated with PCIE Root Ports or PCIE Switch
> > > > Ports. Add cxl_port instances for all PCIE Root Ports in an ACPI0016
> > > > host bridge.
>
> Incidentally, "PCIe" is the abbreviation used in the PCIe specs, so I
> try to use that instead of "PCIE" in drivers/pci/.

Noted.

>
> > > I'm not a device model expert, but I'm not sure about adding a new
> > > /sys/bus/cxl/devices hierarchy.  I'm under the impression that CXL
> > > devices will be enumerated by the PCI core as PCIe devices.
> >
> > Yes, PCIe is involved, but mostly only for the CXL.io slow path
> > (configuration and provisioning via mailbox) when we're talking about
> > memory expander devices (CXL calls these Type-3). So-called "Type-3"
> > support is the primary driver of this infrastructure.
> >
> > You might be thinking of CXL accelerator devices that will look like
> > plain PCIe devices that happen to participate in the CPU cache
> > hierarchy (CXL calls these Type-1). There will also be accelerator
> > devices that want to share coherent memory with the system (CXL calls
> > these Type-2).
>
> IIUC all these CXL devices will be enumerated by the PCI core.  They
> seem to have regular PCI BARs (separate from the HDM stuff), so the
> PCI core will presumably manage address allocation for them.  It looks
> like Function Level Reset and hotplug are supposed to use the regular
> PCIe code.  I guess this will all be visible via lspci just like
> regular PCI devices, right?

Yes, the CXL.io protocol is synonymous with PCIe. Hotplug is native
PCIe hotplug to negotiate getting the card online and offline.
Although, for offline an additional constraint is to deny removal
whenever the card has active pages in the page allocator. Similar to
what happens today for ACPI memory hotplug where the OS can say "nope,
there's still active pages in the range you asked to eject".

FLR has no effect on CXL.cache or CXL.mem state, only CXL.io.

> > The infrastructure being proposed here is primarily for the memory
> > expander (Type-3) device case where the PCI sysfs hierarchy is wholly
> > unsuited for modeling it. A single CXL memory region device may span
> > multiple endpoints, switches, and host bridges. It poses similar
> > stress to an OS device model as RAID where there is a driver for the
> > component contributors to an upper level device / driver that exposes
> > the RAID Volume (CXL memory region interleave set). The CXL memory
> > decode space (HDM: Host Managed Device Memory) is independent of the
> > PCIe MMIO BAR space.
>
> It looks like you add a cxl_port for each ACPI0016 device and every
> PCIe Root Port below it.  So I guess the upper level spanning is at a
> higher level than cxl_port?

A memory interleave can span any level of the hierarchy. It can be
across host bridges at the top level, but also incorporate a leaf
device at the bottom of a CXL switch hierarchy. There will be a
cxl_port instance for each side of each link.

> > That's where the /sys/bus/cxl hierarchy is needed, to manage the HDM
> > space across the CXL topology in a way that is foreign to PCIE (HDM
> > Decoder hierarchy).
>
> When we do FLR on the PCIe device, what happens to these CXL clients?
> Do they care?  Are they notified?  Do they need to do anything before
> or after the FLR?

Per CXL Spec:

"FLR has no effect on the CXL.cache and CXL.mem protocol. Any
CXL.cache and CXL.mem related control registers including CXL DVSEC
structures and state held by the CXL device are not affected by FLR.
The memory controller hosting the HDM is not reset by FLR."

> What about hotplug?  Spec says it leverages PCIe hotplug, but it looks
> like maybe this all requires ACPI hotplug (acpiphp) for adding
> ACPI0017 devices and notifying of hot remove requests?  If it uses
> PCIe native hotplug (pciehp), what connects the CXL side to the PCI
> side?

No, ACPI hotplug is not involved. ACPI0017 is essentially just a dummy
anchor device on which to hang the interleave set coordination. The
connection from native hotplug to CXL is the cxl_mem driver. When it
detects a new device it walks the cxl_port hierarchy to see if one is a
parent of this endpoint. Then it registers its HDM decoders with the CXL
core and the CXL core can online it as a standalone interleave set or
consolidate it with others to make a wider set. For persistent memory
there is on-device metadata to recall whether this device was part of
a set previously. For volatile-only devices it would need to rely on
some policy to decide if devices are immediately onlined standalone,
or wait for an administrator to configure them.

> I guess the HDM address space management is entirely outside the scope
> of PCI -- the address space is not described by the CXL host bridge
> _CRS and not described by CXL endpoint BARs?

Correct.

> Where *is* it described
> and who manages and allocates it?

ACPI0017 will communicate a set of address spaces from which the CXL
core can allocate interleave sets.

>  I guess any transaction routing
> through the CXL fabric for HDM space is also completely outside the
> scope of PCI -- we don't need to worry about managing PCI-to-PCI
> bridge windows, for instance?

Correct. For example a PCIe switch could disable all I/O space and
Memory (MMIO) space, but still decode Host-managed Device Memory (HDM)
space.

> Is there a cxl_register_driver() or something?  I assume there will be
> drivers that need to manage CXL devices?  Or will they use
> pci_register_driver() and search for a CXL capability?

A bit of both. The cxl_mem driver does pci_register_driver(), but for
ports there will be a driver on the CXL bus for that component
capability. Both endpoints and switches will produce cxl_port
instances to be connected / driven by a core driver and coordinated
with a root level driver for address space and interleave management.

> > > Doesn't that mean we will have one struct device in the pci_dev,
> > > and another one in the cxl_port?
> >
> > Yes, that is the proposal.
>
> > The superfluous power/ issue can be cleaned up with
> > device_set_pm_not_required().
>
> Thanks, we might be able to use that for portdrv.  I added it to my
> list to investigate.
>
> > What are the other problems this poses, because in other areas this
> > ability to subdivide a device's functionality into sub-drivers is a
> > useful organization principle?
>
> Well, I'm thinking about things like enumeration, hotplug, reset,
> resource management (BARs, bridge windows, etc), interrupts, power
> management (suspend, resume, etc), and error reporting.  These are all
> things that PCIe defines on a per-Function basis and seem kind of hard
> to cleanly subdivide.

Right, I'm hoping like FLR there is little need to coordinate PCI /
CXL.io operations with CXL.mem operations, or that once a PCI driver
registers some CXL capabilities it never needs to look back. The only
hook that violates this so far is NAKing device removal when CXL.mem
for that device is busy.

> > So much so that several device writer teams came together to create
> > the auxiliary-bus for the purpose of allowing sub-drivers to be
> > carved off for independent functionality similar to the portdrv
> > organization.
>
> Is "auxiliary-bus" a specific thing?  I'm not familiar with it but
> again I'd like to read up on it in case it has ideas we could
> leverage.

auxiliary-bus is not a specific thing, it's a generic way for any
driver to register a custom device for a sub-driver to drive. One of
the primary examples is PCI Ethernet drivers exporting RDMA device
interfaces for common RDMA functionality. So you could have multiple
generations of Ethernet devices all producing a common RDMA interface
and rather than have an equivalent RDMA driver per generation just
create a shared common one that attaches to all the different baseline
Ethernet implementations.

See:

Documentation/driver-api/auxiliary_bus.rst

That document is still a bit too generic, and I have an item on my
backlog to flesh it out with more practical guidelines.

> Sub-drivers *is* an issue for PCI in general, although mostly I think
> it tends to be historical devices where people made the design mistake
> of putting several unrelated pieces of functionality in the same PCI
> function, so I don't think PCI has good infrastructure for doing that.

Auxiliary-bus might help especially if those unrelated pieces have
been duplicated across multiple different device implementations.
Aux-bus might clean up the driver model for those pieces.
diff mbox series

Patch

diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index d54c2d5de730..bc2a35ae880b 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -5,18 +5,117 @@ 
 #include <linux/device.h>
 #include <linux/kernel.h>
 #include <linux/acpi.h>
+#include <linux/pci.h>
 #include "cxl.h"
 
+/*
+ * bus_find_device() match callback: report a match (1) when the ACPI
+ * hardware ID of @dev is exactly "ACPI0016", otherwise 0.  The @host
+ * cookie is not consulted.
+ */
+static int match_ACPI0016(struct device *dev, const void *host)
+{
+	const char *hid = acpi_device_hid(to_acpi_device(dev));
+
+	if (strcmp(hid, "ACPI0016") != 0)
+		return 0;
+	return 1;
+}
+
+struct cxl_walk_context {	/* pci_walk_bus() state for match_add_root_ports() */
+	struct device *dev;	/* passed as devm host and used for dev_dbg() */
+	struct pci_bus *root;	/* only devices directly on this bus are considered */
+	struct cxl_port *port;	/* parent port for each root port that is added */
+	int error;	/* PTR_ERR() of a failed devm_cxl_add_port(), else 0 */
+	int count;	/* root ports matched so far; also the next target_id */
+};
+
+/*
+ * pci_walk_bus() callback: for every PCIe Root Port that sits directly on
+ * the bus recorded in the walk context, register a cxl_port beneath the
+ * context's parent port.  Matched ports are numbered in discovery order
+ * and that ordinal becomes the new port's target_id.
+ *
+ * Returns 0 to continue the walk (including for devices that are skipped)
+ * or the negative errno from devm_cxl_add_port(), which is also stored in
+ * the context's error field.
+ */
+static int match_add_root_ports(struct pci_dev *pdev, void *data)
+{
+	struct cxl_walk_context *ctx = data;
+	struct cxl_port *child;
+	int target_id;
+
+	/* Skip anything that is not a PCIe Root Port on the root bus */
+	if (pdev->bus != ctx->root || !pci_is_pcie(pdev) ||
+	    pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT)
+		return 0;
+
+	target_id = ctx->count++;
+
+	/* TODO walk DVSEC to find component register base */
+	child = devm_cxl_add_port(ctx->dev, ctx->port, &pdev->dev, target_id,
+				  (resource_size_t)-1);
+	if (IS_ERR(child)) {
+		ctx->error = PTR_ERR(child);
+		return ctx->error;
+	}
+
+	dev_dbg(ctx->dev, "%s: register: %s\n", dev_name(&pdev->dev),
+		dev_name(&child->dev));
+
+	return 0;
+}
+
+/*
+ * Register a cxl_port for the host bridge @root beneath @port, then walk
+ * the bridge's PCI root bus and register one child cxl_port per root
+ * port found there.  A host bridge may contain one or more root ports.
+ *
+ * Returns -ENXIO when @root has no PCI root, the devm_cxl_add_port()
+ * error when the bridge port cannot be added, -ENODEV when the walk
+ * matched no root ports, and otherwise the error (0 on success) recorded
+ * by the walk.
+ */
+static int cxl_acpi_register_ports(struct device *dev, struct acpi_device *root,
+				   struct cxl_port *port, int idx)
+{
+	struct acpi_pci_root *pci_root = acpi_pci_find_root(root->handle);
+	struct cxl_walk_context ctx = { 0 };
+	struct cxl_port *bridge_port;
+
+	if (!pci_root)
+		return -ENXIO;
+
+	/* TODO: fold in CEDT.CHBS retrieval */
+	bridge_port = devm_cxl_add_port(dev, port, &root->dev, idx, ~0ULL);
+	if (IS_ERR(bridge_port))
+		return PTR_ERR(bridge_port);
+	dev_dbg(dev, "%s: register: %s\n", dev_name(&root->dev),
+		dev_name(&bridge_port->dev));
+
+	ctx.dev = dev;
+	ctx.root = pci_root->bus;
+	ctx.port = bridge_port;
+	pci_walk_bus(pci_root->bus, match_add_root_ports, &ctx);
+
+	return ctx.count ? ctx.error : -ENODEV;
+}
+
 static int cxl_acpi_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
+	struct acpi_device *adev = ACPI_COMPANION(dev);
+	struct device *bridge = NULL;
 	struct cxl_root *cxl_root;
+	int rc, i = 0;
 
 	cxl_root = devm_cxl_add_root(dev, NULL, 0);
 	if (IS_ERR(cxl_root))
 		return PTR_ERR(cxl_root);
 	dev_dbg(dev, "register: %s\n", dev_name(&cxl_root->port.dev));
 
+	while (true) {
+		bridge = bus_find_device(adev->dev.bus, bridge, dev,
+					 match_ACPI0016);
+		if (!bridge)
+			break;
+
+		rc = cxl_acpi_register_ports(dev, to_acpi_device(bridge),
+					     &cxl_root->port, i++);
+		if (rc)
+			return rc;
+	}
+
 	return 0;
 }
 
diff --git a/drivers/cxl/core.c b/drivers/cxl/core.c
index 46c3b2588d2f..65cd704581bc 100644
--- a/drivers/cxl/core.c
+++ b/drivers/cxl/core.c
@@ -148,6 +148,15 @@  static void cxl_root_release(struct device *dev)
 	kfree(cxl_root);
 }
 
+/*
+ * Release callback for cxl_port devices: return the port id to the ida,
+ * drop the reference held on the hosting device and free the port.
+ */
+static void cxl_port_release(struct device *dev)
+{
+	struct cxl_port *cxl_port = to_cxl_port(dev);
+	struct device *host = cxl_port->port_host;
+	int id = cxl_port->id;
+
+	ida_free(&cxl_port_ida, id);
+	put_device(host);
+	kfree(cxl_port);
+}
+
 static ssize_t target_id_show(struct device *dev, struct device_attribute *attr,
 			      char *buf)
 {
@@ -178,6 +187,12 @@  static const struct device_type cxl_root_type = {
 	.groups = cxl_port_attribute_groups,
 };
 
+/* Device type for non-root ports; uses the same attribute groups as the root */
+static const struct device_type cxl_port_type = {
+	.name = "cxl_port",
+	.groups = cxl_port_attribute_groups,
+	.release = cxl_port_release,
+};
+
 struct cxl_root *to_cxl_root(struct device *dev)
 {
 	if (dev_WARN_ONCE(dev, dev->type != &cxl_root_type,
@@ -188,7 +203,9 @@  struct cxl_root *to_cxl_root(struct device *dev)
 
 struct cxl_port *to_cxl_port(struct device *dev)
 {
-	if (dev_WARN_ONCE(dev, dev->type != &cxl_root_type,
+	if (dev_WARN_ONCE(dev,
+			  dev->type != &cxl_root_type &&
+			  dev->type != &cxl_port_type,
 			  "not a cxl_port device\n"))
 		return NULL;
 	return container_of(dev, struct cxl_port, dev);
@@ -360,6 +377,108 @@  struct cxl_root *devm_cxl_add_root(struct device *host,
 }
 EXPORT_SYMBOL_GPL(devm_cxl_add_root);
 
+/* devm action: remove the "host" symlink from the port's sysfs directory */
+static void cxl_unlink_port(void *_port)
+{
+	struct cxl_port *cxl_port = _port;
+	struct kobject *port_kobj = &cxl_port->dev.kobj;
+
+	sysfs_remove_link(port_kobj, "host");
+}
+
+/*
+ * Create a "host" symlink in @port's sysfs directory that points at the
+ * port's hosting device, and register a devm action on @dev that removes
+ * the link again.  Returns 0 or a negative errno.
+ */
+static int devm_cxl_link_port(struct device *dev, struct cxl_port *port)
+{
+	struct kobject *port_kobj = &port->dev.kobj;
+	struct kobject *host_kobj = &port->port_host->kobj;
+	int rc = sysfs_create_link(port_kobj, host_kobj, "host");
+
+	return rc ? rc : devm_add_action_or_reset(dev, cxl_unlink_port, port);
+}
+
+/*
+ * Allocate and initialize a cxl_port without registering it.  The port
+ * gets a fresh id from cxl_port_ida, takes a reference on @port_dev and
+ * is parented to @parent_port on the cxl bus.
+ *
+ * Returns the new port, or ERR_PTR(-EINVAL) when @port_dev is NULL,
+ * ERR_PTR(-ENOMEM) when the allocation fails, or the ida_alloc() error.
+ */
+static struct cxl_port *cxl_port_alloc(struct cxl_port *parent_port,
+				       struct device *port_dev, int target_id,
+				       resource_size_t component_regs_phys)
+{
+	struct cxl_port *port;
+	int id;
+
+	if (!port_dev)
+		return ERR_PTR(-EINVAL);
+
+	port = kzalloc(sizeof(*port), GFP_KERNEL);
+	if (!port)
+		return ERR_PTR(-ENOMEM);
+
+	id = ida_alloc(&cxl_port_ida, GFP_KERNEL);
+	if (id < 0) {
+		/* device not initialized yet, so a plain kfree() is enough */
+		kfree(port);
+		return ERR_PTR(id);
+	}
+
+	port->id = id;
+	port->target_id = target_id;
+	port->port_host = get_device(port_dev);
+	port->component_regs_phys = component_regs_phys;
+
+	device_initialize(&port->dev);
+	device_set_pm_not_required(&port->dev);
+	port->dev.parent = &parent_port->dev;
+	port->dev.bus = &cxl_bus_type;
+	port->dev.type = &cxl_port_type;
+
+	return port;
+}
+
+/**
+ * devm_cxl_add_port() - add a cxl_port to the topology
+ * @host: devm context / discovery agent
+ * @parent_port: immediate ancestor towards cxl_root
+ * @port_host: PCI or platform-firmware device hosting this port
+ * @target_id: ordinal id relative to other siblings under @parent_port
+ * @component_regs_phys: CXL component register base address
+ *
+ * The port is named "port<id>", added to the device hierarchy and given a
+ * "host" symlink to @port_host.  Unregistration and link removal are
+ * registered as devm actions on @host.
+ *
+ * Return: the new port, or an ERR_PTR() on failure.
+ */
+struct cxl_port *devm_cxl_add_port(struct device *host,
+				   struct cxl_port *parent_port,
+				   struct device *port_host, int target_id,
+				   resource_size_t component_regs_phys)
+{
+	struct cxl_port *port;
+	int rc;
+
+	port = cxl_port_alloc(parent_port, port_host, target_id,
+			      component_regs_phys);
+	if (IS_ERR(port))
+		return port;
+
+	rc = dev_set_name(&port->dev, "port%d", port->id);
+	if (!rc)
+		rc = device_add(&port->dev);
+	if (rc) {
+		/* initialized but not added: drop the initial reference */
+		put_device(&port->dev);
+		return ERR_PTR(rc);
+	}
+
+	/* from here on the devm actions on @host handle teardown */
+	rc = devm_add_action_or_reset(host, unregister_dev, &port->dev);
+	if (rc)
+		return ERR_PTR(rc);
+
+	rc = devm_cxl_link_port(host, port);
+	if (rc)
+		return ERR_PTR(rc);
+
+	return port;
+}
+EXPORT_SYMBOL_GPL(devm_cxl_add_port);
+
 /*
  * cxl_setup_device_regs() - Detect CXL Device register blocks
  * @dev: Host device of the @base mapping
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 559f8343fee4..0211f44c95a2 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -134,5 +134,10 @@  struct cxl_address_space_dev *to_cxl_address_space(struct device *dev);
 struct cxl_root *devm_cxl_add_root(struct device *parent,
 				   struct cxl_address_space *cxl_space,
 				   int nr_spaces);
+struct cxl_port *devm_cxl_add_port(struct device *host,
+				   struct cxl_port *parent_port,
+				   struct device *port_host, int target_id,
+				   resource_size_t component_regs_phys);
+
 extern struct bus_type cxl_bus_type;
 #endif /* __CXL_H__ */