
[v3,31/40] cxl/memdev: Add numa_node attribute

Message ID 164298428430.3018233.16409089892707993289.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State Not Applicable
Delegated to: Bjorn Helgaas
Series CXL.mem Topology Discovery and Hotplug Support

Commit Message

Dan Williams Jan. 24, 2022, 12:31 a.m. UTC
While CXL memory targets will have their own memory target node,
individual memory devices may be affinitized like other PCI devices.
Emit that attribute for memdevs.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/ABI/testing/sysfs-bus-cxl |    9 +++++++++
 drivers/cxl/core/memdev.c               |   17 +++++++++++++++++
 tools/testing/cxl/test/cxl.c            |    1 +
 3 files changed, 27 insertions(+)

Comments

Jonathan Cameron Jan. 31, 2022, 6:41 p.m. UTC | #1
On Sun, 23 Jan 2022 16:31:24 -0800
Dan Williams <dan.j.williams@intel.com> wrote:

> While CXL memory targets will have their own memory target node,
> individual memory devices may be affinitized like other PCI devices.
> Emit that attribute for memdevs.
> 
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

Hmm. Is this just duplicating what we can get from
the PCI device?  It feels a bit like overkill to have it here
as well.

Ben Widawsky Feb. 1, 2022, 3:31 p.m. UTC | #2
On 22-01-23 16:31:24, Dan Williams wrote:
> While CXL memory targets will have their own memory target node,
> individual memory devices may be affinitized like other PCI devices.
> Emit that attribute for memdevs.
> 
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

This brings up an interesting question. Are all devices in a region affinitized
to the same NUMA node? I think they must be - at which point, should this
attribute be a part of a region, rather than a device?

> ---
>  Documentation/ABI/testing/sysfs-bus-cxl |    9 +++++++++
>  drivers/cxl/core/memdev.c               |   17 +++++++++++++++++
>  tools/testing/cxl/test/cxl.c            |    1 +
>  3 files changed, 27 insertions(+)
> 
> diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
> index 87c0e5e65322..0b51cfec0c66 100644
> --- a/Documentation/ABI/testing/sysfs-bus-cxl
> +++ b/Documentation/ABI/testing/sysfs-bus-cxl
> @@ -34,6 +34,15 @@ Description:
>  		capability. Mandatory for CXL devices, see CXL 2.0 8.1.12.2
>  		Memory Device PCIe Capabilities and Extended Capabilities.
>  
> +What:		/sys/bus/cxl/devices/memX/numa_node
> +Date:		January, 2022
> +KernelVersion:	v5.18
> +Contact:	linux-cxl@vger.kernel.org
> +Description:
> +		(RO) If NUMA is enabled and the platform has affinitized the
> +		host PCI device for this memory device, emit the CPU node
> +		affinity for this device.
> +

I think you'd want to say something about the device actively decoding. Perhaps
I'm mistaken though, can you affinitize without setting up HDM decoders for the
device?

Jonathan Cameron Feb. 1, 2022, 3:49 p.m. UTC | #3
On Tue, 1 Feb 2022 07:31:54 -0800
Ben Widawsky <ben.widawsky@intel.com> wrote:

> On 22-01-23 16:31:24, Dan Williams wrote:
> > While CXL memory targets will have their own memory target node,
> > individual memory devices may be affinitized like other PCI devices.
> > Emit that attribute for memdevs.
> > 
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>  
> 
> This brings up an interesting question. Are all devices in a region affinitized
> to the same NUMA node? I think they must be - at which point, should this
> attribute be a part of a region, rather than a device?

No particular reason why they should be in the same NUMA node
in general. People occasionally do memory interleave across memory
controllers on different CPU sockets (in extreme cases).
Whilst, at the interleave set level, that will have a single numa
domain, the individual devices making it up could be all over
the place and it will depend on the configuration.

> 
> > ---
> >  Documentation/ABI/testing/sysfs-bus-cxl |    9 +++++++++
> >  drivers/cxl/core/memdev.c               |   17 +++++++++++++++++
> >  tools/testing/cxl/test/cxl.c            |    1 +
> >  3 files changed, 27 insertions(+)
> > 
> > diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
> > index 87c0e5e65322..0b51cfec0c66 100644
> > --- a/Documentation/ABI/testing/sysfs-bus-cxl
> > +++ b/Documentation/ABI/testing/sysfs-bus-cxl
> > @@ -34,6 +34,15 @@ Description:
> >  		capability. Mandatory for CXL devices, see CXL 2.0 8.1.12.2
> >  		Memory Device PCIe Capabilities and Extended Capabilities.
> >  
> > +What:		/sys/bus/cxl/devices/memX/numa_node
> > +Date:		January, 2022
> > +KernelVersion:	v5.18
> > +Contact:	linux-cxl@vger.kernel.org
> > +Description:
> > +		(RO) If NUMA is enabled and the platform has affinitized the
> > +		host PCI device for this memory device, emit the CPU node
> > +		affinity for this device.
> > +  
> 
> I think you'd want to say something about the device actively decoding. Perhaps
> I'm mistaken though, can you affinitize without setting up HDM decoders for the
> device?

It's possible for PCI devices (up to a bug I should dig out the fix for)
to be placed in their own NUMA domains, or gain them from the root ports / host
bridges.  The magic of generic initiators and fiddly ACPI DSDT files that
the BIOS might want to create.

Ben Widawsky Feb. 1, 2022, 4:35 p.m. UTC | #4
On 22-02-01 15:49:41, Jonathan Cameron wrote:
> On Tue, 1 Feb 2022 07:31:54 -0800
> Ben Widawsky <ben.widawsky@intel.com> wrote:
> 
> > On 22-01-23 16:31:24, Dan Williams wrote:
> > > While CXL memory targets will have their own memory target node,
> > > individual memory devices may be affinitized like other PCI devices.
> > > Emit that attribute for memdevs.
> > > 
> > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>  
> > 
> > This brings up an interesting question. Are all devices in a region affinitized
> > to the same NUMA node? I think they must be - at which point, should this
> > attribute be a part of a region, rather than a device?
> 
> No particular reason why they should be in the same NUMA node
> in general. People occasionally do memory interleave across memory
> controllers on different CPU sockets (in extreme cases).
> Whilst, at the interleave set level, that will have a single numa
> domain, the individual devices making it up could be all over
> the place and it will depend on the configuration.

There's no such thing as a non-interleave set though. Everything is a region. A
x1 region is a region with one device.

Jonathan Cameron Feb. 1, 2022, 5:38 p.m. UTC | #5
On Tue, 1 Feb 2022 08:35:18 -0800
Ben Widawsky <ben.widawsky@intel.com> wrote:

> On 22-02-01 15:49:41, Jonathan Cameron wrote:
> > On Tue, 1 Feb 2022 07:31:54 -0800
> > Ben Widawsky <ben.widawsky@intel.com> wrote:
> >   
> > > On 22-01-23 16:31:24, Dan Williams wrote:  
> > > > While CXL memory targets will have their own memory target node,
> > > > individual memory devices may be affinitized like other PCI devices.
> > > > Emit that attribute for memdevs.
> > > > 
> > > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>    
> > > 
> > > This brings up an interesting question. Are all devices in a region affinitized
> > > to the same NUMA node? I think they must be - at which point, should this
> > > attribute be a part of a region, rather than a device?  
> > 
> > No particular reason why they should be in the same NUMA node
> > in general. People occasionally do memory interleave across memory
> > controllers on different CPU sockets (in extreme cases).
> > Whilst, at the interleave set level, that will have a single numa
> > domain, the individual devices making it up could be all over
> > the place and it will depend on the configuration.  
> 
> There's no such thing as a non-interleave set though. Everything is a region. A
> x1 region is a region with one device.

Well, sort of.  That is the representation we are going with, but the
reality is it's made up of a number of physical devices, and those may
have their own NUMA domains; it may be useful to a user / admin to know
what those are (as well as the domain of the resulting region).


Dan Williams Feb. 1, 2022, 11:57 p.m. UTC | #6
On Mon, Jan 31, 2022 at 10:41 AM Jonathan Cameron
<Jonathan.Cameron@huawei.com> wrote:
>
> On Sun, 23 Jan 2022 16:31:24 -0800
> Dan Williams <dan.j.williams@intel.com> wrote:
>
> > While CXL memory targets will have their own memory target node,
> > individual memory devices may be affinitized like other PCI devices.
> > Emit that attribute for memdevs.
> >
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>
> Hmm. Is this just duplicating what we can get from
> the PCI device?  It feels a bit like overkill to have it here
> as well.

Not all cxl_memdevs are associated with PCI devices.
Dan Williams Feb. 1, 2022, 11:59 p.m. UTC | #7
On Tue, Feb 1, 2022 at 7:32 AM Ben Widawsky <ben.widawsky@intel.com> wrote:
>
> On 22-01-23 16:31:24, Dan Williams wrote:
> > While CXL memory targets will have their own memory target node,
> > individual memory devices may be affinitized like other PCI devices.
> > Emit that attribute for memdevs.
> >
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>
> This brings up an interesting question. Are all devices in a region affinitized
> to the same NUMA node? I think they must be - at which point, should this
> attribute be a part of a region, rather than a device?

This attribute is only here so that 'cxl list' can convey which CPU
node platform firmware has affinitized the memory device to. This is
for enumeration questions like, "how many memory devices are on
socket 0". The region NUMA node / affinity is wholly separate from
this number.
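
As an aside, here is a minimal userspace sketch of that "how many memory
devices are on socket 0" query, assuming the attribute lands at
/sys/bus/cxl/devices/memX/numa_node as documented in the patch. This
program is not part of the patch; it only illustrates the intended
consumer.

/*
 * Count how many CXL memdevs are affinitized to a given NUMA node by
 * reading the proposed numa_node sysfs attribute.
 */
#include <glob.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	int want = argc > 1 ? atoi(argv[1]) : 0;
	int node, count = 0;
	glob_t g;
	size_t i;

	/* No matches also covers !CONFIG_NUMA, where the attribute is hidden */
	if (glob("/sys/bus/cxl/devices/mem*/numa_node", 0, NULL, &g))
		return 1;

	for (i = 0; i < g.gl_pathc; i++) {
		FILE *f = fopen(g.gl_pathv[i], "r");

		if (!f)
			continue;
		if (fscanf(f, "%d", &node) == 1 && node == want)
			count++;
		fclose(f);
	}
	globfree(&g);
	printf("%d memdev(s) on node %d\n", count, want);
	return 0;
}
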
Dan Williams Feb. 2, 2022, 1:18 a.m. UTC | #8
On Tue, Feb 1, 2022 at 7:32 AM Ben Widawsky <ben.widawsky@intel.com> wrote:
>
> On 22-01-23 16:31:24, Dan Williams wrote:
> > While CXL memory targets will have their own memory target node,
> > individual memory devices may be affinitized like other PCI devices.
> > Emit that attribute for memdevs.
> >
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>
> This brings up an interesting question. Are all devices in a region affinitized
> to the same NUMA node? I think they must be - at which point, should this
> attribute be a part of a region, rather than a device?
>
> > ---
> >  Documentation/ABI/testing/sysfs-bus-cxl |    9 +++++++++
> >  drivers/cxl/core/memdev.c               |   17 +++++++++++++++++
> >  tools/testing/cxl/test/cxl.c            |    1 +
> >  3 files changed, 27 insertions(+)
> >
> > diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
> > index 87c0e5e65322..0b51cfec0c66 100644
> > --- a/Documentation/ABI/testing/sysfs-bus-cxl
> > +++ b/Documentation/ABI/testing/sysfs-bus-cxl
> > @@ -34,6 +34,15 @@ Description:
> >               capability. Mandatory for CXL devices, see CXL 2.0 8.1.12.2
> >               Memory Device PCIe Capabilities and Extended Capabilities.
> >
> > +What:                /sys/bus/cxl/devices/memX/numa_node
> > +Date:                January, 2022
> > +KernelVersion:       v5.18
> > +Contact:     linux-cxl@vger.kernel.org
> > +Description:
> > +             (RO) If NUMA is enabled and the platform has affinitized the
> > +             host PCI device for this memory device, emit the CPU node
> > +             affinity for this device.
> > +
>
> I think you'd want to say something about the device actively decoding. Perhaps
> I'm mistaken though, can you affinitize without setting up HDM decoders for the
> device?

Missed replying to this.

No, the memory decode is independent of the CPU-to-device affinity.
This affinity is like the affinity of an NVMe device, i.e. the
affinity of PCI.mmio to a CPU, not the resulting CXL.mem node, of
which there may be multiple for a single device.
Jonathan Cameron Feb. 2, 2022, 9:44 a.m. UTC | #9
On Tue, 1 Feb 2022 15:57:10 -0800
Dan Williams <dan.j.williams@intel.com> wrote:

> On Mon, Jan 31, 2022 at 10:41 AM Jonathan Cameron
> <Jonathan.Cameron@huawei.com> wrote:
> >
> > On Sun, 23 Jan 2022 16:31:24 -0800
> > Dan Williams <dan.j.williams@intel.com> wrote:
> >  
> > > While CXL memory targets will have their own memory target node,
> > > individual memory devices may be affinitized like other PCI devices.
> > > Emit that attribute for memdevs.
> > >
> > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>  
> >
> > Hmm. Is this just duplicating what we can get from
> > the PCI device?  It feels a bit like overkill to have it here
> > as well.  
> 
> Not all cxl_memdevs are associated with PCI devices.

Platform devices have numa nodes too...
Dan Williams Feb. 2, 2022, 3:44 p.m. UTC | #10
On Wed, Feb 2, 2022 at 1:45 AM Jonathan Cameron
<Jonathan.Cameron@huawei.com> wrote:
>
> On Tue, 1 Feb 2022 15:57:10 -0800
> Dan Williams <dan.j.williams@intel.com> wrote:
>
> > On Mon, Jan 31, 2022 at 10:41 AM Jonathan Cameron
> > <Jonathan.Cameron@huawei.com> wrote:
> > >
> > > On Sun, 23 Jan 2022 16:31:24 -0800
> > > Dan Williams <dan.j.williams@intel.com> wrote:
> > >
> > > > While CXL memory targets will have their own memory target node,
> > > > individual memory devices may be affinitized like other PCI devices.
> > > > Emit that attribute for memdevs.
> > > >
> > > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> > >
> > > Hmm. Is this just duplicating what we can get from
> > > the PCI device?  It feels a bit like overkill to have it here
> > > as well.
> >
> > Not all cxl_memdevs are associated with PCI devices.
>
> Platform devices have numa nodes too...

So what's the harm in having a numa_node attribute local to the memdev?

Yes, userspace could carry complications like:

cat $(readlink -f /sys/bus/cxl/devices/mem0)/../numa_node

...but if you take that argument to its extreme, most "numa_node"
attributes in sysfs could be eliminated because userspace can keep
walking up the hierarchy to find the numa_node versus the kernel doing
it on behalf of userspace.
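
As a rough sketch of the difference (not part of the patch; "mem0" is
just an example memdev name, and whether the parent device exposes a
numa_node attribute at all depends on its bus, which is part of the
point):

/*
 * Read the proposed leaf attribute directly, then emulate the
 * "readlink -f .../mem0" parent walk from the shell one-liner above.
 * A read of -1 means NUMA_NO_NODE, i.e. no affinity assigned.
 */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

static int read_node(const char *path)
{
	FILE *f = fopen(path, "r");
	int node = -1;

	if (f) {
		if (fscanf(f, "%d", &node) != 1)
			node = -1;
		fclose(f);
	}
	return node;
}

int main(void)
{
	char real[PATH_MAX], parent[PATH_MAX + 16];

	/* Leaf attribute added by this patch: unambiguous */
	printf("leaf:   %d\n", read_node("/sys/bus/cxl/devices/mem0/numa_node"));

	/* Parent walk: depends on the device topology matching NUMA topology */
	if (realpath("/sys/bus/cxl/devices/mem0", real)) {
		snprintf(parent, sizeof(parent), "%s/../numa_node", real);
		printf("parent: %d\n", read_node(parent));
	}
	return 0;
}
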
Jonathan Cameron Feb. 3, 2022, 9:41 a.m. UTC | #11
On Wed, 2 Feb 2022 07:44:37 -0800
Dan Williams <dan.j.williams@intel.com> wrote:

> On Wed, Feb 2, 2022 at 1:45 AM Jonathan Cameron
> <Jonathan.Cameron@huawei.com> wrote:
> >
> > On Tue, 1 Feb 2022 15:57:10 -0800
> > Dan Williams <dan.j.williams@intel.com> wrote:
> >  
> > > On Mon, Jan 31, 2022 at 10:41 AM Jonathan Cameron
> > > <Jonathan.Cameron@huawei.com> wrote:  
> > > >
> > > > On Sun, 23 Jan 2022 16:31:24 -0800
> > > > Dan Williams <dan.j.williams@intel.com> wrote:
> > > >  
> > > > > While CXL memory targets will have their own memory target node,
> > > > > individual memory devices may be affinitized like other PCI devices.
> > > > > Emit that attribute for memdevs.
> > > > >
> > > > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>  
> > > >
> > > > Hmm. Is this just duplicating what we can get from
> > > > the PCI device?  It feels a bit like overkill to have it here
> > > > as well.  
> > >
> > > Not all cxl_memdevs are associated with PCI devices.  
> >
> > Platform devices have numa nodes too...  
> 
> So what's the harm in having a numa_node attribute local to the memdev?
> 

I'm not really against it; I just wanted to raise the question of
whether we want these to go further than the granularity at which
NUMA nodes can be assigned.  Right now that is at the platform_device
or PCI EP level (from ACPI anyway).  Sure, the value might come from
higher up a hierarchy, but at least in theory it can be assigned to
individual devices.

This is pushing that description beyond that point so is worth discussing.

> Yes, userspace could carry complications like:
> 
> cat $(readlink -f /sys/bus/cxl/devices/mem0)/../numa_node
> 
> ...but if you take that argument to its extreme, most "numa_node"
> attributes in sysfs could be eliminated because userspace can keep
> walking up the hierarchy to find the numa_node versus the kernel doing
> it on behalf of userspace.
Dan Williams Feb. 3, 2022, 4:59 p.m. UTC | #12
On Thu, Feb 3, 2022 at 1:41 AM Jonathan Cameron
<Jonathan.Cameron@huawei.com> wrote:
>
> On Wed, 2 Feb 2022 07:44:37 -0800
> Dan Williams <dan.j.williams@intel.com> wrote:
>
> > On Wed, Feb 2, 2022 at 1:45 AM Jonathan Cameron
> > <Jonathan.Cameron@huawei.com> wrote:
> > >
> > > On Tue, 1 Feb 2022 15:57:10 -0800
> > > Dan Williams <dan.j.williams@intel.com> wrote:
> > >
> > > > On Mon, Jan 31, 2022 at 10:41 AM Jonathan Cameron
> > > > <Jonathan.Cameron@huawei.com> wrote:
> > > > >
> > > > > On Sun, 23 Jan 2022 16:31:24 -0800
> > > > > Dan Williams <dan.j.williams@intel.com> wrote:
> > > > >
> > > > > > While CXL memory targets will have their own memory target node,
> > > > > > individual memory devices may be affinitized like other PCI devices.
> > > > > > Emit that attribute for memdevs.
> > > > > >
> > > > > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> > > > >
> > > > > Hmm. Is this just duplicating what we can get from
> > > > > the PCI device?  It feels a bit like overkill to have it here
> > > > > as well.
> > > >
> > > > Not all cxl_memdevs are associated with PCI devices.
> > >
> > > Platform devices have numa nodes too...
> >
> > So what's the harm in having a numa_node attribute local to the memdev?
> >
>
> I'm not really against, it just wanted to raise the question of
> whether we want these to go further than the granularity at which
> numa nodes can be assigned.

What is the "granularity at which numa nodes can be assigned"? It
sounds like you are referencing a standard / document, so maybe I
missed something. Certainly Proximity Domains != Linux NUMA nodes so
it's not ACPI.

>  Right now that at platform_device or
> PCI EP (from ACPI anyway).  Sure the value might come from higher
> up a hierarchy but at least in theory it can be assigned to
> individual devices.
>
> This is pushing that description beyond that point so is worth discussing.

To me, any device that presents a driver interface can declare its CPU
affinity with a numa_node leaf attribute. Once you start walking the
device tree to infer the node from parent information you also need to
be worried about whether the Linux device topology follows the NUMA
topology. The leaf attribute removes that ambiguity.
Jonathan Cameron Feb. 3, 2022, 6:05 p.m. UTC | #13
On Thu, 3 Feb 2022 08:59:44 -0800
Dan Williams <dan.j.williams@intel.com> wrote:

> On Thu, Feb 3, 2022 at 1:41 AM Jonathan Cameron
> <Jonathan.Cameron@huawei.com> wrote:
> >
> > On Wed, 2 Feb 2022 07:44:37 -0800
> > Dan Williams <dan.j.williams@intel.com> wrote:
> >  
> > > On Wed, Feb 2, 2022 at 1:45 AM Jonathan Cameron
> > > <Jonathan.Cameron@huawei.com> wrote:  
> > > >
> > > > On Tue, 1 Feb 2022 15:57:10 -0800
> > > > Dan Williams <dan.j.williams@intel.com> wrote:
> > > >  
> > > > > On Mon, Jan 31, 2022 at 10:41 AM Jonathan Cameron
> > > > > <Jonathan.Cameron@huawei.com> wrote:  
> > > > > >
> > > > > > On Sun, 23 Jan 2022 16:31:24 -0800
> > > > > > Dan Williams <dan.j.williams@intel.com> wrote:
> > > > > >  
> > > > > > > While CXL memory targets will have their own memory target node,
> > > > > > > individual memory devices may be affinitized like other PCI devices.
> > > > > > > Emit that attribute for memdevs.
> > > > > > >
> > > > > > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>  
> > > > > >
> > > > > > Hmm. Is this just duplicating what we can get from
> > > > > > the PCI device?  It feels a bit like overkill to have it here
> > > > > > as well.  
> > > > >
> > > > > Not all cxl_memdevs are associated with PCI devices.  
> > > >
> > > > Platform devices have numa nodes too...  
> > >
> > > So what's the harm in having a numa_node attribute local to the memdev?
> > >  
> >
> > I'm not really against, it just wanted to raise the question of
> > whether we want these to go further than the granularity at which
> > numa nodes can be assigned.  
> 
> What is the "granularity at which numa nodes can be assigned"? It
> sounds like you are referencing a standard / document, so maybe I
> missed something. Certainly Proximity Domains != Linux NUMA nodes so
> it's not ACPI.

Sure, it's the fusion of a number of possible sources, one of which
is ACPI. If there is a reason why it differs to the parent device
(which can be ACPI, or can just be from a bunch of other places which
I'm sure will keep growing) then it definitely makes sense to expose
it at that level. 

> 
> >  Right now that at platform_device or
> > PCI EP (from ACPI anyway).  Sure the value might come from higher
> > up a hierarchy but at least in theory it can be assigned to
> > individual devices.
> >
> > This is pushing that description beyond that point so is worth discussing.  
> 
> To me, any device that presents a driver interface can declare its CPU
> affinity with a numa_node leaf attribute. Once you start walking the
> device tree to infer the node from parent information you also need to
> be worried about whether the Linux device topology follows the NUMA
> topology. The leaf attribute removes that ambiguity.
I'll go with 'maybe'...

Either way I'm fine with this change, just wanted to bring attention to
the duplication as it wasn't totally clear to me it was a good idea.

FWIW

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Dan Williams Feb. 4, 2022, 4:25 a.m. UTC | #14
On Thu, Feb 3, 2022 at 10:15 AM Jonathan Cameron
<Jonathan.Cameron@huawei.com> wrote:
>
> On Thu, 3 Feb 2022 08:59:44 -0800
> Dan Williams <dan.j.williams@intel.com> wrote:
>
> > On Thu, Feb 3, 2022 at 1:41 AM Jonathan Cameron
> > <Jonathan.Cameron@huawei.com> wrote:
> > >
> > > On Wed, 2 Feb 2022 07:44:37 -0800
> > > Dan Williams <dan.j.williams@intel.com> wrote:
> > >
> > > > On Wed, Feb 2, 2022 at 1:45 AM Jonathan Cameron
> > > > <Jonathan.Cameron@huawei.com> wrote:
> > > > >
> > > > > On Tue, 1 Feb 2022 15:57:10 -0800
> > > > > Dan Williams <dan.j.williams@intel.com> wrote:
> > > > >
> > > > > > On Mon, Jan 31, 2022 at 10:41 AM Jonathan Cameron
> > > > > > <Jonathan.Cameron@huawei.com> wrote:
> > > > > > >
> > > > > > > On Sun, 23 Jan 2022 16:31:24 -0800
> > > > > > > Dan Williams <dan.j.williams@intel.com> wrote:
> > > > > > >
> > > > > > > > While CXL memory targets will have their own memory target node,
> > > > > > > > individual memory devices may be affinitized like other PCI devices.
> > > > > > > > Emit that attribute for memdevs.
> > > > > > > >
> > > > > > > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> > > > > > >
> > > > > > > Hmm. Is this just duplicating what we can get from
> > > > > > > the PCI device?  It feels a bit like overkill to have it here
> > > > > > > as well.
> > > > > >
> > > > > > Not all cxl_memdevs are associated with PCI devices.
> > > > >
> > > > > Platform devices have numa nodes too...
> > > >
> > > > So what's the harm in having a numa_node attribute local to the memdev?
> > > >
> > >
> > > I'm not really against, it just wanted to raise the question of
> > > whether we want these to go further than the granularity at which
> > > numa nodes can be assigned.
> >
> > What is the "granularity at which numa nodes can be assigned"? It
> > sounds like you are referencing a standard / document, so maybe I
> > missed something. Certainly Proximity Domains != Linux NUMA nodes so
> > it's not ACPI.
>
> Sure, it's the fusion of a number of possible sources, one of which
> is ACPI. If there is a reason why it differs to the parent device
> (which can be ACPI, or can just be from a bunch of other places which
> I'm sure will keep growing) then it definitely makes sense to expose
> it at that level.
>
> >
> > >  Right now that at platform_device or
> > > PCI EP (from ACPI anyway).  Sure the value might come from higher
> > > up a hierarchy but at least in theory it can be assigned to
> > > individual devices.
> > >
> > > This is pushing that description beyond that point so is worth discussing.
> >
> > To me, any device that presents a driver interface can declare its CPU
> > affinity with a numa_node leaf attribute. Once you start walking the
> > device tree to infer the node from parent information you also need to
> > be worried about whether the Linux device topology follows the NUMA
> > topology. The leaf attribute removes that ambiguity.
> I'll go with 'maybe'...
>
> Either way I'm fine with this change, just wanted to bring attention to
> the duplication as it wasn't totally clear to me it was a good idea.

If the bar to upstream something was when it was totally clear it was
a good idea... I'd have a lot less patches to send.

>
> FWIW
>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

Appreciate the discussion.

Patch

diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
index 87c0e5e65322..0b51cfec0c66 100644
--- a/Documentation/ABI/testing/sysfs-bus-cxl
+++ b/Documentation/ABI/testing/sysfs-bus-cxl
@@ -34,6 +34,15 @@  Description:
 		capability. Mandatory for CXL devices, see CXL 2.0 8.1.12.2
 		Memory Device PCIe Capabilities and Extended Capabilities.
 
+What:		/sys/bus/cxl/devices/memX/numa_node
+Date:		January, 2022
+KernelVersion:	v5.18
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RO) If NUMA is enabled and the platform has affinitized the
+		host PCI device for this memory device, emit the CPU node
+		affinity for this device.
+
 What:		/sys/bus/cxl/devices/*/devtype
 Date:		June, 2021
 KernelVersion:	v5.14
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index 1e574b052583..b2773664e407 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -99,11 +99,19 @@  static ssize_t serial_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(serial);
 
+static ssize_t numa_node_show(struct device *dev, struct device_attribute *attr,
+			      char *buf)
+{
+	return sprintf(buf, "%d\n", dev_to_node(dev));
+}
+static DEVICE_ATTR_RO(numa_node);
+
 static struct attribute *cxl_memdev_attributes[] = {
 	&dev_attr_serial.attr,
 	&dev_attr_firmware_version.attr,
 	&dev_attr_payload_max.attr,
 	&dev_attr_label_storage_size.attr,
+	&dev_attr_numa_node.attr,
 	NULL,
 };
 
@@ -117,8 +125,17 @@  static struct attribute *cxl_memdev_ram_attributes[] = {
 	NULL,
 };
 
+static umode_t cxl_memdev_visible(struct kobject *kobj, struct attribute *a,
+				  int n)
+{
+	if (!IS_ENABLED(CONFIG_NUMA) && a == &dev_attr_numa_node.attr)
+		return 0;
+	return a->mode;
+}
+
 static struct attribute_group cxl_memdev_attribute_group = {
 	.attrs = cxl_memdev_attributes,
+	.is_visible = cxl_memdev_visible,
 };
 
 static struct attribute_group cxl_memdev_ram_attribute_group = {
diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c
index 40ed567952e6..cd2f20f2707f 100644
--- a/tools/testing/cxl/test/cxl.c
+++ b/tools/testing/cxl/test/cxl.c
@@ -583,6 +583,7 @@  static __init int cxl_test_init(void)
 		if (!pdev)
 			goto err_mem;
 		pdev->dev.parent = &port->dev;
+		set_dev_node(&pdev->dev, i % 2);
 
 		rc = platform_device_add(pdev);
 		if (rc) {