diff mbox series

[09/26] cxl/region: Add Dynamic Capacity CXL region support

Message ID 20240324-dcd-type2-upstream-v1-9-b7b00d623625@intel.com (mailing list archive)
State New, archived
Headers show
Series DCD: Add support for Dynamic Capacity Devices (DCD) | expand

Commit Message

Ira Weiny March 24, 2024, 11:18 p.m. UTC
From: Navneet Singh <navneet.singh@intel.com>

CXL devices optionally support dynamic capacity.  CXL Regions must be
configured correctly to access this capacity.  Similar to ram and pmem
partitions, DC Regions, as they are called in CXL 3.1, represent
different partitions of the DPA space.

Introduce the concept of a sparse DAX region.  Add the create_dc_region
sysfs entry to create sparse DC DAX regions.  Special case DC capable
regions to create a 0 sized seed DAX device to maintain backwards
compatibility with older software which needs a default DAX device to
hold the region reference.

Flag sparse DAX regions to indicate 0 capacity available until such time
as DC capacity is added.

Interleaving is deferred in this series.  Add an early check.

Signed-off-by: Navneet Singh <navneet.singh@intel.com>
Co-developed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>

---
Changes for v1:
[djiang: mark sysfs entries to be in 6.10 kernel including date]
[djbw: change dax region typing to be 'sparse' rather than 'dynamic']
[iweiny: rebase changes to master instead of type2 patches]
---
 Documentation/ABI/testing/sysfs-bus-cxl | 22 +++++++++++-----------
 drivers/cxl/core/core.h                 |  1 +
 drivers/cxl/core/port.c                 |  1 +
 drivers/cxl/core/region.c               | 33 +++++++++++++++++++++++++++++++++
 drivers/dax/bus.c                       |  8 ++++++++
 drivers/dax/bus.h                       |  1 +
 drivers/dax/cxl.c                       | 15 +++++++++++++--
 7 files changed, 68 insertions(+), 13 deletions(-)

Comments

Fan Ni March 26, 2024, 10:31 p.m. UTC | #1
On Sun, Mar 24, 2024 at 04:18:12PM -0700, ira.weiny@intel.com wrote:
> From: Navneet Singh <navneet.singh@intel.com>
> 
> CXL devices optionally support dynamic capacity.  CXL Regions must be
> configured correctly to access this capacity.  Similar to ram and pmem
> partitions, DC Regions, as they are called in CXL 3.1, represent
> different partitions of the DPA space.
> 
> Introduce the concept of a sparse DAX region.  Add the create_dc_region
> sysfs entry to create sparse DC DAX regions.  Special case DC capable
> regions to create a 0 sized seed DAX device to maintain backwards
> compatibility with older software which needs a default DAX device to
> hold the region reference.
> 
> Flag sparse DAX regions to indicate 0 capacity available until such time
> as DC capacity is added.
> 
> Interleaving is deferred in this series.  Add an early check.
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> 
> ---
> Changes for v1:
> [djiang: mark sysfs entries to be in 6.10 kernel including date]
> [djbw: change dax region typing to be 'sparse' rather than 'dynamic']
> [iweiny: rebase changes to master instead of type2 patches]
> ---
>  Documentation/ABI/testing/sysfs-bus-cxl | 22 +++++++++++-----------
>  drivers/cxl/core/core.h                 |  1 +
>  drivers/cxl/core/port.c                 |  1 +
>  drivers/cxl/core/region.c               | 33 +++++++++++++++++++++++++++++++++
>  drivers/dax/bus.c                       |  8 ++++++++
>  drivers/dax/bus.h                       |  1 +
>  drivers/dax/cxl.c                       | 15 +++++++++++++--
>  7 files changed, 68 insertions(+), 13 deletions(-)
> 
> diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
> index 8a4f572c8498..f0cf52fff9fa 100644
> --- a/Documentation/ABI/testing/sysfs-bus-cxl
> +++ b/Documentation/ABI/testing/sysfs-bus-cxl
> @@ -411,20 +411,20 @@ Description:
>  		interleave_granularity).
>  
>  
> -What:		/sys/bus/cxl/devices/decoderX.Y/create_{pmem,ram}_region
> -Date:		May, 2022, January, 2023
> -KernelVersion:	v6.0 (pmem), v6.3 (ram)
> +What:		/sys/bus/cxl/devices/decoderX.Y/create_{pmem,ram,dc}_region
> +Date:		May, 2022, January, 2023, June 2024
> +KernelVersion:	v6.0 (pmem), v6.3 (ram), v6.10 (dc)
>  Contact:	linux-cxl@vger.kernel.org
>  Description:
>  		(RW) Write a string in the form 'regionZ' to start the process
> -		of defining a new persistent, or volatile memory region
> -		(interleave-set) within the decode range bounded by root decoder
> -		'decoderX.Y'. The value written must match the current value
> -		returned from reading this attribute. An atomic compare exchange
> -		operation is done on write to assign the requested id to a
> -		region and allocate the region-id for the next creation attempt.
> -		EBUSY is returned if the region name written does not match the
> -		current cached value.
> +		of defining a new persistent, volatile, or Dynamic Capacity
> +		(DC) memory region (interleave-set) within the decode range
> +		bounded by root decoder 'decoderX.Y'. The value written must
> +		match the current value returned from reading this attribute.
> +		An atomic compare exchange operation is done on write to assign
> +		the requested id to a region and allocate the region-id for the
> +		next creation attempt.  EBUSY is returned if the region name
> +		written does not match the current cached value.
>  
>  
>  What:		/sys/bus/cxl/devices/decoderX.Y/delete_region
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index 3b64fb1b9ed0..91abeffbe985 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -13,6 +13,7 @@ extern struct attribute_group cxl_base_attribute_group;
>  #ifdef CONFIG_CXL_REGION
>  extern struct device_attribute dev_attr_create_pmem_region;
>  extern struct device_attribute dev_attr_create_ram_region;
> +extern struct device_attribute dev_attr_create_dc_region;
>  extern struct device_attribute dev_attr_delete_region;
>  extern struct device_attribute dev_attr_region;
>  extern const struct device_type cxl_pmem_region_type;
> diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
> index 036b61cb3007..661177b575f7 100644
> --- a/drivers/cxl/core/port.c
> +++ b/drivers/cxl/core/port.c
> @@ -335,6 +335,7 @@ static struct attribute *cxl_decoder_root_attrs[] = {
>  	&dev_attr_qos_class.attr,
>  	SET_CXL_REGION_ATTR(create_pmem_region)
>  	SET_CXL_REGION_ATTR(create_ram_region)
> +	SET_CXL_REGION_ATTR(create_dc_region)
>  	SET_CXL_REGION_ATTR(delete_region)
>  	NULL,
>  };
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index ec3b8c6948e9..0d7b09a49dcf 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -2205,6 +2205,7 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
>  	switch (mode) {
>  	case CXL_REGION_RAM:
>  	case CXL_REGION_PMEM:
> +	case CXL_REGION_DC:
>  		break;
>  	default:
>  		dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %s\n",
> @@ -2314,6 +2315,32 @@ static ssize_t create_ram_region_store(struct device *dev,
>  }
>  DEVICE_ATTR_RW(create_ram_region);
>  
> +static ssize_t create_dc_region_show(struct device *dev,
> +				     struct device_attribute *attr, char *buf)
> +{
> +	return __create_region_show(to_cxl_root_decoder(dev), buf);
> +}
> +
> +static ssize_t create_dc_region_store(struct device *dev,
> +				      struct device_attribute *attr,
> +				      const char *buf, size_t len)
> +{
> +	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
> +	struct cxl_region *cxlr;
> +	int rc, id;
> +
> +	rc = sscanf(buf, "region%d\n", &id);
> +	if (rc != 1)
> +		return -EINVAL;
> +
> +	cxlr = __create_region(cxlrd, CXL_REGION_DC, id);
> +	if (IS_ERR(cxlr))
> +		return PTR_ERR(cxlr);
> +
> +	return len;
> +}
> +DEVICE_ATTR_RW(create_dc_region);

create_ram_region_store, create_pmem_region_store and
create_dc_region_store have mostly duplicate code, should we consider
extracting out as a helper function and pass region type for ram/pmem/dc region
store?

Fan 

> +
>  static ssize_t region_show(struct device *dev, struct device_attribute *attr,
>  			   char *buf)
>  {
> @@ -2759,6 +2786,11 @@ static int devm_cxl_add_dax_region(struct cxl_region *cxlr)
>  	struct device *dev;
>  	int rc;
>  
> +	if (cxlr->mode == CXL_REGION_DC && cxlr->params.interleave_ways != 1) {
> +		dev_err(&cxlr->dev, "Interleaving DC not supported\n");
> +		return -EINVAL;
> +	}
> +
>  	cxlr_dax = cxl_dax_region_alloc(cxlr);
>  	if (IS_ERR(cxlr_dax))
>  		return PTR_ERR(cxlr_dax);
> @@ -3040,6 +3072,7 @@ static int cxl_region_probe(struct device *dev)
>  	case CXL_REGION_PMEM:
>  		return devm_cxl_add_pmem_region(cxlr);
>  	case CXL_REGION_RAM:
> +	case CXL_REGION_DC:
>  		/*
>  		 * The region can not be manged by CXL if any portion of
>  		 * it is already online as 'System RAM'
> diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> index cb148f74ceda..903566aff5eb 100644
> --- a/drivers/dax/bus.c
> +++ b/drivers/dax/bus.c
> @@ -181,6 +181,11 @@ static bool is_static(struct dax_region *dax_region)
>  	return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0;
>  }
>  
> +static bool is_sparse(struct dax_region *dax_region)
> +{
> +	return (dax_region->res.flags & IORESOURCE_DAX_SPARSE_CAP) != 0;
> +}
> +
>  bool static_dev_dax(struct dev_dax *dev_dax)
>  {
>  	return is_static(dev_dax->region);
> @@ -304,6 +309,9 @@ static unsigned long long dax_region_avail_size(struct dax_region *dax_region)
>  
>  	WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem));
>  
> +	if (is_sparse(dax_region))
> +		return 0;
> +
>  	for_each_dax_region_resource(dax_region, res)
>  		size -= resource_size(res);
>  	return size;
> diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
> index cbbf64443098..783bfeef42cc 100644
> --- a/drivers/dax/bus.h
> +++ b/drivers/dax/bus.h
> @@ -13,6 +13,7 @@ struct dax_region;
>  /* dax bus specific ioresource flags */
>  #define IORESOURCE_DAX_STATIC BIT(0)
>  #define IORESOURCE_DAX_KMEM BIT(1)
> +#define IORESOURCE_DAX_SPARSE_CAP BIT(2)
>  
>  struct dax_region *alloc_dax_region(struct device *parent, int region_id,
>  		struct range *range, int target_node, unsigned int align,
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index c696837ab23c..415d03fbf9b6 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -13,19 +13,30 @@ static int cxl_dax_region_probe(struct device *dev)
>  	struct cxl_region *cxlr = cxlr_dax->cxlr;
>  	struct dax_region *dax_region;
>  	struct dev_dax_data data;
> +	resource_size_t dev_size;
> +	unsigned long flags;
>  
>  	if (nid == NUMA_NO_NODE)
>  		nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
>  
> +	flags = IORESOURCE_DAX_KMEM;
> +	if (cxlr->mode == CXL_REGION_DC)
> +		flags |= IORESOURCE_DAX_SPARSE_CAP;
> +
>  	dax_region = alloc_dax_region(dev, cxlr->id, &cxlr_dax->hpa_range, nid,
> -				      PMD_SIZE, IORESOURCE_DAX_KMEM);
> +				      PMD_SIZE, flags);
>  	if (!dax_region)
>  		return -ENOMEM;
>  
> +	dev_size = range_len(&cxlr_dax->hpa_range);
> +	/* Add empty seed dax device */
> +	if (cxlr->mode == CXL_REGION_DC)
> +		dev_size = 0;
> +
>  	data = (struct dev_dax_data) {
>  		.dax_region = dax_region,
>  		.id = -1,
> -		.size = range_len(&cxlr_dax->hpa_range),
> +		.size = dev_size,
>  		.memmap_on_memory = true,
>  	};
>  
> 
> -- 
> 2.44.0
>
Dave Jiang March 27, 2024, 5:27 p.m. UTC | #2
On 3/24/24 4:18 PM, ira.weiny@intel.com wrote:
> From: Navneet Singh <navneet.singh@intel.com>
> 
> CXL devices optionally support dynamic capacity.  CXL Regions must be
> configured correctly to access this capacity.  Similar to ram and pmem
> partitions, DC Regions, as they are called in CXL 3.1, represent
> different partitions of the DPA space.
> 
> Introduce the concept of a sparse DAX region.  Add the create_dc_region
> sysfs entry to create sparse DC DAX regions.  Special case DC capable
> regions to create a 0 sized seed DAX device to maintain backwards
> compatibility with older software which needs a default DAX device to
> hold the region reference.
> 
> Flag sparse DAX regions to indicate 0 capacity available until such time
> as DC capacity is added.
> 
> Interleaving is deferred in this series.  Add an early check.
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> 
> ---
> Changes for v1:
> [djiang: mark sysfs entries to be in 6.10 kernel including date]
> [djbw: change dax region typing to be 'sparse' rather than 'dynamic']
> [iweiny: rebase changes to master instead of type2 patches]
> ---
>  Documentation/ABI/testing/sysfs-bus-cxl | 22 +++++++++++-----------
>  drivers/cxl/core/core.h                 |  1 +
>  drivers/cxl/core/port.c                 |  1 +
>  drivers/cxl/core/region.c               | 33 +++++++++++++++++++++++++++++++++
>  drivers/dax/bus.c                       |  8 ++++++++
>  drivers/dax/bus.h                       |  1 +
>  drivers/dax/cxl.c                       | 15 +++++++++++++--
>  7 files changed, 68 insertions(+), 13 deletions(-)
> 
> diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
> index 8a4f572c8498..f0cf52fff9fa 100644
> --- a/Documentation/ABI/testing/sysfs-bus-cxl
> +++ b/Documentation/ABI/testing/sysfs-bus-cxl
> @@ -411,20 +411,20 @@ Description:
>  		interleave_granularity).
>  
>  
> -What:		/sys/bus/cxl/devices/decoderX.Y/create_{pmem,ram}_region
> -Date:		May, 2022, January, 2023
> -KernelVersion:	v6.0 (pmem), v6.3 (ram)
> +What:		/sys/bus/cxl/devices/decoderX.Y/create_{pmem,ram,dc}_region
> +Date:		May, 2022, January, 2023, June 2024
> +KernelVersion:	v6.0 (pmem), v6.3 (ram), v6.10 (dc)
>  Contact:	linux-cxl@vger.kernel.org
>  Description:
>  		(RW) Write a string in the form 'regionZ' to start the process
> -		of defining a new persistent, or volatile memory region
> -		(interleave-set) within the decode range bounded by root decoder
> -		'decoderX.Y'. The value written must match the current value
> -		returned from reading this attribute. An atomic compare exchange
> -		operation is done on write to assign the requested id to a
> -		region and allocate the region-id for the next creation attempt.
> -		EBUSY is returned if the region name written does not match the
> -		current cached value.
> +		of defining a new persistent, volatile, or Dynamic Capacity
> +		(DC) memory region (interleave-set) within the decode range
> +		bounded by root decoder 'decoderX.Y'. The value written must
> +		match the current value returned from reading this attribute.
> +		An atomic compare exchange operation is done on write to assign
> +		the requested id to a region and allocate the region-id for the
> +		next creation attempt.  EBUSY is returned if the region name

-EBUSY?

> +		written does not match the current cached value.
>  
>  
>  What:		/sys/bus/cxl/devices/decoderX.Y/delete_region
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index 3b64fb1b9ed0..91abeffbe985 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -13,6 +13,7 @@ extern struct attribute_group cxl_base_attribute_group;
>  #ifdef CONFIG_CXL_REGION
>  extern struct device_attribute dev_attr_create_pmem_region;
>  extern struct device_attribute dev_attr_create_ram_region;
> +extern struct device_attribute dev_attr_create_dc_region;
>  extern struct device_attribute dev_attr_delete_region;
>  extern struct device_attribute dev_attr_region;
>  extern const struct device_type cxl_pmem_region_type;
> diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
> index 036b61cb3007..661177b575f7 100644
> --- a/drivers/cxl/core/port.c
> +++ b/drivers/cxl/core/port.c
> @@ -335,6 +335,7 @@ static struct attribute *cxl_decoder_root_attrs[] = {
>  	&dev_attr_qos_class.attr,
>  	SET_CXL_REGION_ATTR(create_pmem_region)
>  	SET_CXL_REGION_ATTR(create_ram_region)
> +	SET_CXL_REGION_ATTR(create_dc_region)
>  	SET_CXL_REGION_ATTR(delete_region)
>  	NULL,
>  };
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index ec3b8c6948e9..0d7b09a49dcf 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -2205,6 +2205,7 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
>  	switch (mode) {
>  	case CXL_REGION_RAM:
>  	case CXL_REGION_PMEM:
> +	case CXL_REGION_DC:
>  		break;
>  	default:
>  		dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %s\n",
> @@ -2314,6 +2315,32 @@ static ssize_t create_ram_region_store(struct device *dev,
>  }
>  DEVICE_ATTR_RW(create_ram_region);
>  
> +static ssize_t create_dc_region_show(struct device *dev,
> +				     struct device_attribute *attr, char *buf)
> +{
> +	return __create_region_show(to_cxl_root_decoder(dev), buf);
> +}
> +
> +static ssize_t create_dc_region_store(struct device *dev,
> +				      struct device_attribute *attr,
> +				      const char *buf, size_t len)
> +{
> +	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
> +	struct cxl_region *cxlr;
> +	int rc, id;
> +
> +	rc = sscanf(buf, "region%d\n", &id);
> +	if (rc != 1)
> +		return -EINVAL;
> +
> +	cxlr = __create_region(cxlrd, CXL_REGION_DC, id);
> +	if (IS_ERR(cxlr))
> +		return PTR_ERR(cxlr);
> +
> +	return len;
> +}
> +DEVICE_ATTR_RW(create_dc_region);
> +
>  static ssize_t region_show(struct device *dev, struct device_attribute *attr,
>  			   char *buf)
>  {
> @@ -2759,6 +2786,11 @@ static int devm_cxl_add_dax_region(struct cxl_region *cxlr)
>  	struct device *dev;
>  	int rc;
>  
> +	if (cxlr->mode == CXL_REGION_DC && cxlr->params.interleave_ways != 1) {
> +		dev_err(&cxlr->dev, "Interleaving DC not supported\n");
> +		return -EINVAL;
> +	}
> +
>  	cxlr_dax = cxl_dax_region_alloc(cxlr);
>  	if (IS_ERR(cxlr_dax))
>  		return PTR_ERR(cxlr_dax);
> @@ -3040,6 +3072,7 @@ static int cxl_region_probe(struct device *dev)
>  	case CXL_REGION_PMEM:
>  		return devm_cxl_add_pmem_region(cxlr);
>  	case CXL_REGION_RAM:
> +	case CXL_REGION_DC:
>  		/*
>  		 * The region can not be manged by CXL if any portion of
>  		 * it is already online as 'System RAM'
> diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> index cb148f74ceda..903566aff5eb 100644
> --- a/drivers/dax/bus.c
> +++ b/drivers/dax/bus.c
> @@ -181,6 +181,11 @@ static bool is_static(struct dax_region *dax_region)
>  	return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0;
>  }
>  
> +static bool is_sparse(struct dax_region *dax_region)
> +{
> +	return (dax_region->res.flags & IORESOURCE_DAX_SPARSE_CAP) != 0;
> +}
> +
>  bool static_dev_dax(struct dev_dax *dev_dax)
>  {
>  	return is_static(dev_dax->region);
> @@ -304,6 +309,9 @@ static unsigned long long dax_region_avail_size(struct dax_region *dax_region)
>  
>  	WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem));
>  
> +	if (is_sparse(dax_region))
> +		return 0;
> +
>  	for_each_dax_region_resource(dax_region, res)
>  		size -= resource_size(res);
>  	return size;
> diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
> index cbbf64443098..783bfeef42cc 100644
> --- a/drivers/dax/bus.h
> +++ b/drivers/dax/bus.h
> @@ -13,6 +13,7 @@ struct dax_region;
>  /* dax bus specific ioresource flags */
>  #define IORESOURCE_DAX_STATIC BIT(0)
>  #define IORESOURCE_DAX_KMEM BIT(1)
> +#define IORESOURCE_DAX_SPARSE_CAP BIT(2)
>  
>  struct dax_region *alloc_dax_region(struct device *parent, int region_id,
>  		struct range *range, int target_node, unsigned int align,
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index c696837ab23c..415d03fbf9b6 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -13,19 +13,30 @@ static int cxl_dax_region_probe(struct device *dev)
>  	struct cxl_region *cxlr = cxlr_dax->cxlr;
>  	struct dax_region *dax_region;
>  	struct dev_dax_data data;
> +	resource_size_t dev_size;
> +	unsigned long flags;
>  
>  	if (nid == NUMA_NO_NODE)
>  		nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
>  
> +	flags = IORESOURCE_DAX_KMEM;
> +	if (cxlr->mode == CXL_REGION_DC)
> +		flags |= IORESOURCE_DAX_SPARSE_CAP;
> +
>  	dax_region = alloc_dax_region(dev, cxlr->id, &cxlr_dax->hpa_range, nid,
> -				      PMD_SIZE, IORESOURCE_DAX_KMEM);
> +				      PMD_SIZE, flags);
>  	if (!dax_region)
>  		return -ENOMEM;
>  
> +	dev_size = range_len(&cxlr_dax->hpa_range);
> +	/* Add empty seed dax device */
> +	if (cxlr->mode == CXL_REGION_DC)
> +		dev_size = 0;

Nit. Just do if/else so dev_size isn't set twice if mode is DC.

> +
>  	data = (struct dev_dax_data) {
>  		.dax_region = dax_region,
>  		.id = -1,
> -		.size = range_len(&cxlr_dax->hpa_range),
> +		.size = dev_size,
>  		.memmap_on_memory = true,
>  	};
>  
>
Jonathan Cameron April 4, 2024, 10:26 a.m. UTC | #3
On Sun, 24 Mar 2024 16:18:12 -0700
ira.weiny@intel.com wrote:

> From: Navneet Singh <navneet.singh@intel.com>
> 
> CXL devices optionally support dynamic capacity.  CXL Regions must be
> configured correctly to access this capacity.  Similar to ram and pmem
> partitions, DC Regions, as they are called in CXL 3.1, represent
> different partitions of the DPA space.
> 
> Introduce the concept of a sparse DAX region.  Add the create_dc_region
> sysfs entry to create sparse DC DAX regions.  Special case DC capable
> regions to create a 0 sized seed DAX device to maintain backwards
> compatibility with older software which needs a default DAX device to
> hold the region reference.
> 
> Flag sparse DAX regions to indicate 0 capacity available until such time
> as DC capacity is added.
> 
> Interleaving is deferred in this series.  Add an early check.
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
With the -EBUSY others addressed LGTM.  Fan's duplication comment
might be something to tidy up later.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Ira Weiny April 10, 2024, 4:25 a.m. UTC | #4
fan wrote:
> On Sun, Mar 24, 2024 at 04:18:12PM -0700, ira.weiny@intel.com wrote:
> > From: Navneet Singh <navneet.singh@intel.com>
> > 
> > CXL devices optionally support dynamic capacity.  CXL Regions must be
> > configured correctly to access this capacity.  Similar to ram and pmem
> > partitions, DC Regions, as they are called in CXL 3.1, represent
> > different partitions of the DPA space.
> > 
> > Introduce the concept of a sparse DAX region.  Add the create_dc_region
> > sysfs entry to create sparse DC DAX regions.  Special case DC capable
> > regions to create a 0 sized seed DAX device to maintain backwards
> > compatibility with older software which needs a default DAX device to
> > hold the region reference.
> > 
> > Flag sparse DAX regions to indicate 0 capacity available until such time
> > as DC capacity is added.
> > 
> > Interleaving is deferred in this series.  Add an early check.
> > 
> > Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> > Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> > Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> > 
> > ---
> > Changes for v1:
> > [djiang: mark sysfs entries to be in 6.10 kernel including date]
> > [djbw: change dax region typing to be 'sparse' rather than 'dynamic']
> > [iweiny: rebase changes to master instead of type2 patches]
> > ---
> >  Documentation/ABI/testing/sysfs-bus-cxl | 22 +++++++++++-----------
> >  drivers/cxl/core/core.h                 |  1 +
> >  drivers/cxl/core/port.c                 |  1 +
> >  drivers/cxl/core/region.c               | 33 +++++++++++++++++++++++++++++++++
> >  drivers/dax/bus.c                       |  8 ++++++++
> >  drivers/dax/bus.h                       |  1 +
> >  drivers/dax/cxl.c                       | 15 +++++++++++++--
> >  7 files changed, 68 insertions(+), 13 deletions(-)
> > 
> > diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
> > index 8a4f572c8498..f0cf52fff9fa 100644
> > --- a/Documentation/ABI/testing/sysfs-bus-cxl
> > +++ b/Documentation/ABI/testing/sysfs-bus-cxl
> > @@ -411,20 +411,20 @@ Description:
> >  		interleave_granularity).
> >  
> >  
> > -What:		/sys/bus/cxl/devices/decoderX.Y/create_{pmem,ram}_region
> > -Date:		May, 2022, January, 2023
> > -KernelVersion:	v6.0 (pmem), v6.3 (ram)
> > +What:		/sys/bus/cxl/devices/decoderX.Y/create_{pmem,ram,dc}_region
> > +Date:		May, 2022, January, 2023, June 2024
> > +KernelVersion:	v6.0 (pmem), v6.3 (ram), v6.10 (dc)
> >  Contact:	linux-cxl@vger.kernel.org
> >  Description:
> >  		(RW) Write a string in the form 'regionZ' to start the process
> > -		of defining a new persistent, or volatile memory region
> > -		(interleave-set) within the decode range bounded by root decoder
> > -		'decoderX.Y'. The value written must match the current value
> > -		returned from reading this attribute. An atomic compare exchange
> > -		operation is done on write to assign the requested id to a
> > -		region and allocate the region-id for the next creation attempt.
> > -		EBUSY is returned if the region name written does not match the
> > -		current cached value.
> > +		of defining a new persistent, volatile, or Dynamic Capacity
> > +		(DC) memory region (interleave-set) within the decode range
> > +		bounded by root decoder 'decoderX.Y'. The value written must
> > +		match the current value returned from reading this attribute.
> > +		An atomic compare exchange operation is done on write to assign
> > +		the requested id to a region and allocate the region-id for the
> > +		next creation attempt.  EBUSY is returned if the region name
> > +		written does not match the current cached value.
> >  
> >  
> >  What:		/sys/bus/cxl/devices/decoderX.Y/delete_region
> > diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> > index 3b64fb1b9ed0..91abeffbe985 100644
> > --- a/drivers/cxl/core/core.h
> > +++ b/drivers/cxl/core/core.h
> > @@ -13,6 +13,7 @@ extern struct attribute_group cxl_base_attribute_group;
> >  #ifdef CONFIG_CXL_REGION
> >  extern struct device_attribute dev_attr_create_pmem_region;
> >  extern struct device_attribute dev_attr_create_ram_region;
> > +extern struct device_attribute dev_attr_create_dc_region;
> >  extern struct device_attribute dev_attr_delete_region;
> >  extern struct device_attribute dev_attr_region;
> >  extern const struct device_type cxl_pmem_region_type;
> > diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
> > index 036b61cb3007..661177b575f7 100644
> > --- a/drivers/cxl/core/port.c
> > +++ b/drivers/cxl/core/port.c
> > @@ -335,6 +335,7 @@ static struct attribute *cxl_decoder_root_attrs[] = {
> >  	&dev_attr_qos_class.attr,
> >  	SET_CXL_REGION_ATTR(create_pmem_region)
> >  	SET_CXL_REGION_ATTR(create_ram_region)
> > +	SET_CXL_REGION_ATTR(create_dc_region)
> >  	SET_CXL_REGION_ATTR(delete_region)
> >  	NULL,
> >  };
> > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> > index ec3b8c6948e9..0d7b09a49dcf 100644
> > --- a/drivers/cxl/core/region.c
> > +++ b/drivers/cxl/core/region.c
> > @@ -2205,6 +2205,7 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
> >  	switch (mode) {
> >  	case CXL_REGION_RAM:
> >  	case CXL_REGION_PMEM:
> > +	case CXL_REGION_DC:
> >  		break;
> >  	default:
> >  		dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %s\n",
> > @@ -2314,6 +2315,32 @@ static ssize_t create_ram_region_store(struct device *dev,
> >  }
> >  DEVICE_ATTR_RW(create_ram_region);
> >  
> > +static ssize_t create_dc_region_show(struct device *dev,
> > +				     struct device_attribute *attr, char *buf)
> > +{
> > +	return __create_region_show(to_cxl_root_decoder(dev), buf);
> > +}
> > +
> > +static ssize_t create_dc_region_store(struct device *dev,
> > +				      struct device_attribute *attr,
> > +				      const char *buf, size_t len)
> > +{
> > +	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
> > +	struct cxl_region *cxlr;
> > +	int rc, id;
> > +
> > +	rc = sscanf(buf, "region%d\n", &id);
> > +	if (rc != 1)
> > +		return -EINVAL;
> > +
> > +	cxlr = __create_region(cxlrd, CXL_REGION_DC, id);
> > +	if (IS_ERR(cxlr))
> > +		return PTR_ERR(cxlr);
> > +
> > +	return len;
> > +}
> > +DEVICE_ATTR_RW(create_dc_region);
> 
> create_ram_region_store, create_pmem_region_store and
> create_dc_region_store have mostly duplicate code, should we consider
> extracting out as a helper function and pass region type for ram/pmem/dc region
> store?

That is mostly done with __create_region().  But the following is a nice cleanup.

I'll test it.

Ira

21:24:37 > git di
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index e8549af47da7..8a83a415fd0b 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -2638,9 +2638,8 @@ static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd,
        return devm_cxl_add_region(cxlrd, id, mode, CXL_DECODER_HOSTONLYMEM);
 }
 
-static ssize_t create_pmem_region_store(struct device *dev,
-                                       struct device_attribute *attr,
-                                       const char *buf, size_t len)
+static ssize_t create_region_store(struct device *dev, const char *buf,
+                                  size_t len, enum cxl_region_mode mode)
 {
        struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
        struct cxl_region *cxlr;
@@ -2650,31 +2649,26 @@ static ssize_t create_pmem_region_store(struct device *dev,
        if (rc != 1)
                return -EINVAL;
 
-       cxlr = __create_region(cxlrd, CXL_REGION_PMEM, id);
+       cxlr = __create_region(cxlrd, mode, id);
        if (IS_ERR(cxlr))
                return PTR_ERR(cxlr);
 
        return len;
 }
+
+static ssize_t create_pmem_region_store(struct device *dev,
+                                       struct device_attribute *attr,
+                                       const char *buf, size_t len)
+{
+       return create_region_store(dev, buf, len, CXL_REGION_PMEM);
+}
 DEVICE_ATTR_RW(create_pmem_region);
 
 static ssize_t create_ram_region_store(struct device *dev,
                                       struct device_attribute *attr,
                                       const char *buf, size_t len)
 {
-       struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
-       struct cxl_region *cxlr;
-       int rc, id;
-
-       rc = sscanf(buf, "region%d\n", &id);
-       if (rc != 1)
-               return -EINVAL;
-
-       cxlr = __create_region(cxlrd, CXL_REGION_RAM, id);
-       if (IS_ERR(cxlr))
-               return PTR_ERR(cxlr);
-
-       return len;
+       return create_region_store(dev, buf, len, CXL_REGION_RAM);
 }
 DEVICE_ATTR_RW(create_ram_region);
 
@@ -2688,19 +2682,7 @@ static ssize_t create_dc_region_store(struct device *dev,
                                      struct device_attribute *attr,
                                      const char *buf, size_t len)
 {
-       struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
-       struct cxl_region *cxlr;
-       int rc, id;
-
-       rc = sscanf(buf, "region%d\n", &id);
-       if (rc != 1)
-               return -EINVAL;
-
-       cxlr = __create_region(cxlrd, CXL_REGION_DC, id);
-       if (IS_ERR(cxlr))
-               return PTR_ERR(cxlr);
-
-       return len;
+       return create_region_store(dev, buf, len, CXL_REGION_DC);
 }
 DEVICE_ATTR_RW(create_dc_region);
Ira Weiny April 10, 2024, 4:35 a.m. UTC | #5
Dave Jiang wrote:
> 
> 
> On 3/24/24 4:18 PM, ira.weiny@intel.com wrote:
> > From: Navneet Singh <navneet.singh@intel.com>
> > 

[snip]

> >  
> > -What:		/sys/bus/cxl/devices/decoderX.Y/create_{pmem,ram}_region
> > -Date:		May, 2022, January, 2023
> > -KernelVersion:	v6.0 (pmem), v6.3 (ram)
> > +What:		/sys/bus/cxl/devices/decoderX.Y/create_{pmem,ram,dc}_region
> > +Date:		May, 2022, January, 2023, June 2024
> > +KernelVersion:	v6.0 (pmem), v6.3 (ram), v6.10 (dc)
> >  Contact:	linux-cxl@vger.kernel.org
> >  Description:
> >  		(RW) Write a string in the form 'regionZ' to start the process
> > -		of defining a new persistent, or volatile memory region
> > -		(interleave-set) within the decode range bounded by root decoder
> > -		'decoderX.Y'. The value written must match the current value
> > -		returned from reading this attribute. An atomic compare exchange
> > -		operation is done on write to assign the requested id to a
> > -		region and allocate the region-id for the next creation attempt.
> > -		EBUSY is returned if the region name written does not match the
> > -		current cached value.
> > +		of defining a new persistent, volatile, or Dynamic Capacity
> > +		(DC) memory region (interleave-set) within the decode range
> > +		bounded by root decoder 'decoderX.Y'. The value written must
> > +		match the current value returned from reading this attribute.
> > +		An atomic compare exchange operation is done on write to assign
> > +		the requested id to a region and allocate the region-id for the
> > +		next creation attempt.  EBUSY is returned if the region name
> 
> -EBUSY?
>

To match the other documentation I would say no.  The other docs show
ENXIO/EBUSY/EINVAL without the negative indicator.


[snip]

> > diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> > index c696837ab23c..415d03fbf9b6 100644
> > --- a/drivers/dax/cxl.c
> > +++ b/drivers/dax/cxl.c
> > @@ -13,19 +13,30 @@ static int cxl_dax_region_probe(struct device *dev)
> >  	struct cxl_region *cxlr = cxlr_dax->cxlr;
> >  	struct dax_region *dax_region;
> >  	struct dev_dax_data data;
> > +	resource_size_t dev_size;
> > +	unsigned long flags;
> >  
> >  	if (nid == NUMA_NO_NODE)
> >  		nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
> >  
> > +	flags = IORESOURCE_DAX_KMEM;
> > +	if (cxlr->mode == CXL_REGION_DC)
> > +		flags |= IORESOURCE_DAX_SPARSE_CAP;
> > +
> >  	dax_region = alloc_dax_region(dev, cxlr->id, &cxlr_dax->hpa_range, nid,
> > -				      PMD_SIZE, IORESOURCE_DAX_KMEM);
> > +				      PMD_SIZE, flags);
> >  	if (!dax_region)
> >  		return -ENOMEM;
> >  
> > +	dev_size = range_len(&cxlr_dax->hpa_range);
> > +	/* Add empty seed dax device */
> > +	if (cxlr->mode == CXL_REGION_DC)
> > +		dev_size = 0;
> 
> Nit. Just do if/else so dev_size isn't set twice if mode is DC.

Ok yea.

Ira
Ira Weiny April 10, 2024, 4:40 a.m. UTC | #6
Jonathan Cameron wrote:
> On Sun, 24 Mar 2024 16:18:12 -0700
> ira.weiny@intel.com wrote:
> 
> > From: Navneet Singh <navneet.singh@intel.com>
> > 
> > CXL devices optionally support dynamic capacity.  CXL Regions must be
> > configured correctly to access this capacity.  Similar to ram and pmem
> > partitions, DC Regions, as they are called in CXL 3.1, represent
> > different partitions of the DPA space.
> > 
> > Introduce the concept of a sparse DAX region.  Add the create_dc_region
> > sysfs entry to create sparse DC DAX regions.  Special case DC capable
> > regions to create a 0 sized seed DAX device to maintain backards
> > compatibility with older software which needs a default DAX device to
> > hold the region reference.
> > 
> > Flag sparse DAX regions to indicate 0 capacity available until such time
> > as DC capacity is added.
> > 
> > Interleaving is deferred in this series.  Add an early check.
> > 
> > Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> > Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> > Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> With the -EBUSY others addressed LGTM.

But the other docs do not have that notation.  Also the EBUSY was not changed
from the previous documentation.  Only the text for DC was added.

I'm inclined to leave this.

> Fan's duplication comment
> might be something to tidy up later.
> 
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

Thanks,
Ira
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
index 8a4f572c8498..f0cf52fff9fa 100644
--- a/Documentation/ABI/testing/sysfs-bus-cxl
+++ b/Documentation/ABI/testing/sysfs-bus-cxl
@@ -411,20 +411,20 @@  Description:
 		interleave_granularity).
 
 
-What:		/sys/bus/cxl/devices/decoderX.Y/create_{pmem,ram}_region
-Date:		May, 2022, January, 2023
-KernelVersion:	v6.0 (pmem), v6.3 (ram)
+What:		/sys/bus/cxl/devices/decoderX.Y/create_{pmem,ram,dc}_region
+Date:		May, 2022, January, 2023, June 2024
+KernelVersion:	v6.0 (pmem), v6.3 (ram), v6.10 (dc)
 Contact:	linux-cxl@vger.kernel.org
 Description:
 		(RW) Write a string in the form 'regionZ' to start the process
-		of defining a new persistent, or volatile memory region
-		(interleave-set) within the decode range bounded by root decoder
-		'decoderX.Y'. The value written must match the current value
-		returned from reading this attribute. An atomic compare exchange
-		operation is done on write to assign the requested id to a
-		region and allocate the region-id for the next creation attempt.
-		EBUSY is returned if the region name written does not match the
-		current cached value.
+		of defining a new persistent, volatile, or Dynamic Capacity
+		(DC) memory region (interleave-set) within the decode range
+		bounded by root decoder 'decoderX.Y'. The value written must
+		match the current value returned from reading this attribute.
+		An atomic compare exchange operation is done on write to assign
+		the requested id to a region and allocate the region-id for the
+		next creation attempt.  EBUSY is returned if the region name
+		written does not match the current cached value.
 
 
 What:		/sys/bus/cxl/devices/decoderX.Y/delete_region
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 3b64fb1b9ed0..91abeffbe985 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -13,6 +13,7 @@  extern struct attribute_group cxl_base_attribute_group;
 #ifdef CONFIG_CXL_REGION
 extern struct device_attribute dev_attr_create_pmem_region;
 extern struct device_attribute dev_attr_create_ram_region;
+extern struct device_attribute dev_attr_create_dc_region;
 extern struct device_attribute dev_attr_delete_region;
 extern struct device_attribute dev_attr_region;
 extern const struct device_type cxl_pmem_region_type;
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 036b61cb3007..661177b575f7 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -335,6 +335,7 @@  static struct attribute *cxl_decoder_root_attrs[] = {
 	&dev_attr_qos_class.attr,
 	SET_CXL_REGION_ATTR(create_pmem_region)
 	SET_CXL_REGION_ATTR(create_ram_region)
+	SET_CXL_REGION_ATTR(create_dc_region)
 	SET_CXL_REGION_ATTR(delete_region)
 	NULL,
 };
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index ec3b8c6948e9..0d7b09a49dcf 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -2205,6 +2205,7 @@  static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
 	switch (mode) {
 	case CXL_REGION_RAM:
 	case CXL_REGION_PMEM:
+	case CXL_REGION_DC:
 		break;
 	default:
 		dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %s\n",
@@ -2314,6 +2315,32 @@  static ssize_t create_ram_region_store(struct device *dev,
 }
 DEVICE_ATTR_RW(create_ram_region);
 
+static ssize_t create_dc_region_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	return __create_region_show(to_cxl_root_decoder(dev), buf);
+}
+
+static ssize_t create_dc_region_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t len)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+	struct cxl_region *cxlr;
+	int rc, id;
+
+	rc = sscanf(buf, "region%d\n", &id);
+	if (rc != 1)
+		return -EINVAL;
+
+	cxlr = __create_region(cxlrd, CXL_REGION_DC, id);
+	if (IS_ERR(cxlr))
+		return PTR_ERR(cxlr);
+
+	return len;
+}
+DEVICE_ATTR_RW(create_dc_region);
+
 static ssize_t region_show(struct device *dev, struct device_attribute *attr,
 			   char *buf)
 {
@@ -2759,6 +2786,11 @@  static int devm_cxl_add_dax_region(struct cxl_region *cxlr)
 	struct device *dev;
 	int rc;
 
+	if (cxlr->mode == CXL_REGION_DC && cxlr->params.interleave_ways != 1) {
+		dev_err(&cxlr->dev, "Interleaving DC not supported\n");
+		return -EINVAL;
+	}
+
 	cxlr_dax = cxl_dax_region_alloc(cxlr);
 	if (IS_ERR(cxlr_dax))
 		return PTR_ERR(cxlr_dax);
@@ -3040,6 +3072,7 @@  static int cxl_region_probe(struct device *dev)
 	case CXL_REGION_PMEM:
 		return devm_cxl_add_pmem_region(cxlr);
 	case CXL_REGION_RAM:
+	case CXL_REGION_DC:
 		/*
 		 * The region can not be manged by CXL if any portion of
 		 * it is already online as 'System RAM'
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index cb148f74ceda..903566aff5eb 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -181,6 +181,11 @@  static bool is_static(struct dax_region *dax_region)
 	return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0;
 }
 
+static bool is_sparse(struct dax_region *dax_region)
+{
+	return (dax_region->res.flags & IORESOURCE_DAX_SPARSE_CAP) != 0;
+}
+
 bool static_dev_dax(struct dev_dax *dev_dax)
 {
 	return is_static(dev_dax->region);
@@ -304,6 +309,9 @@  static unsigned long long dax_region_avail_size(struct dax_region *dax_region)
 
 	WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem));
 
+	if (is_sparse(dax_region))
+		return 0;
+
 	for_each_dax_region_resource(dax_region, res)
 		size -= resource_size(res);
 	return size;
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
index cbbf64443098..783bfeef42cc 100644
--- a/drivers/dax/bus.h
+++ b/drivers/dax/bus.h
@@ -13,6 +13,7 @@  struct dax_region;
 /* dax bus specific ioresource flags */
 #define IORESOURCE_DAX_STATIC BIT(0)
 #define IORESOURCE_DAX_KMEM BIT(1)
+#define IORESOURCE_DAX_SPARSE_CAP BIT(2)
 
 struct dax_region *alloc_dax_region(struct device *parent, int region_id,
 		struct range *range, int target_node, unsigned int align,
diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
index c696837ab23c..415d03fbf9b6 100644
--- a/drivers/dax/cxl.c
+++ b/drivers/dax/cxl.c
@@ -13,19 +13,30 @@  static int cxl_dax_region_probe(struct device *dev)
 	struct cxl_region *cxlr = cxlr_dax->cxlr;
 	struct dax_region *dax_region;
 	struct dev_dax_data data;
+	resource_size_t dev_size;
+	unsigned long flags;
 
 	if (nid == NUMA_NO_NODE)
 		nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
 
+	flags = IORESOURCE_DAX_KMEM;
+	if (cxlr->mode == CXL_REGION_DC)
+		flags |= IORESOURCE_DAX_SPARSE_CAP;
+
 	dax_region = alloc_dax_region(dev, cxlr->id, &cxlr_dax->hpa_range, nid,
-				      PMD_SIZE, IORESOURCE_DAX_KMEM);
+				      PMD_SIZE, flags);
 	if (!dax_region)
 		return -ENOMEM;
 
+	dev_size = range_len(&cxlr_dax->hpa_range);
+	/* Add empty seed dax device */
+	if (cxlr->mode == CXL_REGION_DC)
+		dev_size = 0;
+
 	data = (struct dev_dax_data) {
 		.dax_region = dax_region,
 		.id = -1,
-		.size = range_len(&cxlr_dax->hpa_range),
+		.size = dev_size,
 		.memmap_on_memory = true,
 	};