
[16/26] cxl/extent: Realize extent devices

Message ID 20240324-dcd-type2-upstream-v1-16-b7b00d623625@intel.com
State New, archived
Series DCD: Add support for Dynamic Capacity Devices (DCD)

Commit Message

Ira Weiny March 24, 2024, 11:18 p.m. UTC
From: Navneet Singh <navneet.singh@intel.com>

Once all extents of an interleave set are present, a region must
surface an extent to the region.

Without interleaving, endpoint decoder and region extents have a 1:1
relationship.  Future support for IW > 1 will maintain an N:1
relationship between the device extents and region extents.

Create a region extent device for every device extent found.  Release of
the extent device triggers a response to the underlying hardware extent.

There is no strong use case to support the addition of extents which
overlap previously accepted extent ranges.  Reject such new extents
until such time as a good use case emerges.

Expose the necessary details of region extents by creating the following
sysfs entries.

	/sys/bus/cxl/devices/dax_regionX/extentY
	/sys/bus/cxl/devices/dax_regionX/extentY/offset
	/sys/bus/cxl/devices/dax_regionX/extentY/length
	/sys/bus/cxl/devices/dax_regionX/extentY/label

The use of the extent devices by the DAX layer is deferred to later
patches.

Signed-off-by: Navneet Singh <navneet.singh@intel.com>
Co-developed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>

---
Changes for v1
[iweiny: new patch]
[iweiny: Rename 'dr_extent' to 'region_extent']
---
 drivers/cxl/core/Makefile |   1 +
 drivers/cxl/core/extent.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/cxl/core/mbox.c   |  43 +++++++++++++++
 drivers/cxl/core/region.c |  76 +++++++++++++++++++++++++-
 drivers/cxl/cxl.h         |  37 +++++++++++++
 tools/testing/cxl/Kbuild  |   1 +
 6 files changed, 290 insertions(+), 1 deletion(-)

Comments

Fan Ni March 27, 2024, 10:34 p.m. UTC | #1
On Sun, Mar 24, 2024 at 04:18:19PM -0700, ira.weiny@intel.com wrote:
> From: Navneet Singh <navneet.singh@intel.com>
> 
> Once all extents of an interleave set are present, a region must
> surface an extent to the region.
> 
> Without interleaving, endpoint decoder and region extents have a 1:1
> relationship.  Future support for IW > 1 will maintain an N:1
> relationship between the device extents and region extents.
> 
> Create a region extent device for every device extent found.  Release of
> the extent device triggers a response to the underlying hardware extent.
> 
> There is no strong use case to support the addition of extents which
> overlap previously accepted extent ranges.  Reject such new extents
> until such time as a good use case emerges.
> 
> Expose the necessary details of region extents by creating the following
> sysfs entries.
> 
> 	/sys/bus/cxl/devices/dax_regionX/extentY
> 	/sys/bus/cxl/devices/dax_regionX/extentY/offset
> 	/sys/bus/cxl/devices/dax_regionX/extentY/length
> 	/sys/bus/cxl/devices/dax_regionX/extentY/label
> 
> The use of the extent devices by the DAX layer is deferred to later
> patches.
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> 

Minor comments inline,

> ---
> Changes for v1
> [iweiny: new patch]
> [iweiny: Rename 'dr_extent' to 'region_extent']
> ---
>  drivers/cxl/core/Makefile |   1 +
>  drivers/cxl/core/extent.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/core/mbox.c   |  43 +++++++++++++++
>  drivers/cxl/core/region.c |  76 +++++++++++++++++++++++++-
>  drivers/cxl/cxl.h         |  37 +++++++++++++
>  tools/testing/cxl/Kbuild  |   1 +
>  6 files changed, 290 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
> index 9259bcc6773c..35c5c76bfcf1 100644
> --- a/drivers/cxl/core/Makefile
> +++ b/drivers/cxl/core/Makefile
> @@ -14,5 +14,6 @@ cxl_core-y += pci.o
>  cxl_core-y += hdm.o
>  cxl_core-y += pmu.o
>  cxl_core-y += cdat.o
> +cxl_core-y += extent.o
>  cxl_core-$(CONFIG_TRACING) += trace.o
>  cxl_core-$(CONFIG_CXL_REGION) += region.o
> diff --git a/drivers/cxl/core/extent.c b/drivers/cxl/core/extent.c
> new file mode 100644
> index 000000000000..487c220f1c3c
> --- /dev/null
> +++ b/drivers/cxl/core/extent.c
> @@ -0,0 +1,133 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
> +
> +#include <linux/device.h>
> +#include <linux/slab.h>
> +#include <cxl.h>
> +
> +static DEFINE_IDA(cxl_extent_ida);
> +
> +static ssize_t offset_show(struct device *dev, struct device_attribute *attr,
> +			 char *buf)
> +{
> +	struct region_extent *reg_ext = to_region_extent(dev);
> +
> +	return sysfs_emit(buf, "%pa\n", &reg_ext->hpa_range.start);
> +}
> +static DEVICE_ATTR_RO(offset);
> +
> +static ssize_t length_show(struct device *dev, struct device_attribute *attr,
> +			 char *buf)
> +{
> +	struct region_extent *reg_ext = to_region_extent(dev);
> +	u64 length = range_len(&reg_ext->hpa_range);
> +
> +	return sysfs_emit(buf, "%pa\n", &length);
> +}
> +static DEVICE_ATTR_RO(length);
> +
> +static ssize_t label_show(struct device *dev, struct device_attribute *attr,
> +			  char *buf)
> +{
> +	struct region_extent *reg_ext = to_region_extent(dev);
> +
> +	return sysfs_emit(buf, "%s\n", reg_ext->label);
> +}
> +static DEVICE_ATTR_RO(label);
> +
> +static struct attribute *region_extent_attrs[] = {
> +	&dev_attr_offset.attr,
> +	&dev_attr_length.attr,
> +	&dev_attr_label.attr,
> +	NULL,
> +};
> +
> +static const struct attribute_group region_extent_attribute_group = {
> +	.attrs = region_extent_attrs,
> +};
> +
> +static const struct attribute_group *region_extent_attribute_groups[] = {
> +	&region_extent_attribute_group,
> +	NULL,
> +};
> +
> +static void region_extent_release(struct device *dev)
> +{
> +	struct region_extent *reg_ext = to_region_extent(dev);
> +
> +	cxl_release_ed_extent(&reg_ext->ed_ext);
> +	ida_free(&cxl_extent_ida, reg_ext->dev.id);
> +	kfree(reg_ext);
> +}
> +
> +static const struct device_type region_extent_type = {
> +	.name = "extent",
> +	.release = region_extent_release,
> +	.groups = region_extent_attribute_groups,
> +};
> +
> +bool is_region_extent(struct device *dev)
> +{
> +	return dev->type == &region_extent_type;
> +}
> +EXPORT_SYMBOL_NS_GPL(is_region_extent, CXL);
> +
> +static void region_extent_unregister(void *ext)
> +{
> +	struct region_extent *reg_ext = ext;
> +
> +	dev_dbg(&reg_ext->dev, "DAX region rm extent HPA %#llx - %#llx\n",
> +		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
> +	device_unregister(&reg_ext->dev);
> +}
> +
> +int dax_region_create_ext(struct cxl_dax_region *cxlr_dax,
> +			  struct range *hpa_range,
> +			  const char *label,
> +			  struct range *dpa_range,
> +			  struct cxl_endpoint_decoder *cxled)
> +{
> +	struct region_extent *reg_ext;
> +	struct device *dev;
> +	int rc, id;
> +
> +	id = ida_alloc(&cxl_extent_ida, GFP_KERNEL);
> +	if (id < 0)
> +		return -ENOMEM;
> +
> +	reg_ext = kzalloc(sizeof(*reg_ext), GFP_KERNEL);
> +	if (!reg_ext)
> +		return -ENOMEM;
> +
> +	reg_ext->hpa_range = *hpa_range;
> +	reg_ext->ed_ext.dpa_range = *dpa_range;
> +	reg_ext->ed_ext.cxled = cxled;
> +	snprintf(reg_ext->label, DAX_EXTENT_LABEL_LEN, "%s", label);
> +
> +	dev = &reg_ext->dev;
> +	device_initialize(dev);
> +	dev->id = id;
> +	device_set_pm_not_required(dev);
> +	dev->parent = &cxlr_dax->dev;
> +	dev->type = &region_extent_type;
> +	rc = dev_set_name(dev, "extent%d", dev->id);
> +	if (rc)
> +		goto err;
> +
> +	rc = device_add(dev);
> +	if (rc)
> +		goto err;
> +
> +	dev_dbg(dev, "DAX region extent HPA %#llx - %#llx\n",
> +		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
> +
> +	return devm_add_action_or_reset(&cxlr_dax->dev, region_extent_unregister,
> +	reg_ext);
> +
> +err:
> +	dev_err(&cxlr_dax->dev, "Failed to initialize DAX extent dev HPA %#llx - %#llx\n",
> +		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
> +
> +	put_device(dev);
> +	return rc;
> +}
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 9e33a0976828..6b00e717e42b 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -1020,6 +1020,32 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds,
>  	return rc;
>  }
>  
> +static int cxl_send_dc_cap_response(struct cxl_memdev_state *mds,
> +				    struct range *extent, int opcode)
> +{
> +	struct cxl_mbox_cmd mbox_cmd;
> +	size_t size;
> +
> +	struct cxl_mbox_dc_response *dc_res __free(kfree);
> +	size = struct_size(dc_res, extent_list, 1);
> +	dc_res = kzalloc(size, GFP_KERNEL);
> +	if (!dc_res)
> +		return -ENOMEM;
> +
> +	dc_res->extent_list[0].dpa_start = cpu_to_le64(extent->start);
> +	memset(dc_res->extent_list[0].reserved, 0, 8);

The space has already been zeroed with kzalloc.
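 
A minimal sketch with the memset dropped (the reserved field is already
zero from kzalloc()):

	dc_res->extent_list[0].dpa_start = cpu_to_le64(extent->start);
	dc_res->extent_list[0].length = cpu_to_le64(range_len(extent));
	dc_res->extent_list_size = cpu_to_le32(1);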

Fan

> +	dc_res->extent_list[0].length = cpu_to_le64(range_len(extent));
> +	dc_res->extent_list_size = cpu_to_le32(1);
> +
> +	mbox_cmd = (struct cxl_mbox_cmd) {
> +		.opcode = opcode,
> +		.size_in = size,
> +		.payload_in = dc_res,
> +	};
> +
> +	return cxl_internal_send_cmd(mds, &mbox_cmd);
> +}
> +
>  static struct cxl_memdev_state *
>  cxled_to_mds(struct cxl_endpoint_decoder *cxled)
>  {
> @@ -1029,6 +1055,23 @@ cxled_to_mds(struct cxl_endpoint_decoder *cxled)
>  	return container_of(cxlds, struct cxl_memdev_state, cxlds);
>  }
>  
> +void cxl_release_ed_extent(struct cxl_ed_extent *extent)
> +{
> +	struct cxl_endpoint_decoder *cxled = extent->cxled;
> +	struct cxl_memdev_state *mds = cxled_to_mds(cxled);
> +	struct device *dev = mds->cxlds.dev;
> +	int rc;
> +
> +	dev_dbg(dev, "Releasing DC extent DPA %#llx - %#llx\n",
> +		extent->dpa_range.start, extent->dpa_range.end);
> +
> +	rc = cxl_send_dc_cap_response(mds, &extent->dpa_range, CXL_MBOX_OP_RELEASE_DC);
> +	if (rc)
> +		dev_dbg(dev, "Failed to respond releasing extent DPA %#llx - %#llx; %d\n",
> +			extent->dpa_range.start, extent->dpa_range.end, rc);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_release_ed_extent, CXL);
> +
>  static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
>  				    enum cxl_event_log_type type)
>  {
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 3e563ab29afe..7635ff109578 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -1450,11 +1450,81 @@ static int cxl_region_validate_position(struct cxl_region *cxlr,
>  	return 0;
>  }
>  
> +static int extent_check_overlap(struct device *dev, void *arg)
> +{
> +	struct range *new_range = arg;
> +	struct region_extent *ext;
> +
> +	if (!is_region_extent(dev))
> +		return 0;
> +
> +	ext = to_region_extent(dev);
> +	return range_overlaps(&ext->hpa_range, new_range);
> +}
> +
> +static int extent_overlaps(struct cxl_dax_region *cxlr_dax,
> +			   struct range *hpa_range)
> +{
> +	struct device *dev __free(put_device) =
> +		device_find_child(&cxlr_dax->dev, hpa_range, extent_check_overlap);
> +
> +	if (dev)
> +		return -EINVAL;
> +	return 0;
> +}
> +
>  /* Callers are expected to ensure cxled has been attached to a region */
>  int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled,
>  			  struct cxl_dc_extent *dc_extent)
>  {
> -	return 0;
> +	struct cxl_region *cxlr = cxled->cxld.region;
> +	struct range ext_dpa_range, ext_hpa_range;
> +	struct device *dev = &cxlr->dev;
> +	resource_size_t dpa_offset, hpa;
> +
> +	/*
> +	 * Interleave ways == 1 means this corresponds to a 1:1 mapping between
> +	 * device extents and DAX region extents.  Future implementations
> +	 * should hold DC region extents here until the full dax region extent
> +	 * can be realized.
> +	 */
> +	if (cxlr->params.interleave_ways != 1) {
> +		dev_err(dev, "Interleaving DC not supported\n");
> +		return -EINVAL;
> +	}
> +
> +	ext_dpa_range = (struct range) {
> +		.start = le64_to_cpu(dc_extent->start_dpa),
> +		.end = le64_to_cpu(dc_extent->start_dpa) +
> +			le64_to_cpu(dc_extent->length) - 1,
> +	};
> +
> +	dev_dbg(dev, "Adding DC extent DPA %#llx - %#llx\n",
> +		ext_dpa_range.start, ext_dpa_range.end);
> +
> +	/*
> +	 * Without interleave...
> +	 * HPA offset == DPA offset
> +	 * ... but do the math anyway
> +	 */
> +	dpa_offset = ext_dpa_range.start - cxled->dpa_res->start;
> +	hpa = cxled->cxld.hpa_range.start + dpa_offset;
> +
> +	ext_hpa_range = (struct range) {
> +		.start = hpa - cxlr->cxlr_dax->hpa_range.start,
> +		.end = ext_hpa_range.start + range_len(&ext_dpa_range) - 1,
> +	};
> +
> +	if (extent_overlaps(cxlr->cxlr_dax, &ext_hpa_range))
> +		return -EINVAL;
> +
> +	dev_dbg(dev, "Realizing region extent at HPA %#llx - %#llx\n",
> +		ext_hpa_range.start, ext_hpa_range.end);
> +
> +	return dax_region_create_ext(cxlr->cxlr_dax, &ext_hpa_range,
> +				     (char *)dc_extent->tag,
> +				     &ext_dpa_range,
> +				     cxled);
>  }
>  
>  static int cxl_region_attach_position(struct cxl_region *cxlr,
> @@ -2684,6 +2754,7 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
>  
>  	dev = &cxlr_dax->dev;
>  	cxlr_dax->cxlr = cxlr;
> +	cxlr->cxlr_dax = cxlr_dax;
>  	device_initialize(dev);
>  	lockdep_set_class(&dev->mutex, &cxl_dax_region_key);
>  	device_set_pm_not_required(dev);
> @@ -2799,7 +2870,10 @@ static int cxl_region_read_extents(struct cxl_region *cxlr)
>  static void cxlr_dax_unregister(void *_cxlr_dax)
>  {
>  	struct cxl_dax_region *cxlr_dax = _cxlr_dax;
> +	struct cxl_region *cxlr = cxlr_dax->cxlr;
>  
> +	cxlr->cxlr_dax = NULL;
> +	cxlr_dax->cxlr = NULL;
>  	device_unregister(&cxlr_dax->dev);
>  }
>  
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index d585f5fdd3ae..5379ad7f5852 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -564,6 +564,7 @@ struct cxl_region_params {
>   * @type: Endpoint decoder target type
>   * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown
>   * @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge
> + * @cxlr_dax: (for DC regions) cached copy of CXL DAX bridge
>   * @flags: Region state flags
>   * @params: active + config params for the region
>   */
> @@ -574,6 +575,7 @@ struct cxl_region {
>  	enum cxl_decoder_type type;
>  	struct cxl_nvdimm_bridge *cxl_nvb;
>  	struct cxl_pmem_region *cxlr_pmem;
> +	struct cxl_dax_region *cxlr_dax;
>  	unsigned long flags;
>  	struct cxl_region_params params;
>  };
> @@ -617,6 +619,41 @@ struct cxl_dax_region {
>  	struct range hpa_range;
>  };
>  
> +/**
> + * struct cxl_ed_extent - Extent within an endpoint decoder
> + * @dpa_range: DPA range this extent covers within the decoder
> + * @cxled: reference to the endpoint decoder
> + */
> +struct cxl_ed_extent {
> +	struct range dpa_range;
> +	struct cxl_endpoint_decoder *cxled;
> +};
> +void cxl_release_ed_extent(struct cxl_ed_extent *extent);
> +
> +/**
> + * struct region_extent - CXL DAX region extent
> + * @dev: device representing this extent
> + * @hpa_range: HPA range of this extent
> + * @label: label of the extent
> + * @ed_ext: Endpoint decoder extent which backs this extent
> + */
> +#define DAX_EXTENT_LABEL_LEN 64
> +struct region_extent {
> +	struct device dev;
> +	struct range hpa_range;
> +	char label[DAX_EXTENT_LABEL_LEN];
> +	struct cxl_ed_extent ed_ext;
> +};
> +
> +int dax_region_create_ext(struct cxl_dax_region *cxlr_dax,
> +			  struct range *hpa_range,
> +			  const char *label,
> +			  struct range *dpa_range,
> +			  struct cxl_endpoint_decoder *cxled);
> +
> +bool is_region_extent(struct device *dev);
> +#define to_region_extent(dev) container_of(dev, struct region_extent, dev)
> +
>  /**
>   * struct cxl_port - logical collection of upstream port devices and
>   *		     downstream port devices to construct a CXL memory
> diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
> index 030b388800f0..dc0cc1d5e6a0 100644
> --- a/tools/testing/cxl/Kbuild
> +++ b/tools/testing/cxl/Kbuild
> @@ -60,6 +60,7 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o
>  cxl_core-y += $(CXL_CORE_SRC)/hdm.o
>  cxl_core-y += $(CXL_CORE_SRC)/pmu.o
>  cxl_core-y += $(CXL_CORE_SRC)/cdat.o
> +cxl_core-y += $(CXL_CORE_SRC)/extent.o
>  cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
>  cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
>  cxl_core-y += config_check.o
> 
> -- 
> 2.44.0
>
Dave Jiang March 28, 2024, 9:11 p.m. UTC | #2
On 3/24/24 4:18 PM, ira.weiny@intel.com wrote:
> From: Navneet Singh <navneet.singh@intel.com>
> 
> Once all extents of an interleave set are present, a region must
> surface an extent to the region.
> 
> Without interleaving, endpoint decoder and region extents have a 1:1
> relationship.  Future support for IW > 1 will maintain an N:1
> relationship between the device extents and region extents.
> 
> Create a region extent device for every device extent found.  Release of
> the extent device triggers a response to the underlying hardware extent.
> 
> There is no strong use case to support the addition of extents which
> overlap previously accepted extent ranges.  Reject such new extents
> until such time as a good use case emerges.
> 
> Expose the necessary details of region extents by creating the following
> sysfs entries.
> 
> 	/sys/bus/cxl/devices/dax_regionX/extentY
> 	/sys/bus/cxl/devices/dax_regionX/extentY/offset
> 	/sys/bus/cxl/devices/dax_regionX/extentY/length
> 	/sys/bus/cxl/devices/dax_regionX/extentY/label
> 
> The use of the extent devices by the DAX layer is deferred to later
> patches.
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> 
> ---
> Changes for v1
> [iweiny: new patch]
> [iweiny: Rename 'dr_extent' to 'region_extent']
> ---
>  drivers/cxl/core/Makefile |   1 +
>  drivers/cxl/core/extent.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/core/mbox.c   |  43 +++++++++++++++
>  drivers/cxl/core/region.c |  76 +++++++++++++++++++++++++-
>  drivers/cxl/cxl.h         |  37 +++++++++++++
>  tools/testing/cxl/Kbuild  |   1 +
>  6 files changed, 290 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
> index 9259bcc6773c..35c5c76bfcf1 100644
> --- a/drivers/cxl/core/Makefile
> +++ b/drivers/cxl/core/Makefile
> @@ -14,5 +14,6 @@ cxl_core-y += pci.o
>  cxl_core-y += hdm.o
>  cxl_core-y += pmu.o
>  cxl_core-y += cdat.o
> +cxl_core-y += extent.o
>  cxl_core-$(CONFIG_TRACING) += trace.o
>  cxl_core-$(CONFIG_CXL_REGION) += region.o
> diff --git a/drivers/cxl/core/extent.c b/drivers/cxl/core/extent.c
> new file mode 100644
> index 000000000000..487c220f1c3c
> --- /dev/null
> +++ b/drivers/cxl/core/extent.c
> @@ -0,0 +1,133 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
> +
> +#include <linux/device.h>
> +#include <linux/slab.h>
> +#include <cxl.h>
> +
> +static DEFINE_IDA(cxl_extent_ida);

According to Documentation/core-api/idr.rst, the IDR interface is deprecated and xarray usage is preferred.

> +
> +static ssize_t offset_show(struct device *dev, struct device_attribute *attr,
> +			 char *buf)

Parameter alignment is a bit off here, and in some of the other functions as well.

> +{
> +	struct region_extent *reg_ext = to_region_extent(dev);
> +
> +	return sysfs_emit(buf, "%pa\n", &reg_ext->hpa_range.start);
> +}
> +static DEVICE_ATTR_RO(offset);
> +
> +static ssize_t length_show(struct device *dev, struct device_attribute *attr,
> +			 char *buf)
> +{
> +	struct region_extent *reg_ext = to_region_extent(dev);
> +	u64 length = range_len(&reg_ext->hpa_range);
> +
> +	return sysfs_emit(buf, "%pa\n", &length);
> +}
> +static DEVICE_ATTR_RO(length);
> +
> +static ssize_t label_show(struct device *dev, struct device_attribute *attr,
> +			  char *buf)
> +{
> +	struct region_extent *reg_ext = to_region_extent(dev);
> +
> +	return sysfs_emit(buf, "%s\n", reg_ext->label);
> +}
> +static DEVICE_ATTR_RO(label);
> +
> +static struct attribute *region_extent_attrs[] = {
> +	&dev_attr_offset.attr,
> +	&dev_attr_length.attr,
> +	&dev_attr_label.attr,
> +	NULL,
> +};
> +
> +static const struct attribute_group region_extent_attribute_group = {
> +	.attrs = region_extent_attrs,
> +};
> +
> +static const struct attribute_group *region_extent_attribute_groups[] = {
> +	&region_extent_attribute_group,
> +	NULL,
> +};
> +
> +static void region_extent_release(struct device *dev)
> +{
> +	struct region_extent *reg_ext = to_region_extent(dev);
> +
> +	cxl_release_ed_extent(&reg_ext->ed_ext);
> +	ida_free(&cxl_extent_ida, reg_ext->dev.id);
> +	kfree(reg_ext);
> +}
> +
> +static const struct device_type region_extent_type = {
> +	.name = "extent",
> +	.release = region_extent_release,
> +	.groups = region_extent_attribute_groups,
> +};
> +
> +bool is_region_extent(struct device *dev)
> +{
> +	return dev->type == &region_extent_type;
> +}
> +EXPORT_SYMBOL_NS_GPL(is_region_extent, CXL);
> +
> +static void region_extent_unregister(void *ext)
> +{
> +	struct region_extent *reg_ext = ext;
> +
> +	dev_dbg(&reg_ext->dev, "DAX region rm extent HPA %#llx - %#llx\n",
> +		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
> +	device_unregister(&reg_ext->dev);
> +}
> +
> +int dax_region_create_ext(struct cxl_dax_region *cxlr_dax,
> +			  struct range *hpa_range,
> +			  const char *label,
> +			  struct range *dpa_range,
> +			  struct cxl_endpoint_decoder *cxled)
> +{
> +	struct region_extent *reg_ext;
> +	struct device *dev;
> +	int rc, id;
> +
> +	id = ida_alloc(&cxl_extent_ida, GFP_KERNEL);
> +	if (id < 0)
> +		return -ENOMEM;
> +
> +	reg_ext = kzalloc(sizeof(*reg_ext), GFP_KERNEL);
> +	if (!reg_ext)
> +		return -ENOMEM;
> +
> +	reg_ext->hpa_range = *hpa_range;
> +	reg_ext->ed_ext.dpa_range = *dpa_range;
> +	reg_ext->ed_ext.cxled = cxled;
> +	snprintf(reg_ext->label, DAX_EXTENT_LABEL_LEN, "%s", label);
> +
> +	dev = &reg_ext->dev;
> +	device_initialize(dev);
> +	dev->id = id;
> +	device_set_pm_not_required(dev);
> +	dev->parent = &cxlr_dax->dev;
> +	dev->type = &region_extent_type;
> +	rc = dev_set_name(dev, "extent%d", dev->id);
> +	if (rc)
> +		goto err;
> +
> +	rc = device_add(dev);
> +	if (rc)
> +		goto err;
> +
> +	dev_dbg(dev, "DAX region extent HPA %#llx - %#llx\n",
> +		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
> +
> +	return devm_add_action_or_reset(&cxlr_dax->dev, region_extent_unregister,
> +	reg_ext);
> +
> +err:
> +	dev_err(&cxlr_dax->dev, "Failed to initialize DAX extent dev HPA %#llx - %#llx\n",
> +		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
> +
> +	put_device(dev);
> +	return rc;
> +}
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 9e33a0976828..6b00e717e42b 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -1020,6 +1020,32 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds,
>  	return rc;
>  }
>  
> +static int cxl_send_dc_cap_response(struct cxl_memdev_state *mds,
> +				    struct range *extent, int opcode)
> +{
> +	struct cxl_mbox_cmd mbox_cmd;
> +	size_t size;
> +
> +	struct cxl_mbox_dc_response *dc_res __free(kfree);
> +	size = struct_size(dc_res, extent_list, 1);
> +	dc_res = kzalloc(size, GFP_KERNEL);
> +	if (!dc_res)
> +		return -ENOMEM;
> +
> +	dc_res->extent_list[0].dpa_start = cpu_to_le64(extent->start);
> +	memset(dc_res->extent_list[0].reserved, 0, 8);

Not needed. kzalloc already zeroed.

DJ

> +	dc_res->extent_list[0].length = cpu_to_le64(range_len(extent));
> +	dc_res->extent_list_size = cpu_to_le32(1);
> +
> +	mbox_cmd = (struct cxl_mbox_cmd) {
> +		.opcode = opcode,
> +		.size_in = size,
> +		.payload_in = dc_res,
> +	};
> +
> +	return cxl_internal_send_cmd(mds, &mbox_cmd);
> +}
> +
>  static struct cxl_memdev_state *
>  cxled_to_mds(struct cxl_endpoint_decoder *cxled)
>  {
> @@ -1029,6 +1055,23 @@ cxled_to_mds(struct cxl_endpoint_decoder *cxled)
>  	return container_of(cxlds, struct cxl_memdev_state, cxlds);
>  }
>  
> +void cxl_release_ed_extent(struct cxl_ed_extent *extent)
> +{
> +	struct cxl_endpoint_decoder *cxled = extent->cxled;
> +	struct cxl_memdev_state *mds = cxled_to_mds(cxled);
> +	struct device *dev = mds->cxlds.dev;
> +	int rc;
> +
> +	dev_dbg(dev, "Releasing DC extent DPA %#llx - %#llx\n",
> +		extent->dpa_range.start, extent->dpa_range.end);
> +
> +	rc = cxl_send_dc_cap_response(mds, &extent->dpa_range, CXL_MBOX_OP_RELEASE_DC);
> +	if (rc)
> +		dev_dbg(dev, "Failed to respond releasing extent DPA %#llx - %#llx; %d\n",
> +			extent->dpa_range.start, extent->dpa_range.end, rc);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_release_ed_extent, CXL);
> +
>  static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
>  				    enum cxl_event_log_type type)
>  {
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 3e563ab29afe..7635ff109578 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -1450,11 +1450,81 @@ static int cxl_region_validate_position(struct cxl_region *cxlr,
>  	return 0;
>  }
>  
> +static int extent_check_overlap(struct device *dev, void *arg)
> +{
> +	struct range *new_range = arg;
> +	struct region_extent *ext;
> +
> +	if (!is_region_extent(dev))
> +		return 0;
> +
> +	ext = to_region_extent(dev);
> +	return range_overlaps(&ext->hpa_range, new_range);
> +}
> +
> +static int extent_overlaps(struct cxl_dax_region *cxlr_dax,
> +			   struct range *hpa_range)
> +{
> +	struct device *dev __free(put_device) =
> +		device_find_child(&cxlr_dax->dev, hpa_range, extent_check_overlap);
> +
> +	if (dev)
> +		return -EINVAL;
> +	return 0;
> +}
> +
>  /* Callers are expected to ensure cxled has been attached to a region */
>  int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled,
>  			  struct cxl_dc_extent *dc_extent)
>  {
> -	return 0;
> +	struct cxl_region *cxlr = cxled->cxld.region;
> +	struct range ext_dpa_range, ext_hpa_range;
> +	struct device *dev = &cxlr->dev;
> +	resource_size_t dpa_offset, hpa;
> +
> +	/*
> +	 * Interleave ways == 1 means this corresponds to a 1:1 mapping between
> +	 * device extents and DAX region extents.  Future implementations
> +	 * should hold DC region extents here until the full dax region extent
> +	 * can be realized.
> +	 */
> +	if (cxlr->params.interleave_ways != 1) {
> +		dev_err(dev, "Interleaving DC not supported\n");
> +		return -EINVAL;
> +	}
> +
> +	ext_dpa_range = (struct range) {
> +		.start = le64_to_cpu(dc_extent->start_dpa),
> +		.end = le64_to_cpu(dc_extent->start_dpa) +
> +			le64_to_cpu(dc_extent->length) - 1,
> +	};
> +
> +	dev_dbg(dev, "Adding DC extent DPA %#llx - %#llx\n",
> +		ext_dpa_range.start, ext_dpa_range.end);
> +
> +	/*
> +	 * Without interleave...
> +	 * HPA offset == DPA offset
> +	 * ... but do the math anyway
> +	 */
> +	dpa_offset = ext_dpa_range.start - cxled->dpa_res->start;
> +	hpa = cxled->cxld.hpa_range.start + dpa_offset;
> +
> +	ext_hpa_range = (struct range) {
> +		.start = hpa - cxlr->cxlr_dax->hpa_range.start,
> +		.end = ext_hpa_range.start + range_len(&ext_dpa_range) - 1,
> +	};
> +
> +	if (extent_overlaps(cxlr->cxlr_dax, &ext_hpa_range))
> +		return -EINVAL;
> +
> +	dev_dbg(dev, "Realizing region extent at HPA %#llx - %#llx\n",
> +		ext_hpa_range.start, ext_hpa_range.end);
> +
> +	return dax_region_create_ext(cxlr->cxlr_dax, &ext_hpa_range,
> +				     (char *)dc_extent->tag,
> +				     &ext_dpa_range,
> +				     cxled);
>  }
>  
>  static int cxl_region_attach_position(struct cxl_region *cxlr,
> @@ -2684,6 +2754,7 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
>  
>  	dev = &cxlr_dax->dev;
>  	cxlr_dax->cxlr = cxlr;
> +	cxlr->cxlr_dax = cxlr_dax;
>  	device_initialize(dev);
>  	lockdep_set_class(&dev->mutex, &cxl_dax_region_key);
>  	device_set_pm_not_required(dev);
> @@ -2799,7 +2870,10 @@ static int cxl_region_read_extents(struct cxl_region *cxlr)
>  static void cxlr_dax_unregister(void *_cxlr_dax)
>  {
>  	struct cxl_dax_region *cxlr_dax = _cxlr_dax;
> +	struct cxl_region *cxlr = cxlr_dax->cxlr;
>  
> +	cxlr->cxlr_dax = NULL;
> +	cxlr_dax->cxlr = NULL;
>  	device_unregister(&cxlr_dax->dev);
>  }
>  
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index d585f5fdd3ae..5379ad7f5852 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -564,6 +564,7 @@ struct cxl_region_params {
>   * @type: Endpoint decoder target type
>   * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown
>   * @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge
> + * @cxlr_dax: (for DC regions) cached copy of CXL DAX bridge
>   * @flags: Region state flags
>   * @params: active + config params for the region
>   */
> @@ -574,6 +575,7 @@ struct cxl_region {
>  	enum cxl_decoder_type type;
>  	struct cxl_nvdimm_bridge *cxl_nvb;
>  	struct cxl_pmem_region *cxlr_pmem;
> +	struct cxl_dax_region *cxlr_dax;
>  	unsigned long flags;
>  	struct cxl_region_params params;
>  };
> @@ -617,6 +619,41 @@ struct cxl_dax_region {
>  	struct range hpa_range;
>  };
>  
> +/**
> + * struct cxl_ed_extent - Extent within an endpoint decoder
> + * @dpa_range: DPA range this extent covers within the decoder
> + * @cxled: reference to the endpoint decoder
> + */
> +struct cxl_ed_extent {
> +	struct range dpa_range;
> +	struct cxl_endpoint_decoder *cxled;
> +};
> +void cxl_release_ed_extent(struct cxl_ed_extent *extent);
> +
> +/**
> + * struct region_extent - CXL DAX region extent
> + * @dev: device representing this extent
> + * @hpa_range: HPA range of this extent
> + * @label: label of the extent
> + * @ed_ext: Endpoint decoder extent which backs this extent
> + */
> +#define DAX_EXTENT_LABEL_LEN 64
> +struct region_extent {
> +	struct device dev;
> +	struct range hpa_range;
> +	char label[DAX_EXTENT_LABEL_LEN];
> +	struct cxl_ed_extent ed_ext;
> +};
> +
> +int dax_region_create_ext(struct cxl_dax_region *cxlr_dax,
> +			  struct range *hpa_range,
> +			  const char *label,
> +			  struct range *dpa_range,
> +			  struct cxl_endpoint_decoder *cxled);
> +
> +bool is_region_extent(struct device *dev);
> +#define to_region_extent(dev) container_of(dev, struct region_extent, dev)
> +
>  /**
>   * struct cxl_port - logical collection of upstream port devices and
>   *		     downstream port devices to construct a CXL memory
> diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
> index 030b388800f0..dc0cc1d5e6a0 100644
> --- a/tools/testing/cxl/Kbuild
> +++ b/tools/testing/cxl/Kbuild
> @@ -60,6 +60,7 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o
>  cxl_core-y += $(CXL_CORE_SRC)/hdm.o
>  cxl_core-y += $(CXL_CORE_SRC)/pmu.o
>  cxl_core-y += $(CXL_CORE_SRC)/cdat.o
> +cxl_core-y += $(CXL_CORE_SRC)/extent.o
>  cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
>  cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
>  cxl_core-y += config_check.o
>
Jonathan Cameron April 4, 2024, 4:32 p.m. UTC | #3
On Sun, 24 Mar 2024 16:18:19 -0700
ira.weiny@intel.com wrote:

> From: Navneet Singh <navneet.singh@intel.com>
> 
> Once all extents of an interleave set are present, a region must
> surface an extent to the region.
> 
> Without interleaving, endpoint decoder and region extents have a 1:1
> relationship.  Future support for IW > 1 will maintain an N:1
> relationship between the device extents and region extents.
> 
> Create a region extent device for every device extent found.  Release of
> the extent device triggers a response to the underlying hardware extent.
> 
> There is no strong use case to support the addition of extents which
> overlap previously accepted extent ranges.  Reject such new extents
> until such time as a good use case emerges.
> 
> Expose the necessary details of region extents by creating the following
> sysfs entries.
> 
> 	/sys/bus/cxl/devices/dax_regionX/extentY
> 	/sys/bus/cxl/devices/dax_regionX/extentY/offset
> 	/sys/bus/cxl/devices/dax_regionX/extentY/length
> 	/sys/bus/cxl/devices/dax_regionX/extentY/label

Docs?  The label in particular worries me a little as I'm not sure what
is in it.  If it's the tag, one possible format is a uuid (not a coincidence
that it is the same length) and interpreting that as characters isn't
going to get us far.  I wonder if we have to treat it as a binary attr
given we have no idea what it is.
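 
A rough sketch of what a binary attribute could look like (illustrative
only; reading the raw bytes out of @label is an assumption about where
the tag is kept):

	static ssize_t tag_read(struct file *file, struct kobject *kobj,
				struct bin_attribute *attr, char *buf,
				loff_t offset, size_t count)
	{
		struct region_extent *reg_ext = to_region_extent(kobj_to_dev(kobj));

		/* hand back raw bytes; make no assumption about their format */
		return memory_read_from_buffer(buf, count, &offset,
					       reg_ext->label,
					       sizeof(reg_ext->label));
	}
	static BIN_ATTR_RO(tag, DAX_EXTENT_LABEL_LEN);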

Otherwise a query inline that may well be answered in later patches.

> 
> The use of the extent devices by the DAX layer is deferred to later
> patches.
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> 



> +int dax_region_create_ext(struct cxl_dax_region *cxlr_dax,
> +			  struct range *hpa_range,
> +			  const char *label,
> +			  struct range *dpa_range,
> +			  struct cxl_endpoint_decoder *cxled)
> +{
> +	struct region_extent *reg_ext;
> +	struct device *dev;
> +	int rc, id;
> +
> +	id = ida_alloc(&cxl_extent_ida, GFP_KERNEL);
> +	if (id < 0)
> +		return -ENOMEM;

Whilst it doesn't matter hugely, it's nice if the release does things
in the opposite order of the creation. So perhaps move the ida_alloc
after the kzalloc of reg_ext?
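 
Something like this (sketch; the error unwind shown is illustrative):

	reg_ext = kzalloc(sizeof(*reg_ext), GFP_KERNEL);
	if (!reg_ext)
		return -ENOMEM;

	/* allocate the id last so release (ida_free then kfree) mirrors it */
	id = ida_alloc(&cxl_extent_ida, GFP_KERNEL);
	if (id < 0) {
		kfree(reg_ext);
		return -ENOMEM;
	}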

> +
> +	reg_ext = kzalloc(sizeof(*reg_ext), GFP_KERNEL);
> +	if (!reg_ext)
> +		return -ENOMEM;
> +
> +	reg_ext->hpa_range = *hpa_range;
> +	reg_ext->ed_ext.dpa_range = *dpa_range;
> +	reg_ext->ed_ext.cxled = cxled;
> +	snprintf(reg_ext->label, DAX_EXTENT_LABEL_LEN, "%s", label);
> +
> +	dev = &reg_ext->dev;
> +	device_initialize(dev);
> +	dev->id = id;
> +	device_set_pm_not_required(dev);
> +	dev->parent = &cxlr_dax->dev;
> +	dev->type = &region_extent_type;
> +	rc = dev_set_name(dev, "extent%d", dev->id);
> +	if (rc)
> +		goto err;
> +
> +	rc = device_add(dev);
> +	if (rc)
> +		goto err;
> +
> +	dev_dbg(dev, "DAX region extent HPA %#llx - %#llx\n",
> +		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
> +
> +	return devm_add_action_or_reset(&cxlr_dax->dev, region_extent_unregister,
> +	reg_ext);

Indent
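 
i.e.:

	return devm_add_action_or_reset(&cxlr_dax->dev,
					region_extent_unregister, reg_ext);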

> +
> +err:
> +	dev_err(&cxlr_dax->dev, "Failed to initialize DAX extent dev HPA %#llx - %#llx\n",
> +		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
> +
> +	put_device(dev);
> +	return rc;
> +}
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 9e33a0976828..6b00e717e42b 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -1020,6 +1020,32 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds,
>  	return rc;
>  }
>  
> +static int cxl_send_dc_cap_response(struct cxl_memdev_state *mds,
> +				    struct range *extent, int opcode)
> +{
> +	struct cxl_mbox_cmd mbox_cmd;
> +	size_t size;
> +
> +	struct cxl_mbox_dc_response *dc_res __free(kfree);
> +	size = struct_size(dc_res, extent_list, 1);
> +	dc_res = kzalloc(size, GFP_KERNEL);
> +	if (!dc_res)
> +		return -ENOMEM;
> +
> +	dc_res->extent_list[0].dpa_start = cpu_to_le64(extent->start);
> +	memset(dc_res->extent_list[0].reserved, 0, 8);
> +	dc_res->extent_list[0].length = cpu_to_le64(range_len(extent));
> +	dc_res->extent_list_size = cpu_to_le32(1);

I guess this comes up later, but such a response means that if we are offered
multiple extents in an add with the more flag set then we always reject all
but the first one.
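 
For reference, accepting N extents in one response would presumably look
something like this (sketch reusing the struct layout quoted above;
'extents' and 'nr' are illustrative):

	size = struct_size(dc_res, extent_list, nr);
	dc_res = kzalloc(size, GFP_KERNEL);
	if (!dc_res)
		return -ENOMEM;

	for (int i = 0; i < nr; i++) {
		dc_res->extent_list[i].dpa_start = cpu_to_le64(extents[i].start);
		dc_res->extent_list[i].length = cpu_to_le64(range_len(&extents[i]));
	}
	dc_res->extent_list_size = cpu_to_le32(nr);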

> +
> +	mbox_cmd = (struct cxl_mbox_cmd) {
> +		.opcode = opcode,
> +		.size_in = size,
> +		.payload_in = dc_res,
> +	};
> +
> +	return cxl_internal_send_cmd(mds, &mbox_cmd);
> +}
> +
>  static struct cxl_memdev_state *
>  cxled_to_mds(struct cxl_endpoint_decoder *cxled)
>  {
> @@ -1029,6 +1055,23 @@ cxled_to_mds(struct cxl_endpoint_decoder *cxled)
>  	return container_of(cxlds, struct cxl_memdev_state, cxlds);
>  }
>  
> +void cxl_release_ed_extent(struct cxl_ed_extent *extent)
> +{
> +	struct cxl_endpoint_decoder *cxled = extent->cxled;
> +	struct cxl_memdev_state *mds = cxled_to_mds(cxled);
> +	struct device *dev = mds->cxlds.dev;
> +	int rc;
> +
> +	dev_dbg(dev, "Releasing DC extent DPA %#llx - %#llx\n",
> +		extent->dpa_range.start, extent->dpa_range.end);
> +
> +	rc = cxl_send_dc_cap_response(mds, &extent->dpa_range, CXL_MBOX_OP_RELEASE_DC);

Long line that doesn't really need to be.
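 
e.g., wrapped (no functional change):

	rc = cxl_send_dc_cap_response(mds, &extent->dpa_range,
				      CXL_MBOX_OP_RELEASE_DC);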

> +	if (rc)
> +		dev_dbg(dev, "Failed to respond releasing extent DPA %#llx - %#llx; %d\n",
> +			extent->dpa_range.start, extent->dpa_range.end, rc);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_release_ed_extent, CXL);
> +
>  static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
>  				    enum cxl_event_log_type type)
>  {
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 3e563ab29afe..7635ff109578 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -1450,11 +1450,81 @@ static int cxl_region_validate_position(struct cxl_region *cxlr,
>  	return 0;
>  }

>  static int cxl_region_attach_position(struct cxl_region *cxlr,
> @@ -2684,6 +2754,7 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
>  
>  	dev = &cxlr_dax->dev;
>  	cxlr_dax->cxlr = cxlr;
> +	cxlr->cxlr_dax = cxlr_dax;
>  	device_initialize(dev);
>  	lockdep_set_class(&dev->mutex, &cxl_dax_region_key);
>  	device_set_pm_not_required(dev);
> @@ -2799,7 +2870,10 @@ static int cxl_region_read_extents(struct cxl_region *cxlr)
>  static void cxlr_dax_unregister(void *_cxlr_dax)
>  {
>  	struct cxl_dax_region *cxlr_dax = _cxlr_dax;
> +	struct cxl_region *cxlr = cxlr_dax->cxlr;
>  
> +	cxlr->cxlr_dax = NULL;
> +	cxlr_dax->cxlr = NULL;
>  	device_unregister(&cxlr_dax->dev);
>  }
>  
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index d585f5fdd3ae..5379ad7f5852 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h


> +/**
> + * struct region_extent - CXL DAX region extent
> + * @dev: device representing this extent
> + * @hpa_range: HPA range of this extent
> + * @label: label of the extent
> + * @ed_ext: Endpoint decoder extent which backs this extent
> + */
> +#define DAX_EXTENT_LABEL_LEN 64

Something called DAX_* doesn't belong in this header...
Either give a CXL_DAX_ prefix or move the definition if appropriate.
Alison Schofield April 11, 2024, 12:09 a.m. UTC | #4
On Sun, Mar 24, 2024 at 04:18:19PM -0700, Ira Weiny wrote:
> From: Navneet Singh <navneet.singh@intel.com>
> 
> Once all extents of an interleave set are present, a region must
> surface an extent to the region.

Why the vague words - realize and surface?

Maybe slip the word DCD in the commit msg:
	cxl/extent: Create DCD region extent devices

And be more explicit about where we are in the setup process:

Once the region driver discovers all the extents of an interleave set,
a region extent device must be created for every device extent found.

Maybe a rough example, but my intent is to segue from the fact that
the region driver has done its part; now we are here, creating the
region extent devices. Should that be 'DAX' region extent devices?

> 
> Without interleaving, endpoint decoder and region extents have a 1:1
> relationship.  Future support for IW > 1 will maintain an N:1
> relationship between the device extents and region extents.
> 
> Create a region extent device for every device extent found.  Release of
> the extent device triggers a response to the underlying hardware extent.
> 
> There is no strong use case to support the addition of extents which
> overlap previously accepted extent ranges.  Reject such new extents
> until such time as a good use case emerges.
> 
> Expose the necessary details of region extents by creating the following
> sysfs entries.
> 
> 	/sys/bus/cxl/devices/dax_regionX/extentY
> 	/sys/bus/cxl/devices/dax_regionX/extentY/offset
> 	/sys/bus/cxl/devices/dax_regionX/extentY/length
> 	/sys/bus/cxl/devices/dax_regionX/extentY/label
> 
> The use of the extent devices by the DAX layer is deferred to later
> patches.

You mean later in this set, right?


> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> 
> ---
> Changes for v1
> [iweiny: new patch]
> [iweiny: Rename 'dr_extent' to 'region_extent']
> ---
>  drivers/cxl/core/Makefile |   1 +
>  drivers/cxl/core/extent.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/core/mbox.c   |  43 +++++++++++++++
>  drivers/cxl/core/region.c |  76 +++++++++++++++++++++++++-
>  drivers/cxl/cxl.h         |  37 +++++++++++++
>  tools/testing/cxl/Kbuild  |   1 +
>  6 files changed, 290 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
> index 9259bcc6773c..35c5c76bfcf1 100644
> --- a/drivers/cxl/core/Makefile
> +++ b/drivers/cxl/core/Makefile
> @@ -14,5 +14,6 @@ cxl_core-y += pci.o
>  cxl_core-y += hdm.o
>  cxl_core-y += pmu.o
>  cxl_core-y += cdat.o
> +cxl_core-y += extent.o
>  cxl_core-$(CONFIG_TRACING) += trace.o
>  cxl_core-$(CONFIG_CXL_REGION) += region.o
> diff --git a/drivers/cxl/core/extent.c b/drivers/cxl/core/extent.c
> new file mode 100644
> index 000000000000..487c220f1c3c
> --- /dev/null
> +++ b/drivers/cxl/core/extent.c
> @@ -0,0 +1,133 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
> +
> +#include <linux/device.h>
> +#include <linux/slab.h>
> +#include <cxl.h>
> +
> +static DEFINE_IDA(cxl_extent_ida);
> +
> +static ssize_t offset_show(struct device *dev, struct device_attribute *attr,
> +			 char *buf)
> +{
> +	struct region_extent *reg_ext = to_region_extent(dev);
> +
> +	return sysfs_emit(buf, "%pa\n", &reg_ext->hpa_range.start);
> +}
> +static DEVICE_ATTR_RO(offset);
> +
> +static ssize_t length_show(struct device *dev, struct device_attribute *attr,
> +			 char *buf)
> +{
> +	struct region_extent *reg_ext = to_region_extent(dev);
> +	u64 length = range_len(&reg_ext->hpa_range);
> +
> +	return sysfs_emit(buf, "%pa\n", &length);
> +}
> +static DEVICE_ATTR_RO(length);
> +
> +static ssize_t label_show(struct device *dev, struct device_attribute *attr,
> +			  char *buf)
> +{
> +	struct region_extent *reg_ext = to_region_extent(dev);
> +
> +	return sysfs_emit(buf, "%s\n", reg_ext->label);
> +}
> +static DEVICE_ATTR_RO(label);
> +
> +static struct attribute *region_extent_attrs[] = {
> +	&dev_attr_offset.attr,
> +	&dev_attr_length.attr,
> +	&dev_attr_label.attr,
> +	NULL,
> +};
> +
> +static const struct attribute_group region_extent_attribute_group = {
> +	.attrs = region_extent_attrs,
> +};
> +
> +static const struct attribute_group *region_extent_attribute_groups[] = {
> +	&region_extent_attribute_group,
> +	NULL,
> +};
> +
> +static void region_extent_release(struct device *dev)
> +{
> +	struct region_extent *reg_ext = to_region_extent(dev);
> +
> +	cxl_release_ed_extent(&reg_ext->ed_ext);
> +	ida_free(&cxl_extent_ida, reg_ext->dev.id);
> +	kfree(reg_ext);
> +}
> +
> +static const struct device_type region_extent_type = {
> +	.name = "extent",
> +	.release = region_extent_release,
> +	.groups = region_extent_attribute_groups,
> +};
> +
> +bool is_region_extent(struct device *dev)
> +{
> +	return dev->type == &region_extent_type;
> +}
> +EXPORT_SYMBOL_NS_GPL(is_region_extent, CXL);
> +
> +static void region_extent_unregister(void *ext)
> +{
> +	struct region_extent *reg_ext = ext;
> +
> +	dev_dbg(&reg_ext->dev, "DAX region rm extent HPA %#llx - %#llx\n",
> +		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
> +	device_unregister(&reg_ext->dev);
> +}
> +
> +int dax_region_create_ext(struct cxl_dax_region *cxlr_dax,
> +			  struct range *hpa_range,
> +			  const char *label,
> +			  struct range *dpa_range,
> +			  struct cxl_endpoint_decoder *cxled)
> +{
> +	struct region_extent *reg_ext;
> +	struct device *dev;
> +	int rc, id;
> +
> +	id = ida_alloc(&cxl_extent_ida, GFP_KERNEL);
> +	if (id < 0)
> +		return -ENOMEM;
> +
> +	reg_ext = kzalloc(sizeof(*reg_ext), GFP_KERNEL);
> +	if (!reg_ext)
> +		return -ENOMEM;
> +
> +	reg_ext->hpa_range = *hpa_range;
> +	reg_ext->ed_ext.dpa_range = *dpa_range;
> +	reg_ext->ed_ext.cxled = cxled;
> +	snprintf(reg_ext->label, DAX_EXTENT_LABEL_LEN, "%s", label);
> +
> +	dev = &reg_ext->dev;
> +	device_initialize(dev);
> +	dev->id = id;
> +	device_set_pm_not_required(dev);
> +	dev->parent = &cxlr_dax->dev;
> +	dev->type = &region_extent_type;
> +	rc = dev_set_name(dev, "extent%d", dev->id);
> +	if (rc)
> +		goto err;
> +
> +	rc = device_add(dev);
> +	if (rc)
> +		goto err;
> +
> +	dev_dbg(dev, "DAX region extent HPA %#llx - %#llx\n",
> +		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
> +
> +	return devm_add_action_or_reset(&cxlr_dax->dev, region_extent_unregister,
> +	reg_ext);
> +
> +err:
> +	dev_err(&cxlr_dax->dev, "Failed to initialize DAX extent dev HPA %#llx - %#llx\n",
> +		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
> +
> +	put_device(dev);
> +	return rc;
> +}
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 9e33a0976828..6b00e717e42b 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -1020,6 +1020,32 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds,
>  	return rc;
>  }
>  
> +static int cxl_send_dc_cap_response(struct cxl_memdev_state *mds,
> +				    struct range *extent, int opcode)
> +{
> +	struct cxl_mbox_cmd mbox_cmd;
> +	size_t size;
> +
> +	struct cxl_mbox_dc_response *dc_res __free(kfree);
> +	size = struct_size(dc_res, extent_list, 1);
> +	dc_res = kzalloc(size, GFP_KERNEL);
> +	if (!dc_res)
> +		return -ENOMEM;
> +
> +	dc_res->extent_list[0].dpa_start = cpu_to_le64(extent->start);
> +	memset(dc_res->extent_list[0].reserved, 0, 8);
> +	dc_res->extent_list[0].length = cpu_to_le64(range_len(extent));
> +	dc_res->extent_list_size = cpu_to_le32(1);

Is the cpu_to_le32(1) necessary?  
I notice similar for .extent_cnt, .start_extent_index in mbox.c


> +
> +	mbox_cmd = (struct cxl_mbox_cmd) {
> +		.opcode = opcode,
> +		.size_in = size,
> +		.payload_in = dc_res,
> +	};
> +
> +	return cxl_internal_send_cmd(mds, &mbox_cmd);
> +}
> +
>  static struct cxl_memdev_state *
>  cxled_to_mds(struct cxl_endpoint_decoder *cxled)
>  {
> @@ -1029,6 +1055,23 @@ cxled_to_mds(struct cxl_endpoint_decoder *cxled)
>  	return container_of(cxlds, struct cxl_memdev_state, cxlds);
>  }
>  
> +void cxl_release_ed_extent(struct cxl_ed_extent *extent)
> +{
> +	struct cxl_endpoint_decoder *cxled = extent->cxled;
> +	struct cxl_memdev_state *mds = cxled_to_mds(cxled);
> +	struct device *dev = mds->cxlds.dev;
> +	int rc;
> +
> +	dev_dbg(dev, "Releasing DC extent DPA %#llx - %#llx\n",
> +		extent->dpa_range.start, extent->dpa_range.end);
> +
> +	rc = cxl_send_dc_cap_response(mds, &extent->dpa_range, CXL_MBOX_OP_RELEASE_DC);
> +	if (rc)
> +		dev_dbg(dev, "Failed to respond releasing extent DPA %#llx - %#llx; %d\n",
> +			extent->dpa_range.start, extent->dpa_range.end, rc);

Don't repeat the start/end details on the second dev_dbg(), add rc value
only.
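 
e.g. (sketch, dropping the repeated range):

	if (rc)
		dev_dbg(dev, "Failed to respond releasing extent; rc=%d\n", rc);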

> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_release_ed_extent, CXL);
> +
>  static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
>  				    enum cxl_event_log_type type)
>  {
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 3e563ab29afe..7635ff109578 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -1450,11 +1450,81 @@ static int cxl_region_validate_position(struct cxl_region *cxlr,
>  	return 0;
>  }
>  
> +static int extent_check_overlap(struct device *dev, void *arg)
> +{
> +	struct range *new_range = arg;
> +	struct region_extent *ext;
> +
> +	if (!is_region_extent(dev))
> +		return 0;
> +
> +	ext = to_region_extent(dev);
> +	return range_overlaps(&ext->hpa_range, new_range);
> +}
> +
> +static int extent_overlaps(struct cxl_dax_region *cxlr_dax,
> +			   struct range *hpa_range)
> +{
> +	struct device *dev __free(put_device) =
> +		device_find_child(&cxlr_dax->dev, hpa_range, extent_check_overlap);
> +
> +	if (dev)
> +		return -EINVAL;
> +	return 0;
> +}
> +
>  /* Callers are expected to ensure cxled has been attached to a region */
>  int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled,
>  			  struct cxl_dc_extent *dc_extent)
>  {
> -	return 0;
> +	struct cxl_region *cxlr = cxled->cxld.region;
> +	struct range ext_dpa_range, ext_hpa_range;
> +	struct device *dev = &cxlr->dev;
> +	resource_size_t dpa_offset, hpa;
> +
> +	/*
> +	 * Interleave ways == 1 means this corresponds to a 1:1 mapping between
> +	 * device extents and DAX region extents.  Future implementations
> +	 * should hold DC region extents here until the full dax region extent
> +	 * can be realized.
> +	 */
> +	if (cxlr->params.interleave_ways != 1) {
> +		dev_err(dev, "Interleaving DC not supported\n");
> +		return -EINVAL;
> +	}
> +
> +	ext_dpa_range = (struct range) {
> +		.start = le64_to_cpu(dc_extent->start_dpa),
> +		.end = le64_to_cpu(dc_extent->start_dpa) +
> +			le64_to_cpu(dc_extent->length) - 1,
> +	};
> +
> +	dev_dbg(dev, "Adding DC extent DPA %#llx - %#llx\n",
> +		ext_dpa_range.start, ext_dpa_range.end);
> +
> +	/*
> +	 * Without interleave...
> +	 * HPA offset == DPA offset
> +	 * ... but do the math anyway
> +	 */
> +	dpa_offset = ext_dpa_range.start - cxled->dpa_res->start;
> +	hpa = cxled->cxld.hpa_range.start + dpa_offset;
> +
> +	ext_hpa_range = (struct range) {
> +		.start = hpa - cxlr->cxlr_dax->hpa_range.start,
> +		.end = ext_hpa_range.start + range_len(&ext_dpa_range) - 1,
> +	};
> +
> +	if (extent_overlaps(cxlr->cxlr_dax, &ext_hpa_range))
> +		return -EINVAL;
> +
> +	dev_dbg(dev, "Realizing region extent at HPA %#llx - %#llx\n",
> +		ext_hpa_range.start, ext_hpa_range.end);
> +
> +	return dax_region_create_ext(cxlr->cxlr_dax, &ext_hpa_range,
> +				     (char *)dc_extent->tag,
> +				     &ext_dpa_range,
> +				     cxled);
>  }
>  
>  static int cxl_region_attach_position(struct cxl_region *cxlr,
> @@ -2684,6 +2754,7 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
>  
>  	dev = &cxlr_dax->dev;
>  	cxlr_dax->cxlr = cxlr;
> +	cxlr->cxlr_dax = cxlr_dax;
>  	device_initialize(dev);
>  	lockdep_set_class(&dev->mutex, &cxl_dax_region_key);
>  	device_set_pm_not_required(dev);
> @@ -2799,7 +2870,10 @@ static int cxl_region_read_extents(struct cxl_region *cxlr)
>  static void cxlr_dax_unregister(void *_cxlr_dax)
>  {
>  	struct cxl_dax_region *cxlr_dax = _cxlr_dax;
> +	struct cxl_region *cxlr = cxlr_dax->cxlr;
>  
> +	cxlr->cxlr_dax = NULL;
> +	cxlr_dax->cxlr = NULL;
>  	device_unregister(&cxlr_dax->dev);
>  }
>  
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index d585f5fdd3ae..5379ad7f5852 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -564,6 +564,7 @@ struct cxl_region_params {
>   * @type: Endpoint decoder target type
>   * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown
>   * @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge
> + * @cxlr_dax: (for DC regions) cached copy of CXL DAX bridge
>   * @flags: Region state flags
>   * @params: active + config params for the region
>   */
> @@ -574,6 +575,7 @@ struct cxl_region {
>  	enum cxl_decoder_type type;
>  	struct cxl_nvdimm_bridge *cxl_nvb;
>  	struct cxl_pmem_region *cxlr_pmem;
> +	struct cxl_dax_region *cxlr_dax;
>  	unsigned long flags;
>  	struct cxl_region_params params;
>  };
> @@ -617,6 +619,41 @@ struct cxl_dax_region {
>  	struct range hpa_range;
>  };
>  
> +/**
> + * struct cxl_ed_extent - Extent within an endpoint decoder
> + * @dpa_range: DPA range this extent covers within the decoder
> + * @cxled: reference to the endpoint decoder
> + */
> +struct cxl_ed_extent {
> +	struct range dpa_range;
> +	struct cxl_endpoint_decoder *cxled;
> +};
> +void cxl_release_ed_extent(struct cxl_ed_extent *extent);
> +
> +/**
> + * struct region_extent - CXL DAX region extent
> + * @dev: device representing this extent
> + * @hpa_range: HPA range of this extent
> + * @label: label of the extent
> + * @ed_ext: Endpoint decoder extent which backs this extent
> + */
> +#define DAX_EXTENT_LABEL_LEN 64
> +struct region_extent {
> +	struct device dev;
> +	struct range hpa_range;
> +	char label[DAX_EXTENT_LABEL_LEN];
> +	struct cxl_ed_extent ed_ext;
> +};
> +
> +int dax_region_create_ext(struct cxl_dax_region *cxlr_dax,
> +			  struct range *hpa_range,
> +			  const char *label,
> +			  struct range *dpa_range,
> +			  struct cxl_endpoint_decoder *cxled);
> +
> +bool is_region_extent(struct device *dev);
> +#define to_region_extent(dev) container_of(dev, struct region_extent, dev)
> +
>  /**
>   * struct cxl_port - logical collection of upstream port devices and
>   *		     downstream port devices to construct a CXL memory
> diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
> index 030b388800f0..dc0cc1d5e6a0 100644
> --- a/tools/testing/cxl/Kbuild
> +++ b/tools/testing/cxl/Kbuild
> @@ -60,6 +60,7 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o
>  cxl_core-y += $(CXL_CORE_SRC)/hdm.o
>  cxl_core-y += $(CXL_CORE_SRC)/pmu.o
>  cxl_core-y += $(CXL_CORE_SRC)/cdat.o
> +cxl_core-y += $(CXL_CORE_SRC)/extent.o
>  cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
>  cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
>  cxl_core-y += config_check.o
> 
> -- 
> 2.44.0
>
Ira Weiny April 24, 2024, 7:57 p.m. UTC | #5
Dave Jiang wrote:
> 
> 
> On 3/24/24 4:18 PM, ira.weiny@intel.com wrote:
> > From: Navneet Singh <navneet.singh@intel.com>
> > 

[snip]

> > diff --git a/drivers/cxl/core/extent.c b/drivers/cxl/core/extent.c
> > new file mode 100644
> > index 000000000000..487c220f1c3c
> > --- /dev/null
> > +++ b/drivers/cxl/core/extent.c
> > @@ -0,0 +1,133 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
> > +
> > +#include <linux/device.h>
> > +#include <linux/slab.h>
> > +#include <cxl.h>
> > +
> > +static DEFINE_IDA(cxl_extent_ida);
> 
> According to Documentation/core-api/idr.rst, IDR interface is deprecated and
> xarray usage is preferred.

IDA != IDR

ida_alloc() provides a unique, unused id for the device.  I worked hard to
eliminate all extra references to the extent objects so as to ensure object
lifetimes.  So I'm keeping this for now.
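 
To illustrate the distinction (sketch; 'extent_xa' is hypothetical):

	/* IDA: hands out a unique id only; no pointer to the extent is kept */
	id = ida_alloc(&cxl_extent_ida, GFP_KERNEL);

	/* xarray/IDR: would keep an id -> reg_ext mapping, an extra reference */
	u32 xa_id;
	rc = xa_alloc(&extent_xa, &xa_id, reg_ext, xa_limit_32b, GFP_KERNEL);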

> > +
> > +static ssize_t offset_show(struct device *dev, struct device_attribute *attr,
> > +			 char *buf)
> 
> Parameter alignment a bit off here? and some of the other functions as well.

Thanks, fixed.

[snip]

> > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> > index 9e33a0976828..6b00e717e42b 100644
> > --- a/drivers/cxl/core/mbox.c
> > +++ b/drivers/cxl/core/mbox.c
> > @@ -1020,6 +1020,32 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds,
> >  	return rc;
> >  }
> >  
> > +static int cxl_send_dc_cap_response(struct cxl_memdev_state *mds,
> > +				    struct range *extent, int opcode)
> > +{
> > +	struct cxl_mbox_cmd mbox_cmd;
> > +	size_t size;
> > +
> > +	struct cxl_mbox_dc_response *dc_res __free(kfree);
> > +	size = struct_size(dc_res, extent_list, 1);
> > +	dc_res = kzalloc(size, GFP_KERNEL);
> > +	if (!dc_res)
> > +		return -ENOMEM;
> > +
> > +	dc_res->extent_list[0].dpa_start = cpu_to_le64(extent->start);
> > +	memset(dc_res->extent_list[0].reserved, 0, 8);
> 
> Not needed. kzalloc already zeroed.

Thanks, Fan mentioned it too.

Ira
Ira Weiny April 30, 2024, 3:23 a.m. UTC | #6
Jonathan Cameron wrote:
> On Sun, 24 Mar 2024 16:18:19 -0700
> ira.weiny@intel.com wrote:
> 
> > From: Navneet Singh <navneet.singh@intel.com>
> > 
> > Once all extents of an interleave set are present, a region must
> > surface an extent to the region.
> > 
> > Without interleaving, endpoint decoder and region extents have a 1:1
> > relationship.  Future support for IW > 1 will maintain an N:1
> > relationship between the device extents and region extents.
> > 
> > Create a region extent device for every device extent found.  Release of
> > the extent device triggers a response to the underlying hardware extent.
> > 
> > There is no strong use case to support the addition of extents which
> > overlap previously accepted extent ranges.  Reject such new extents
> > until such time as a good use case emerges.
> > 
> > Expose the necessary details of region extents by creating the following
> > sysfs entries.
> > 
> > 	/sys/bus/cxl/devices/dax_regionX/extentY
> > 	/sys/bus/cxl/devices/dax_regionX/extentY/offset
> > 	/sys/bus/cxl/devices/dax_regionX/extentY/length
> > 	/sys/bus/cxl/devices/dax_regionX/extentY/label
> 
> Docs?

That is a good idea.

> The label in particular worries me a little as I'm not sure what
> is in it.

I envisioned a pass-through of the tag.

>
> If it's the tag, one possible format is a uuid (not a coincidence
> that it is the same length) and interpreting that as characters isn't
> going to get us far.  I wonder if we have to treat it as a binary attr
> given we have no idea what it is.

In thinking about this more (and running some experiments), none of these are
strictly necessary in this initial implementation.  No code currently uses
them directly.

I questioned these in the past and I've done so again over the weekend.

I was about to rip them out entirely when I remembered Gregory Price's
comments on Discord.  There he indicated a desire to very carefully place
dax devices.  Without at least the offset and length above (and to a
lesser extent the label) this can't be done.

One still has to create and delete dax devices carefully
to place a dax device in a specific place.  But the above give the user
the information to do so.  Without it the user must coordinate with the FM
even more (which we could require initially).

One particular issue is the simplification I made within the kernel to
track extents.  The extents are no longer ordered within an xarray.

This means a user can't accurately predict which extent will be used when
allocating a dax device.  One has to experiment and look at the resulting
mappings of the dax device to see if it got allocated in the right place.

For example:


|      DC region                                      |
|-----------------------------------------------------|
|--------|          |--------|                        |
| (ext0) |          | (ext1) |                        |
| (1G)   |          | (1G)   |                        |

If the above extents were surfaced in the following order:

	ext1
	ext0

Then, if a dax device of size 1G is created, the dax mapping would be:


|      DC region                                      |
|-----------------------------------------------------|
|--------|          |--------|                        |
| (ext0) |          | (ext1) |                        |
| (1G)   |          | (1G)   |                        |
|                   |(daxX.1)|                        |

Allocating another dax device would result in:

|(daxX.2)|          |(daxX.1)|                        |

I don't think this is exactly what the user is going to expect.  This can
be resolved by looking at the dax device mappings though.[0]  So I'm going
to leave this for now.  But I expect some additional porcelain is going to
be required to fully meet Gregory's requirements.

[0]
	/sys/bus/dax/devices/daxX.Y/mapping[0..N]/start
	/sys/bus/dax/devices/daxX.Y/mapping[0..N]/end

Back to the label field:  It is currently just the 'tag' of the individual
extent (because no interleaving).  My vision for the interleave case would
be for the kernel to assemble device extents into a region extent only if
the tags match and export that.

Thinking on it more though we should leave label out for now.  This is the
second time it has been questioned.

> Otherwise a query inline that may well be answered in later patches.
> 
> > 
> > The use of the extent devices by the DAX layer is deferred to later
> > patches.
> > 
> > Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> > Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> > Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> > 

[snip]

> > +int dax_region_create_ext(struct cxl_dax_region *cxlr_dax,
> > +			  struct range *hpa_range,
> > +			  const char *label,
> > +			  struct range *dpa_range,
> > +			  struct cxl_endpoint_decoder *cxled)
> > +{
> > +	struct region_extent *reg_ext;
> > +	struct device *dev;
> > +	int rc, id;
> > +
> > +	id = ida_alloc(&cxl_extent_ida, GFP_KERNEL);
> > +	if (id < 0)
> > +		return -ENOMEM;
> 
> Whilst it doesn't matter hugely, it's nice if the release does things
> in opposite order of the creation. So perhaps move the ida_alloc
> after kzalloc or reg_ext?

Actually there is an ida resource leak here if the alloc fails.  I'll fix
that too.
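Something like this (untested sketch), with the ida_alloc moved after the
kzalloc as you suggest and the id unwound on failure:

	reg_ext = kzalloc(sizeof(*reg_ext), GFP_KERNEL);
	if (!reg_ext)
		return -ENOMEM;

	id = ida_alloc(&cxl_extent_ida, GFP_KERNEL);
	if (id < 0) {
		kfree(reg_ext);
		return id;
	}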

> 
> > +
> > +	reg_ext = kzalloc(sizeof(*reg_ext), GFP_KERNEL);
> > +	if (!reg_ext)
> > +		return -ENOMEM;
> > +
> > +	reg_ext->hpa_range = *hpa_range;
> > +	reg_ext->ed_ext.dpa_range = *dpa_range;
> > +	reg_ext->ed_ext.cxled = cxled;
> > +	snprintf(reg_ext->label, DAX_EXTENT_LABEL_LEN, "%s", label);
> > +
> > +	dev = &reg_ext->dev;
> > +	device_initialize(dev);
> > +	dev->id = id;
> > +	device_set_pm_not_required(dev);
> > +	dev->parent = &cxlr_dax->dev;
> > +	dev->type = &region_extent_type;
> > +	rc = dev_set_name(dev, "extent%d", dev->id);
> > +	if (rc)
> > +		goto err;
> > +
> > +	rc = device_add(dev);
> > +	if (rc)
> > +		goto err;
> > +
> > +	dev_dbg(dev, "DAX region extent HPA %#llx - %#llx\n",
> > +		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
> > +
> > +	return devm_add_action_or_reset(&cxlr_dax->dev, region_extent_unregister,
> > +	reg_ext);
> 
> Indent

Yep.

> 
> > +
> > +err:
> > +	dev_err(&cxlr_dax->dev, "Failed to initialize DAX extent dev HPA %#llx - %#llx\n",
> > +		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
> > +
> > +	put_device(dev);
> > +	return rc;
> > +}
> > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> > index 9e33a0976828..6b00e717e42b 100644
> > --- a/drivers/cxl/core/mbox.c
> > +++ b/drivers/cxl/core/mbox.c
> > @@ -1020,6 +1020,32 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds,
> >  	return rc;
> >  }
> >  
> > +static int cxl_send_dc_cap_response(struct cxl_memdev_state *mds,
> > +				    struct range *extent, int opcode)
> > +{
> > +	struct cxl_mbox_cmd mbox_cmd;
> > +	size_t size;
> > +
> > +	struct cxl_mbox_dc_response *dc_res __free(kfree);
> > +	size = struct_size(dc_res, extent_list, 1);
> > +	dc_res = kzalloc(size, GFP_KERNEL);
> > +	if (!dc_res)
> > +		return -ENOMEM;
> > +
> > +	dc_res->extent_list[0].dpa_start = cpu_to_le64(extent->start);
> > +	memset(dc_res->extent_list[0].reserved, 0, 8);
> > +	dc_res->extent_list[0].length = cpu_to_le64(range_len(extent));
> > +	dc_res->extent_list_size = cpu_to_le32(1);
> 
> I guess this comes up later, but such a response means that if we are offered
> multiple extents in an add with the more flag set then we always reject all
> but the first one.

I've thought about how to best support the more flag without major
complications.  So far the use of the more flag is IMO more trouble than
it is worth.  I agree that the spec is clear WRT the grouping of a
response with the more flag set but it is very vague on __why__.
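
For the record, extending the responder to cover a group of N extents would
look roughly like this (sketch; the signature and the caller-side grouping
are assumed, not part of this series):

	static int cxl_send_dc_response(struct cxl_memdev_state *mds, int opcode,
					struct range *extents, int nr_extents)
	{
		struct cxl_mbox_dc_response *dc_res __free(kfree) = NULL;
		size_t size = struct_size(dc_res, extent_list, nr_extents);
		struct cxl_mbox_cmd mbox_cmd;
		int i;

		dc_res = kzalloc(size, GFP_KERNEL);
		if (!dc_res)
			return -ENOMEM;

		for (i = 0; i < nr_extents; i++) {
			dc_res->extent_list[i].dpa_start =
				cpu_to_le64(extents[i].start);
			dc_res->extent_list[i].length =
				cpu_to_le64(range_len(&extents[i]));
		}
		dc_res->extent_list_size = cpu_to_le32(nr_extents);

		mbox_cmd = (struct cxl_mbox_cmd) {
			.opcode = opcode,
			.size_in = size,
			.payload_in = dc_res,
		};

		return cxl_internal_send_cmd(mds, &mbox_cmd);
	}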

> 
> > +
> > +	mbox_cmd = (struct cxl_mbox_cmd) {
> > +		.opcode = opcode,
> > +		.size_in = size,
> > +		.payload_in = dc_res,
> > +	};
> > +
> > +	return cxl_internal_send_cmd(mds, &mbox_cmd);
> > +}
> > +
> >  static struct cxl_memdev_state *
> >  cxled_to_mds(struct cxl_endpoint_decoder *cxled)
> >  {
> > @@ -1029,6 +1055,23 @@ cxled_to_mds(struct cxl_endpoint_decoder *cxled)
> >  	return container_of(cxlds, struct cxl_memdev_state, cxlds);
> >  }
> >  
> > +void cxl_release_ed_extent(struct cxl_ed_extent *extent)
> > +{
> > +	struct cxl_endpoint_decoder *cxled = extent->cxled;
> > +	struct cxl_memdev_state *mds = cxled_to_mds(cxled);
> > +	struct device *dev = mds->cxlds.dev;
> > +	int rc;
> > +
> > +	dev_dbg(dev, "Releasing DC extent DPA %#llx - %#llx\n",
> > +		extent->dpa_range.start, extent->dpa_range.end);
> > +
> > +	rc = cxl_send_dc_cap_response(mds, &extent->dpa_range, CXL_MBOX_OP_RELEASE_DC);
> 
> Long line that doesn't really need to be.

Yep

> 
> > +	if (rc)
> > +		dev_dbg(dev, "Failed to respond releasing extent DPA %#llx - %#llx; %d\n",
> > +			extent->dpa_range.start, extent->dpa_range.end, rc);
> > +}
> > +EXPORT_SYMBOL_NS_GPL(cxl_release_ed_extent, CXL);
> > +
> >  static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
> >  				    enum cxl_event_log_type type)
> >  {
> > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> > index 3e563ab29afe..7635ff109578 100644
> > --- a/drivers/cxl/core/region.c
> > +++ b/drivers/cxl/core/region.c
> > @@ -1450,11 +1450,81 @@ static int cxl_region_validate_position(struct cxl_region *cxlr,
> >  	return 0;
> >  }
> 
> >  static int cxl_region_attach_position(struct cxl_region *cxlr,
> > @@ -2684,6 +2754,7 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
> >  
> >  	dev = &cxlr_dax->dev;
> >  	cxlr_dax->cxlr = cxlr;
> > +	cxlr->cxlr_dax = cxlr_dax;
> >  	device_initialize(dev);
> >  	lockdep_set_class(&dev->mutex, &cxl_dax_region_key);
> >  	device_set_pm_not_required(dev);
> > @@ -2799,7 +2870,10 @@ static int cxl_region_read_extents(struct cxl_region *cxlr)
> >  static void cxlr_dax_unregister(void *_cxlr_dax)
> >  {
> >  	struct cxl_dax_region *cxlr_dax = _cxlr_dax;
> > +	struct cxl_region *cxlr = cxlr_dax->cxlr;
> >  
> > +	cxlr->cxlr_dax = NULL;
> > +	cxlr_dax->cxlr = NULL;
> >  	device_unregister(&cxlr_dax->dev);
> >  }
> >  
> > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> > index d585f5fdd3ae..5379ad7f5852 100644
> > --- a/drivers/cxl/cxl.h
> > +++ b/drivers/cxl/cxl.h
> 
> 
> > +/**
> > + * struct region_extent - CXL DAX region extent
> > + * @dev: device representing this extent
> > + * @hpa_range: HPA range of this extent
> > + * @label: label of the extent
> > + * @ed_ext: Endpoint decoder extent which backs this extent
> > + */
> > +#define DAX_EXTENT_LABEL_LEN 64
> 
> Something called DAX_* doesn't belong in this header...
> Either give a CXL_DAX_ prefix or move the definition if appropriate.
> 

I've removed this as well as the label sysfs instead.

Ira
Dan Williams May 2, 2024, 9:12 p.m. UTC | #7
Ira Weiny wrote:
> Jonathan Cameron wrote:
> > On Sun, 24 Mar 2024 16:18:19 -0700
> > ira.weiny@intel.com wrote:
> > 
> > > From: Navneet Singh <navneet.singh@intel.com>
> > > 
> > > Once all extents of an interleave set are present a region must
> > > surface an extent to the region.
> > > 
> > > Without interleaving; endpoint decoder and region extents have a 1:1
> > > relationship.  Future support for IW > 1 will maintain a N:1
> > > relationship between the device extents and region extents.
> > > 
> > > Create a region extent device for every device extent found.  Release of
> > > the extent device triggers a response to the underlying hardware extent.
> > > 
> > > There is no strong use case to support the addition of extents which
> > > overlap previously accepted extent ranges.  Reject such new extents
> > > until such time as a good use case emerges.
> > > 
> > > Expose the necessary details of region extents by creating the following
> > > sysfs entries.
> > > 
> > > 	/sys/bus/cxl/devices/dax_regionX/extentY
> > > 	/sys/bus/cxl/devices/dax_regionX/extentY/offset
> > > 	/sys/bus/cxl/devices/dax_regionX/extentY/length
> > > 	/sys/bus/cxl/devices/dax_regionX/extentY/label
> > 
> > Docs?
> 
> That is a good idea.
> 
> > The label in particular worries me a little as I'm not sure what
> > is in it.
> 
> I envisioned a pass through of the tag.
> 
> >
> > If it's the tag one possible format is a uuid (not a coincidence
> > that it is the same length) and interpreting that as characters isn't
> > going to get us far.  I wonder if we have to treat it as a binary attr
> > given we have no idea what it is.
> 
> In thinking about this more (and running some experiments): none of these are
> strictly necessary in this initial implementation.  No code currently uses
> them directly.
> 
> I questioned these in the past and I've done so again over the weekend.
> 
> I was about to rip them out entirely when I remembered Gregory Price's
> comments on Discord.  There he indicating a desire to very carefully place
> dax devices.  Without at least the offset and length above (and to a
> lesser extent the label) this can't be done.

Careful placement of dax-devices physically requires an entirely new
allocation ABI. There is the mapping_store() interface that was added
for a specific kexec / VMM fast restore use case, but that never
envisioned the sparse region case. So I do think it is worthwhile to
punt on that question to a later add-on feature.
 
[..]
> I don't think this is exactly what the user is going to expect.  This can
> be resolved by by looking at the dax device mappings though.[0]  So I'm going
> to leave this for now.  But I expect some additional porcelain is going to
> be required to fully meet Gregory's requirements.

Not sure what the exact requirement is, but if it's the typical, "I want
to allocate by tag", then I think there is another potential coarse
grained solution that probably covers most cases. Allow multiple
dax_regions per cxl_dcd_region, where each dax_region manages an
exclusive set of tags.

The host negotiates a dax_region tag layout with the orchestrator and
can then trust that all of the extents that show up in a given dax_region
belong to a given tag or set of tags.

This is not something that needs to be considered in the initial
enabling, but is potentially a way to avoid bolting-on a new fine grained
allocation api after the fact.


> [0]
> 	/sys/bus/dax/devices/daxX.Y/mapping[0..N]/start
> 	/sys/bus/dax/devices/daxX.Y/mapping[0..N]/end
> 
> Back to the label field:  It is currently just the 'tag' of the individual
> extent (because no interleaving).  My vision for the interleave case would
> be for the kernel to assemble device extents into a region extent only if
> the tags match and export that.
> 
> Thinking on it more though we should leave label out for now.  This is the
> second time it has been questioned.

I don't understand the issue. That is a critical piece of information
and it is at the cxl device level

/sys/bus/cxl/devices/dax_regionX/extentY/label

...now I would just call that "tag" and UUID format it (to Jonathan's
point), but I see no rationale to hide what is most likely the most
useful information about an extent.
Ira Weiny May 6, 2024, 4:35 a.m. UTC | #8
Dan Williams wrote:
> Ira Weiny wrote:
> > Jonathan Cameron wrote:
> > > On Sun, 24 Mar 2024 16:18:19 -0700
> > > ira.weiny@intel.com wrote:
> > > 
> > > > From: Navneet Singh <navneet.singh@intel.com>
> > > > 
> > > > Once all extents of an interleave set are present a region must
> > > > surface an extent to the region.
> > > > 
> > > > Without interleaving; endpoint decoder and region extents have a 1:1
> > > > relationship.  Future support for IW > 1 will maintain a N:1
> > > > relationship between the device extents and region extents.
> > > > 
> > > > Create a region extent device for every device extent found.  Release of
> > > > the extent device triggers a response to the underlying hardware extent.
> > > > 
> > > > There is no strong use case to support the addition of extents which
> > > > overlap previously accepted extent ranges.  Reject such new extents
> > > > until such time as a good use case emerges.
> > > > 
> > > > Expose the necessary details of region extents by creating the following
> > > > sysfs entries.
> > > > 
> > > > 	/sys/bus/cxl/devices/dax_regionX/extentY
> > > > 	/sys/bus/cxl/devices/dax_regionX/extentY/offset
> > > > 	/sys/bus/cxl/devices/dax_regionX/extentY/length
> > > > 	/sys/bus/cxl/devices/dax_regionX/extentY/label
> > > 
> > > Docs?
> > 
> > That is a good idea.
> > 
> > > The label in particular worries me a little as I'm not sure what
> > > is in it.
> > 
> > I envisioned a pass through of the tag.
> > 
> > >
> > > If it's the tag one possible format is a uuid (not a coincidence
> > > that it is the same length) and interpreting that as characters isn't
> > > going to get us far.  I wonder if we have to treat it as a binary attr
> > > given we have no idea what it is.
> > 
> > In thinking about this more (and running some experiments): none of these are
> > strictly necessary in this initial implementation.  No code currently uses
> > them directly.
> > 
> > I questioned these in the past and I've done so again over the weekend.
> > 
> > I was about to rip them out entirely when I remembered Gregory Price's
> > comments on Discord.  There he indicating a desire to very carefully place
> > dax devices.  Without at least the offset and length above (and to a
> > lesser extent the label) this can't be done.
> 
> Careful placement of dax-devices physically requires an entirely new
> allocation ABI. There is the mapping_store() interface that was added
> for a specific kexec / VMM fast restore use case, but that never
> envisioned the sparse region case. So I do think it is worthwhile to
> punt on that question to a later add-on feature.

Agreed.

>  
> [..]
> > I don't think this is exactly what the user is going to expect.  This can
> > be resolved by by looking at the dax device mappings though.[0]  So I'm going
> > to leave this for now.  But I expect some additional porcelain is going to
> > be required to fully meet Gregory's requirements.
> 
> Not sure what the exact requirement is, but if it's the typical, "I want
> to allocate by tag",

I'm extrapolating that it will be.  I want to allocate on the first extent.
Tags were removed from the series a while ago.

> then I think there is another potential coarse
> grained solution that probably covers most cases. Allow multiple
> dax_regions per cxl_dcd_regions, where each dax_region manages an
> exclusive set of tags.

I'll have to think on that, because don't dax regions map to specific DPAs on
creation?

> 
> The host negotiates a dax_region tag layout with the orchestrator and
> can then trust that all of the extents that show up in a given dax_region
> belong to a given tag or set of tags.
> 
> This is not something that needs to be considered in the initial
> enabling, but is potentially a way to avoid bolting-on a new fine grained
> allocation api after the fact.
> 

The point is I'm not trying to bolt anything on.  Just trying to explain what
can and can't be done.  The purpose of these entries was to give the user the
ability to see what extents existed and, by correlating the dax mappings, to
see where their dax devices landed.  Careful allocation of dax devices could
result in the use of some extents and not others.  But this was __not__ at all
intended to be done initially.  Just use the space as it comes available
without any tag use at all.

> 
> > [0]
> > 	/sys/bus/dax/devices/daxX.Y/mapping[0..N]/start
> > 	/sys/bus/dax/devices/daxX.Y/mapping[0..N]/end
> > 
> > Back to the label field:  It is currently just the 'tag' of the individual
> > extent (because no interleaving).  My vision for the interleave case would
> > be for the kernel to assemble device extents into a region extent only if
> > the tags match and export that.
> > 
> > Thinking on it more though we should leave label out for now.  This is the
> > second time it has been questioned.
> 
> I don't understand the issue. That is a critical piece of information
> and it is at the cxl device level
> 
> /sys/bus/cxl/devices/dax_regionX/extentY/label
> 
> ...now I would just call that "tag" and UUID format it (to Jonathan's
> point), but I see no rationale to hide what is most likely the most
> useful information about an extent.

The rationale is that the user was not going to use it.  So no use case == no
reason to have it...  yet.  I had code a while back to allocate dax devices on
specific tags.  But that was deemed too different from the current dax
allocation mechanism so it was scrapped...  for now.

I can add it back...  But I'm just getting a bit testy about who wants what and
how this is all going to get used.

Ira
Dan Williams May 7, 2024, 1:30 a.m. UTC | #9
ira.weiny@ wrote:
> From: Navneet Singh <navneet.singh@intel.com>
> 
> Once all extents of an interleave set are present a region must
> surface an extent to the region.
> 
> Without interleaving; endpoint decoder and region extents have a 1:1
> relationship.  Future support for IW > 1 will maintain a N:1
> relationship between the device extents and region extents.
> 
> Create a region extent device for every device extent found.  Release of
> the extent device triggers a response to the underlying hardware extent.
> 
> There is no strong use case to support the addition of extents which
> overlap previously accepted extent ranges.  Reject such new extents
> until such time as a good use case emerges.
> 
> Expose the necessary details of region extents by creating the following
> sysfs entries.
> 
> 	/sys/bus/cxl/devices/dax_regionX/extentY
> 	/sys/bus/cxl/devices/dax_regionX/extentY/offset
> 	/sys/bus/cxl/devices/dax_regionX/extentY/length
> 	/sys/bus/cxl/devices/dax_regionX/extentY/label
> 
> The use of the extent devices by the DAX layer is deferred to later
> patches.
> 
> Signed-off-by: Navneet Singh <navneet.singh@intel.com>
> Co-developed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> 
> ---
> Changes for v1
> [iweiny: new patch]
> [iweiny: Rename 'dr_extent' to 'region_extent']
> ---
>  drivers/cxl/core/Makefile |   1 +
>  drivers/cxl/core/extent.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/core/mbox.c   |  43 +++++++++++++++
>  drivers/cxl/core/region.c |  76 +++++++++++++++++++++++++-
>  drivers/cxl/cxl.h         |  37 +++++++++++++
>  tools/testing/cxl/Kbuild  |   1 +
>  6 files changed, 290 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
> index 9259bcc6773c..35c5c76bfcf1 100644
> --- a/drivers/cxl/core/Makefile
> +++ b/drivers/cxl/core/Makefile
> @@ -14,5 +14,6 @@ cxl_core-y += pci.o
>  cxl_core-y += hdm.o
>  cxl_core-y += pmu.o
>  cxl_core-y += cdat.o
> +cxl_core-y += extent.o
>  cxl_core-$(CONFIG_TRACING) += trace.o
>  cxl_core-$(CONFIG_CXL_REGION) += region.o
> diff --git a/drivers/cxl/core/extent.c b/drivers/cxl/core/extent.c
> new file mode 100644
> index 000000000000..487c220f1c3c
> --- /dev/null
> +++ b/drivers/cxl/core/extent.c
> @@ -0,0 +1,133 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
> +
> +#include <linux/device.h>
> +#include <linux/slab.h>
> +#include <cxl.h>
> +
> +static DEFINE_IDA(cxl_extent_ida);

I would have expected this to be region scoped, that would also allow
them to all be listed as extents on the "cxl" bus because they are CXL
objects.

So at a minimum they would be named:

    dev_set_name(dev, "extent%d.%d", cxlr->id, extent_id)

...but given the idea to have multiple dax_regions as the management
mechanism for coarse routing of tags by regions, this might want to be a
triplet of

    cxlr->id, dcd_dax_region_id, extent_id

...at a minimum let's give some freedom to figure out the tag routing
mechanism, especially with the threat of multiple dax_regions per
cxl_region.

> +
> +static ssize_t offset_show(struct device *dev, struct device_attribute *attr,
> +			 char *buf)

For something called offset I would expect the output to be relative to
the base HPA of the parent cxl_region. Implementation below looks like
an absolute, not an offset.

Now it might be an offset but "hpa_range" refers to an absolute range
everywhere else it appears in the driver.

> +{
> +	struct region_extent *reg_ext = to_region_extent(dev);
> +
> +	return sysfs_emit(buf, "%pa\n", &reg_ext->hpa_range.start);

Note "%pa" is for "phys_addr_t" or "resource_size_t" because those
alternate between "unsigned long" and "unsigned long long". Likely this
is subtly safe because there is currently no place where a phys_addr_t
is larger than a u64, but I would feel better if this assigned to
phys_addr_t or just did %#llx since I think u64 is always an "unsigned
long long"

> +}
> +static DEVICE_ATTR_RO(offset);
> +
> +static ssize_t length_show(struct device *dev, struct device_attribute *attr,
> +			 char *buf)
> +{
> +	struct region_extent *reg_ext = to_region_extent(dev);
> +	u64 length = range_len(&reg_ext->hpa_range);
> +
> +	return sysfs_emit(buf, "%pa\n", &length);

Same %pa vs phys_addr_t vs u64 comment.

> +}
> +static DEVICE_ATTR_RO(length);
> +
> +static ssize_t label_show(struct device *dev, struct device_attribute *attr,
> +			  char *buf)
> +{
> +	struct region_extent *reg_ext = to_region_extent(dev);
> +
> +	return sysfs_emit(buf, "%s\n", reg_ext->label);

I like Jonathan's suggestion of just uuid formatting this and calling it
"tag" since these things are CXL device objects they can have CXL
attributes.

If the tag is empty then just hide this attribute.
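
Roughly (sketch; assumes the label becomes a 'uuid_t tag' member of struct
region_extent):

	static ssize_t tag_show(struct device *dev, struct device_attribute *attr,
				char *buf)
	{
		struct region_extent *reg_ext = to_region_extent(dev);

		return sysfs_emit(buf, "%pUb\n", &reg_ext->tag);
	}
	static DEVICE_ATTR_RO(tag);

	static umode_t region_extent_visible(struct kobject *kobj,
					     struct attribute *a, int n)
	{
		struct device *dev = kobj_to_dev(kobj);
		struct region_extent *reg_ext = to_region_extent(dev);

		/* hide the tag attribute when no tag was supplied */
		if (a == &dev_attr_tag.attr && uuid_is_null(&reg_ext->tag))
			return 0;
		return a->mode;
	}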

> +}
> +static DEVICE_ATTR_RO(label);
> +
> +static struct attribute *region_extent_attrs[] = {
> +	&dev_attr_offset.attr,
> +	&dev_attr_length.attr,
> +	&dev_attr_label.attr,
> +	NULL,
> +};
> +
> +static const struct attribute_group region_extent_attribute_group = {
> +	.attrs = region_extent_attrs,
> +};
> +
> +static const struct attribute_group *region_extent_attribute_groups[] = {
> +	&region_extent_attribute_group,
> +	NULL,
> +};

Just use __ATTRIBUTE_GROUPS() helper for this. Note I recommended that
over ATTRIBUTE_GROUPS() since the latter does not allow an is_visible()
callback to be specified.
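
i.e. (sketch, wiring in the is_visible() callback from the tag example above):

	static const struct attribute_group region_extent_group = {
		.attrs = region_extent_attrs,
		.is_visible = region_extent_visible,
	};
	__ATTRIBUTE_GROUPS(region_extent);

...which generates the region_extent_groups[] array for the device_type.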

> +static void region_extent_release(struct device *dev)
> +{
> +	struct region_extent *reg_ext = to_region_extent(dev);

Does the reg_ext abbreviation really buy that much versus just typing
out region_extent?

> +
> +	cxl_release_ed_extent(&reg_ext->ed_ext);

No, Linux object release time is too late to touch the hardware state.
Unregister time is more appropriate. However, this gets back to the whole
question about "why should the kernel automatically send a release for
extents that it found present at the beginning of time?"

For example the driver does not reset endpoint decoders on driver
shutdown, why should it free extents just because the driver got
unbound?

Now the question becomes when should it free extents? I am thinking the
policy to start should be identical to endpoint decoders. I.e. explicitly
decommitting the region causes all decode state, including extents, to be
freed, and FM release-dynamic-capacity events targeting idle capacity are the
only methods by which extents get released.

> +	ida_free(&cxl_extent_ida, reg_ext->dev.id);
> +	kfree(reg_ext);
> +}
> +
> +static const struct device_type region_extent_type = {
> +	.name = "extent",
> +	.release = region_extent_release,
> +	.groups = region_extent_attribute_groups,
> +};
> +
> +bool is_region_extent(struct device *dev)
> +{
> +	return dev->type == &region_extent_type;
> +}
> +EXPORT_SYMBOL_NS_GPL(is_region_extent, CXL);

What outside the core needs this export?

> +static void region_extent_unregister(void *ext)
> +{
> +	struct region_extent *reg_ext = ext;
> +
> +	dev_dbg(&reg_ext->dev, "DAX region rm extent HPA %#llx - %#llx\n",
> +		reg_ext->hpa_range.start, reg_ext->hpa_range.end);

Another use for a %par range print helper.

> +	device_unregister(&reg_ext->dev);
> +}
> +
> +int dax_region_create_ext(struct cxl_dax_region *cxlr_dax,
> +			  struct range *hpa_range,
> +			  const char *label,
> +			  struct range *dpa_range,
> +			  struct cxl_endpoint_decoder *cxled)
> +{
> +	struct region_extent *reg_ext;
> +	struct device *dev;
> +	int rc, id;
> +
> +	id = ida_alloc(&cxl_extent_ida, GFP_KERNEL);
> +	if (id < 0)
> +		return -ENOMEM;
> +
> +	reg_ext = kzalloc(sizeof(*reg_ext), GFP_KERNEL);
> +	if (!reg_ext)
> +		return -ENOMEM;

@id leak?

> +
> +	reg_ext->hpa_range = *hpa_range;
> +	reg_ext->ed_ext.dpa_range = *dpa_range;
> +	reg_ext->ed_ext.cxled = cxled;
> +	snprintf(reg_ext->label, DAX_EXTENT_LABEL_LEN, "%s", label);

another instance of the "tag as uuid" feedback.

> +
> +	dev = &reg_ext->dev;
> +	device_initialize(dev);
> +	dev->id = id;
> +	device_set_pm_not_required(dev);
> +	dev->parent = &cxlr_dax->dev;
> +	dev->type = &region_extent_type;

Lets also place these objects on the cxl_bus_type alongside endpoint
decoders etc...

> +	rc = dev_set_name(dev, "extent%d", dev->id);

...but that does require a naming convention that will not collide in
/sys/bus/cxl/devices
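
i.e. something like (sketch, combining the bus placement with the
region-scoped naming from above):

	dev->parent = &cxlr_dax->dev;
	dev->bus = &cxl_bus_type;
	dev->type = &region_extent_type;
	rc = dev_set_name(dev, "extent%d.%d", cxlr_dax->cxlr->id, id);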

> +	if (rc)
> +		goto err;
> +
> +	rc = device_add(dev);
> +	if (rc)
> +		goto err;
> +
> +	dev_dbg(dev, "DAX region extent HPA %#llx - %#llx\n",
> +		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
> +
> +	return devm_add_action_or_reset(&cxlr_dax->dev, region_extent_unregister,

It is awkward to use &cxlr_dax->dev as the devm host here. How do you
know that @cxlr_dax is or is not attached to its driver at this
point?

Likely if you want this to be the devm host this device enumeration
should be deferred to cxl_dax_region_probe() in drivers/dax/cxl.c.

> +	reg_ext);
> +
> +err:
> +	dev_err(&cxlr_dax->dev, "Failed to initialize DAX extent dev HPA %#llx - %#llx\n",
> +		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
> +
> +	put_device(dev);
> +	return rc;
> +}
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 9e33a0976828..6b00e717e42b 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -1020,6 +1020,32 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds,
>  	return rc;
>  }
>  
> +static int cxl_send_dc_cap_response(struct cxl_memdev_state *mds,
> +				    struct range *extent, int opcode)
> +{
> +	struct cxl_mbox_cmd mbox_cmd;
> +	size_t size;
> +
> +	struct cxl_mbox_dc_response *dc_res __free(kfree);
> +	size = struct_size(dc_res, extent_list, 1);
> +	dc_res = kzalloc(size, GFP_KERNEL);
> +	if (!dc_res)
> +		return -ENOMEM;
> +
> +	dc_res->extent_list[0].dpa_start = cpu_to_le64(extent->start);
> +	memset(dc_res->extent_list[0].reserved, 0, 8);
> +	dc_res->extent_list[0].length = cpu_to_le64(range_len(extent));
> +	dc_res->extent_list_size = cpu_to_le32(1);
> +
> +	mbox_cmd = (struct cxl_mbox_cmd) {
> +		.opcode = opcode,
> +		.size_in = size,
> +		.payload_in = dc_res,
> +	};
> +
> +	return cxl_internal_send_cmd(mds, &mbox_cmd);
> +}
> +
>  static struct cxl_memdev_state *
>  cxled_to_mds(struct cxl_endpoint_decoder *cxled)
>  {
> @@ -1029,6 +1055,23 @@ cxled_to_mds(struct cxl_endpoint_decoder *cxled)
>  	return container_of(cxlds, struct cxl_memdev_state, cxlds);
>  }
>  
> +void cxl_release_ed_extent(struct cxl_ed_extent *extent)

I am failing to grok this naming; see comments on 'struct cxl_ed_extent'

> +{
> +	struct cxl_endpoint_decoder *cxled = extent->cxled;
> +	struct cxl_memdev_state *mds = cxled_to_mds(cxled);
> +	struct device *dev = mds->cxlds.dev;
> +	int rc;
> +
> +	dev_dbg(dev, "Releasing DC extent DPA %#llx - %#llx\n",
> +		extent->dpa_range.start, extent->dpa_range.end);
> +
> +	rc = cxl_send_dc_cap_response(mds, &extent->dpa_range, CXL_MBOX_OP_RELEASE_DC);
> +	if (rc)
> +		dev_dbg(dev, "Failed to respond releasing extent DPA %#llx - %#llx; %d\n",
> +			extent->dpa_range.start, extent->dpa_range.end, rc);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_release_ed_extent, CXL);
> +
>  static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
>  				    enum cxl_event_log_type type)
>  {
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 3e563ab29afe..7635ff109578 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -1450,11 +1450,81 @@ static int cxl_region_validate_position(struct cxl_region *cxlr,
>  	return 0;
>  }
>  
> +static int extent_check_overlap(struct device *dev, void *arg)
> +{
> +	struct range *new_range = arg;
> +	struct region_extent *ext;
> +
> +	if (!is_region_extent(dev))
> +		return 0;
> +
> +	ext = to_region_extent(dev);
> +	return range_overlaps(&ext->hpa_range, new_range);
> +}
> +
> +static int extent_overlaps(struct cxl_dax_region *cxlr_dax,
> +			   struct range *hpa_range)
> +{
> +	struct device *dev __free(put_device) =
> +		device_find_child(&cxlr_dax->dev, hpa_range, extent_check_overlap);
> +
> +	if (dev)
> +		return -EINVAL;
> +	return 0;
> +}
> +
>  /* Callers are expected to ensure cxled has been attached to a region */
>  int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled,
>  			  struct cxl_dc_extent *dc_extent)
>  {
> -	return 0;

...and here is the danger of predefining stub functions in one patch and
filling them in later. The validation of extents needed to be moved
earlier in the flow, closer to cxl_region_attach() time, and review of
dax_region_create_ext() identified it needed to be later in the flow.

Empty stubs like this run the risk of not having enough context to
justify when and where they are called.

> +	struct cxl_region *cxlr = cxled->cxld.region;
> +	struct range ext_dpa_range, ext_hpa_range;
> +	struct device *dev = &cxlr->dev;
> +	resource_size_t dpa_offset, hpa;
> +
> +	/*
> +	 * Interleave ways == 1 means this corresponds to a 1:1 mapping between
> +	 * device extents and DAX region extents.  Future implementations
> +	 * should hold DC region extents here until the full dax region extent
> +	 * can be realized.
> +	 */
> +	if (cxlr->params.interleave_ways != 1) {
> +		dev_err(dev, "Interleaving DC not supported\n");
> +		return -EINVAL;
> +	}
> +
> +	ext_dpa_range = (struct range) {
> +		.start = le64_to_cpu(dc_extent->start_dpa),
> +		.end = le64_to_cpu(dc_extent->start_dpa) +
> +			le64_to_cpu(dc_extent->length) - 1,
> +	};
> +
> +	dev_dbg(dev, "Adding DC extent DPA %#llx - %#llx\n",
> +		ext_dpa_range.start, ext_dpa_range.end);
> +
> +	/*
> +	 * Without interleave...
> +	 * HPA offset == DPA offset
> +	 * ... but do the math anyway

The full math would walk the extents of all targets in the
region_extent.

> +	 */
> +	dpa_offset = ext_dpa_range.start - cxled->dpa_res->start;
> +	hpa = cxled->cxld.hpa_range.start + dpa_offset;
> +
> +	ext_hpa_range = (struct range) {
> +		.start = hpa - cxlr->cxlr_dax->hpa_range.start,
> +		.end = ext_hpa_range.start + range_len(&ext_dpa_range) - 1,
> +	};
> +
> +	if (extent_overlaps(cxlr->cxlr_dax, &ext_hpa_range))
> +		return -EINVAL;
> +
> +	dev_dbg(dev, "Realizing region extent at HPA %#llx - %#llx\n",
> +		ext_hpa_range.start, ext_hpa_range.end);
> +
> +	return dax_region_create_ext(cxlr->cxlr_dax, &ext_hpa_range,
> +				     (char *)dc_extent->tag,
> +				     &ext_dpa_range,
> +				     cxled);
>  }
>  
>  static int cxl_region_attach_position(struct cxl_region *cxlr,
> @@ -2684,6 +2754,7 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
>  
>  	dev = &cxlr_dax->dev;
>  	cxlr_dax->cxlr = cxlr;
> +	cxlr->cxlr_dax = cxlr_dax;
>  	device_initialize(dev);
>  	lockdep_set_class(&dev->mutex, &cxl_dax_region_key);
>  	device_set_pm_not_required(dev);
> @@ -2799,7 +2870,10 @@ static int cxl_region_read_extents(struct cxl_region *cxlr)
>  static void cxlr_dax_unregister(void *_cxlr_dax)
>  {
>  	struct cxl_dax_region *cxlr_dax = _cxlr_dax;
> +	struct cxl_region *cxlr = cxlr_dax->cxlr;
>  
> +	cxlr->cxlr_dax = NULL;
> +	cxlr_dax->cxlr = NULL;
>  	device_unregister(&cxlr_dax->dev);
>  }
>  
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index d585f5fdd3ae..5379ad7f5852 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -564,6 +564,7 @@ struct cxl_region_params {
>   * @type: Endpoint decoder target type
>   * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown
>   * @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge
> + * @cxlr_dax: (for DC regions) cached copy of CXL DAX bridge
>   * @flags: Region state flags
>   * @params: active + config params for the region
>   */
> @@ -574,6 +575,7 @@ struct cxl_region {
>  	enum cxl_decoder_type type;
>  	struct cxl_nvdimm_bridge *cxl_nvb;
>  	struct cxl_pmem_region *cxlr_pmem;
> +	struct cxl_dax_region *cxlr_dax;
>  	unsigned long flags;
>  	struct cxl_region_params params;
>  };
> @@ -617,6 +619,41 @@ struct cxl_dax_region {
>  	struct range hpa_range;
>  };
>  
> +/**
> + * struct cxl_ed_extent - Extent within an endpoint decoder
> + * @dpa_range: DPA range this extent covers within the decoder
> + * @cxled: reference to the endpoint decoder
> + */
> +struct cxl_ed_extent {

Why is _ed_ in the name. It feels like 'struct cxl_extent' is the lowest
level "extent" type to worry about, and a 'struct cxl_region_extent is
an object that represents an interleave-set of extents in HPA space.

> +	struct range dpa_range;
> +	struct cxl_endpoint_decoder *cxled;
> +};
> +void cxl_release_ed_extent(struct cxl_ed_extent *extent);
> +
> +/**
> + * struct region_extent - CXL DAX region extent
> + * @dev: device representing this extent
> + * @hpa_range: HPA range of this extent
> + * @label: label of the extent
> + * @ed_ext: Endpoint decoder extent which backs this extent
> + */
> +#define DAX_EXTENT_LABEL_LEN 64
> +struct region_extent {
> +	struct device dev;
> +	struct range hpa_range;
> +	char label[DAX_EXTENT_LABEL_LEN];
> +	struct cxl_ed_extent ed_ext;

This should always be an array, even if interleaves > 1 are not
supported in this initial enabling it should be an x1 interleave array
from day 1.
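
A sketch of that day-1 layout (with the 'struct cxl_extent' rename from above
and a uuid_t tag, both assumptions at this point):

	struct region_extent {
		struct device dev;
		struct range hpa_range;
		uuid_t tag;
		int nr_targets;			/* == interleave ways; 1 for now */
		struct cxl_extent targets[];	/* one entry per position */
	};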

> +};
> +
> +int dax_region_create_ext(struct cxl_dax_region *cxlr_dax,
> +			  struct range *hpa_range,
> +			  const char *label,
> +			  struct range *dpa_range,
> +			  struct cxl_endpoint_decoder *cxled);
> +
> +bool is_region_extent(struct device *dev);
> +#define to_region_extent(dev) container_of(dev, struct region_extent, dev)

All the other to_<object>() helpers do runtime object type-safety which
is why all the is_<object>() helpers exist.

Maybe a future patch needs these to be used outside the core, but it seems
a premature export at this point.
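
For reference, a type-checked helper following the existing to_cxl_region()
pattern would look like (sketch):

	static struct region_extent *to_region_extent(struct device *dev)
	{
		if (dev_WARN_ONCE(dev, !is_region_extent(dev),
				  "not a region_extent device\n"))
			return NULL;

		return container_of(dev, struct region_extent, dev);
	}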
diff mbox series

Patch

diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
index 9259bcc6773c..35c5c76bfcf1 100644
--- a/drivers/cxl/core/Makefile
+++ b/drivers/cxl/core/Makefile
@@ -14,5 +14,6 @@  cxl_core-y += pci.o
 cxl_core-y += hdm.o
 cxl_core-y += pmu.o
 cxl_core-y += cdat.o
+cxl_core-y += extent.o
 cxl_core-$(CONFIG_TRACING) += trace.o
 cxl_core-$(CONFIG_CXL_REGION) += region.o
diff --git a/drivers/cxl/core/extent.c b/drivers/cxl/core/extent.c
new file mode 100644
index 000000000000..487c220f1c3c
--- /dev/null
+++ b/drivers/cxl/core/extent.c
@@ -0,0 +1,133 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <cxl.h>
+
+static DEFINE_IDA(cxl_extent_ida);
+
+static ssize_t offset_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct region_extent *reg_ext = to_region_extent(dev);
+
+	return sysfs_emit(buf, "%pa\n", &reg_ext->hpa_range.start);
+}
+static DEVICE_ATTR_RO(offset);
+
+static ssize_t length_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct region_extent *reg_ext = to_region_extent(dev);
+	u64 length = range_len(&reg_ext->hpa_range);
+
+	return sysfs_emit(buf, "%pa\n", &length);
+}
+static DEVICE_ATTR_RO(length);
+
+static ssize_t label_show(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct region_extent *reg_ext = to_region_extent(dev);
+
+	return sysfs_emit(buf, "%s\n", reg_ext->label);
+}
+static DEVICE_ATTR_RO(label);
+
+static struct attribute *region_extent_attrs[] = {
+	&dev_attr_offset.attr,
+	&dev_attr_length.attr,
+	&dev_attr_label.attr,
+	NULL,
+};
+
+static const struct attribute_group region_extent_attribute_group = {
+	.attrs = region_extent_attrs,
+};
+
+static const struct attribute_group *region_extent_attribute_groups[] = {
+	&region_extent_attribute_group,
+	NULL,
+};
+
+static void region_extent_release(struct device *dev)
+{
+	struct region_extent *reg_ext = to_region_extent(dev);
+
+	cxl_release_ed_extent(&reg_ext->ed_ext);
+	ida_free(&cxl_extent_ida, reg_ext->dev.id);
+	kfree(reg_ext);
+}
+
+static const struct device_type region_extent_type = {
+	.name = "extent",
+	.release = region_extent_release,
+	.groups = region_extent_attribute_groups,
+};
+
+bool is_region_extent(struct device *dev)
+{
+	return dev->type == &region_extent_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_region_extent, CXL);
+
+static void region_extent_unregister(void *ext)
+{
+	struct region_extent *reg_ext = ext;
+
+	dev_dbg(&reg_ext->dev, "DAX region rm extent HPA %#llx - %#llx\n",
+		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
+	device_unregister(&reg_ext->dev);
+}
+
+int dax_region_create_ext(struct cxl_dax_region *cxlr_dax,
+			  struct range *hpa_range,
+			  const char *label,
+			  struct range *dpa_range,
+			  struct cxl_endpoint_decoder *cxled)
+{
+	struct region_extent *reg_ext;
+	struct device *dev;
+	int rc, id;
+
+	id = ida_alloc(&cxl_extent_ida, GFP_KERNEL);
+	if (id < 0)
+		return -ENOMEM;
+
+	reg_ext = kzalloc(sizeof(*reg_ext), GFP_KERNEL);
+	if (!reg_ext)
+		return -ENOMEM;
+
+	reg_ext->hpa_range = *hpa_range;
+	reg_ext->ed_ext.dpa_range = *dpa_range;
+	reg_ext->ed_ext.cxled = cxled;
+	snprintf(reg_ext->label, DAX_EXTENT_LABEL_LEN, "%s", label);
+
+	dev = &reg_ext->dev;
+	device_initialize(dev);
+	dev->id = id;
+	device_set_pm_not_required(dev);
+	dev->parent = &cxlr_dax->dev;
+	dev->type = &region_extent_type;
+	rc = dev_set_name(dev, "extent%d", dev->id);
+	if (rc)
+		goto err;
+
+	rc = device_add(dev);
+	if (rc)
+		goto err;
+
+	dev_dbg(dev, "DAX region extent HPA %#llx - %#llx\n",
+		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
+
+	return devm_add_action_or_reset(&cxlr_dax->dev, region_extent_unregister,
+	reg_ext);
+
+err:
+	dev_err(&cxlr_dax->dev, "Failed to initialize DAX extent dev HPA %#llx - %#llx\n",
+		reg_ext->hpa_range.start, reg_ext->hpa_range.end);
+
+	put_device(dev);
+	return rc;
+}
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 9e33a0976828..6b00e717e42b 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -1020,6 +1020,32 @@  static int cxl_clear_event_record(struct cxl_memdev_state *mds,
 	return rc;
 }
 
+static int cxl_send_dc_cap_response(struct cxl_memdev_state *mds,
+				    struct range *extent, int opcode)
+{
+	struct cxl_mbox_cmd mbox_cmd;
+	size_t size;
+
+	struct cxl_mbox_dc_response *dc_res __free(kfree);
+	size = struct_size(dc_res, extent_list, 1);
+	dc_res = kzalloc(size, GFP_KERNEL);
+	if (!dc_res)
+		return -ENOMEM;
+
+	dc_res->extent_list[0].dpa_start = cpu_to_le64(extent->start);
+	memset(dc_res->extent_list[0].reserved, 0, 8);
+	dc_res->extent_list[0].length = cpu_to_le64(range_len(extent));
+	dc_res->extent_list_size = cpu_to_le32(1);
+
+	mbox_cmd = (struct cxl_mbox_cmd) {
+		.opcode = opcode,
+		.size_in = size,
+		.payload_in = dc_res,
+	};
+
+	return cxl_internal_send_cmd(mds, &mbox_cmd);
+}
+
 static struct cxl_memdev_state *
 cxled_to_mds(struct cxl_endpoint_decoder *cxled)
 {
@@ -1029,6 +1055,23 @@  cxled_to_mds(struct cxl_endpoint_decoder *cxled)
 	return container_of(cxlds, struct cxl_memdev_state, cxlds);
 }
 
+void cxl_release_ed_extent(struct cxl_ed_extent *extent)
+{
+	struct cxl_endpoint_decoder *cxled = extent->cxled;
+	struct cxl_memdev_state *mds = cxled_to_mds(cxled);
+	struct device *dev = mds->cxlds.dev;
+	int rc;
+
+	dev_dbg(dev, "Releasing DC extent DPA %#llx - %#llx\n",
+		extent->dpa_range.start, extent->dpa_range.end);
+
+	rc = cxl_send_dc_cap_response(mds, &extent->dpa_range, CXL_MBOX_OP_RELEASE_DC);
+	if (rc)
+		dev_dbg(dev, "Failed to respond releasing extent DPA %#llx - %#llx; %d\n",
+			extent->dpa_range.start, extent->dpa_range.end, rc);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_release_ed_extent, CXL);
+
 static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
 				    enum cxl_event_log_type type)
 {
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 3e563ab29afe..7635ff109578 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -1450,11 +1450,81 @@  static int cxl_region_validate_position(struct cxl_region *cxlr,
 	return 0;
 }
 
+static int extent_check_overlap(struct device *dev, void *arg)
+{
+	struct range *new_range = arg;
+	struct region_extent *ext;
+
+	if (!is_region_extent(dev))
+		return 0;
+
+	ext = to_region_extent(dev);
+	return range_overlaps(&ext->hpa_range, new_range);
+}
+
+static int extent_overlaps(struct cxl_dax_region *cxlr_dax,
+			   struct range *hpa_range)
+{
+	struct device *dev __free(put_device) =
+		device_find_child(&cxlr_dax->dev, hpa_range, extent_check_overlap);
+
+	if (dev)
+		return -EINVAL;
+	return 0;
+}
+
 /* Callers are expected to ensure cxled has been attached to a region */
 int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled,
 			  struct cxl_dc_extent *dc_extent)
 {
-	return 0;
+	struct cxl_region *cxlr = cxled->cxld.region;
+	struct range ext_dpa_range, ext_hpa_range;
+	struct device *dev = &cxlr->dev;
+	resource_size_t dpa_offset, hpa;
+
+	/*
+	 * Interleave ways == 1 means this corresponds to a 1:1 mapping between
+	 * device extents and DAX region extents.  Future implementations
+	 * should hold DC region extents here until the full dax region extent
+	 * can be realized.
+	 */
+	if (cxlr->params.interleave_ways != 1) {
+		dev_err(dev, "Interleaving DC not supported\n");
+		return -EINVAL;
+	}
+
+	ext_dpa_range = (struct range) {
+		.start = le64_to_cpu(dc_extent->start_dpa),
+		.end = le64_to_cpu(dc_extent->start_dpa) +
+			le64_to_cpu(dc_extent->length) - 1,
+	};
+
+	dev_dbg(dev, "Adding DC extent DPA %#llx - %#llx\n",
+		ext_dpa_range.start, ext_dpa_range.end);
+
+	/*
+	 * Without interleave...
+	 * HPA offset == DPA offset
+	 * ... but do the math anyway
+	 */
+	dpa_offset = ext_dpa_range.start - cxled->dpa_res->start;
+	hpa = cxled->cxld.hpa_range.start + dpa_offset;
+
+	ext_hpa_range = (struct range) {
+		.start = hpa - cxlr->cxlr_dax->hpa_range.start,
+		.end = ext_hpa_range.start + range_len(&ext_dpa_range) - 1,
+	};
+
+	if (extent_overlaps(cxlr->cxlr_dax, &ext_hpa_range))
+		return -EINVAL;
+
+	dev_dbg(dev, "Realizing region extent at HPA %#llx - %#llx\n",
+		ext_hpa_range.start, ext_hpa_range.end);
+
+	return dax_region_create_ext(cxlr->cxlr_dax, &ext_hpa_range,
+				     (char *)dc_extent->tag,
+				     &ext_dpa_range,
+				     cxled);
 }
 
 static int cxl_region_attach_position(struct cxl_region *cxlr,
@@ -2684,6 +2754,7 @@  static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
 
 	dev = &cxlr_dax->dev;
 	cxlr_dax->cxlr = cxlr;
+	cxlr->cxlr_dax = cxlr_dax;
 	device_initialize(dev);
 	lockdep_set_class(&dev->mutex, &cxl_dax_region_key);
 	device_set_pm_not_required(dev);
@@ -2799,7 +2870,10 @@  static int cxl_region_read_extents(struct cxl_region *cxlr)
 static void cxlr_dax_unregister(void *_cxlr_dax)
 {
 	struct cxl_dax_region *cxlr_dax = _cxlr_dax;
+	struct cxl_region *cxlr = cxlr_dax->cxlr;
 
+	cxlr->cxlr_dax = NULL;
+	cxlr_dax->cxlr = NULL;
 	device_unregister(&cxlr_dax->dev);
 }
 
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index d585f5fdd3ae..5379ad7f5852 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -564,6 +564,7 @@  struct cxl_region_params {
  * @type: Endpoint decoder target type
  * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown
  * @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge
+ * @cxlr_dax: (for DC regions) cached copy of CXL DAX bridge
  * @flags: Region state flags
  * @params: active + config params for the region
  */
@@ -574,6 +575,7 @@  struct cxl_region {
 	enum cxl_decoder_type type;
 	struct cxl_nvdimm_bridge *cxl_nvb;
 	struct cxl_pmem_region *cxlr_pmem;
+	struct cxl_dax_region *cxlr_dax;
 	unsigned long flags;
 	struct cxl_region_params params;
 };
@@ -617,6 +619,41 @@  struct cxl_dax_region {
 	struct range hpa_range;
 };
 
+/**
+ * struct cxl_ed_extent - Extent within an endpoint decoder
+ * @dpa_range: DPA range this extent covers within the decoder
+ * @cxled: reference to the endpoint decoder
+ */
+struct cxl_ed_extent {
+	struct range dpa_range;
+	struct cxl_endpoint_decoder *cxled;
+};
+void cxl_release_ed_extent(struct cxl_ed_extent *extent);
+
+/**
+ * struct region_extent - CXL DAX region extent
+ * @dev: device representing this extent
+ * @hpa_range: HPA range of this extent
+ * @label: label of the extent
+ * @ed_ext: Endpoint decoder extent which backs this extent
+ */
+#define DAX_EXTENT_LABEL_LEN 64
+struct region_extent {
+	struct device dev;
+	struct range hpa_range;
+	char label[DAX_EXTENT_LABEL_LEN];
+	struct cxl_ed_extent ed_ext;
+};
+
+int dax_region_create_ext(struct cxl_dax_region *cxlr_dax,
+			  struct range *hpa_range,
+			  const char *label,
+			  struct range *dpa_range,
+			  struct cxl_endpoint_decoder *cxled);
+
+bool is_region_extent(struct device *dev);
+#define to_region_extent(dev) container_of(dev, struct region_extent, dev)
+
 /**
  * struct cxl_port - logical collection of upstream port devices and
  *		     downstream port devices to construct a CXL memory
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index 030b388800f0..dc0cc1d5e6a0 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -60,6 +60,7 @@  cxl_core-y += $(CXL_CORE_SRC)/pci.o
 cxl_core-y += $(CXL_CORE_SRC)/hdm.o
 cxl_core-y += $(CXL_CORE_SRC)/pmu.o
 cxl_core-y += $(CXL_CORE_SRC)/cdat.o
+cxl_core-y += $(CXL_CORE_SRC)/extent.o
 cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
 cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
 cxl_core-y += config_check.o