Message ID | 20240324-dcd-type2-upstream-v1-14-b7b00d623625@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | DCD: Add support for Dynamic Capacity Devices (DCD) | expand |
On Sun, Mar 24, 2024 at 04:18:17PM -0700, ira.weiny@intel.com wrote: > From: Navneet Singh <navneet.singh@intel.com> > > Dynamic capacity device extents may be left in an accepted state on a > device due to an unexpected host crash. In this case creation of a new > region on top of the DC partition (region) is expected to expose those > extents for continued use. > > Once all endpoint decoders are part of a region and the region is being > realized read the device extent list. For ease of review, this patch > stops after reading the extent list and leaves realization of the region > extents to a future patch. > > Signed-off-by: Navneet Singh <navneet.singh@intel.com> > Co-developed-by: Ira Weiny <ira.weiny@intel.com> > Signed-off-by: Ira Weiny <ira.weiny@intel.com> > > --- > Changes for v1: > [iweiny: remove extent list xarray] > [iweiny: Update spec references to 3.1] > [iweiny: use struct range in extents] > [iweiny: remove all reference tracking and let regions track extents > through the extent devices.] > [djbw/Jonathan/Fan: move extent tracking to endpoint decoders] > --- > drivers/cxl/core/core.h | 9 +++ > drivers/cxl/core/mbox.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++ > drivers/cxl/core/region.c | 29 +++++++ > drivers/cxl/cxlmem.h | 49 ++++++++++++ > 4 files changed, 279 insertions(+) > > diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h > index 91abeffbe985..119b12362977 100644 > --- a/drivers/cxl/core/core.h > +++ b/drivers/cxl/core/core.h > @@ -4,6 +4,8 @@ > #ifndef __CXL_CORE_H__ > #define __CXL_CORE_H__ > > +#include <cxlmem.h> > + > extern const struct device_type cxl_nvdimm_bridge_type; > extern const struct device_type cxl_nvdimm_type; > extern const struct device_type cxl_pmu_type; > @@ -28,6 +30,8 @@ void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled); > int cxl_region_init(void); > void cxl_region_exit(void); > int cxl_get_poison_by_endpoint(struct cxl_port *port); > +int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *dc_extent); > #else > static inline int cxl_get_poison_by_endpoint(struct cxl_port *port) > { > @@ -43,6 +47,11 @@ static inline int cxl_region_init(void) > static inline void cxl_region_exit(void) > { > } > +static inline int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *dc_extent) > +{ > + return 0; > +} > #define CXL_REGION_ATTR(x) NULL > #define CXL_REGION_TYPE(x) NULL > #define SET_CXL_REGION_ATTR(x) > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > index 58b31fa47b93..9e33a0976828 100644 > --- a/drivers/cxl/core/mbox.c > +++ b/drivers/cxl/core/mbox.c > @@ -870,6 +870,53 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds) > } > EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL); > > +static int cxl_validate_extent(struct cxl_memdev_state *mds, > + struct cxl_dc_extent *dc_extent) > +{ > + struct device *dev = mds->cxlds.dev; > + uint64_t start, len; > + > + start = le64_to_cpu(dc_extent->start_dpa); > + len = le64_to_cpu(dc_extent->length); > + > + /* Extents must not cross region boundary's */ > + for (int i = 0; i < mds->nr_dc_region; i++) { > + struct cxl_dc_region_info *dcr = &mds->dc_region[i]; > + > + if (dcr->base <= start && > + (start + len) <= (dcr->base + dcr->decode_len)) { Why not use range_contains here as below? Fan > + dev_dbg(dev, "DC extent DPA %#llx - %#llx (DCR:%d:%#llx)\n", > + start, start + len - 1, i, start - dcr->base); > + return 0; > + } > + } > + > + dev_err_ratelimited(dev, > + "DC extent DPA %#llx - %#llx is not in any DC region\n", > + start, start + len - 1); > + return -EINVAL; > +} > + > +static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *extent) > +{ > + uint64_t start = le64_to_cpu(extent->start_dpa); > + uint64_t length = le64_to_cpu(extent->length); > + struct range ext_range = (struct range){ > + .start = start, > + .end = start + length - 1, > + }; > + struct range ed_range = (struct range) { > + .start = cxled->dpa_res->start, > + .end = cxled->dpa_res->end, > + }; > + > + dev_dbg(&cxled->cxld.dev, "Checking ED (%pr) for extent DPA:%#llx LEN:%#llx\n", > + cxled->dpa_res, start, length); > + > + return range_contains(&ed_range, &ext_range); > +} > + > void cxl_event_trace_record(const struct cxl_memdev *cxlmd, > enum cxl_event_log_type type, > enum cxl_event_type event_type, > @@ -973,6 +1020,15 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds, > return rc; > } > > +static struct cxl_memdev_state * > +cxled_to_mds(struct cxl_endpoint_decoder *cxled) > +{ > + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > + > + return container_of(cxlds, struct cxl_memdev_state, cxlds); > +} > + > static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, > enum cxl_event_log_type type) > { > @@ -1406,6 +1462,142 @@ int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) > } > EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); > > +static int cxl_dev_get_dc_extent_cnt(struct cxl_memdev_state *mds, > + unsigned int *extent_gen_num) > +{ > + struct cxl_mbox_get_dc_extent_in get_dc_extent; > + struct cxl_mbox_get_dc_extent_out dc_extents; > + struct cxl_mbox_cmd mbox_cmd; > + unsigned int count; > + int rc; > + > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { > + .extent_cnt = cpu_to_le32(0), > + .start_extent_index = cpu_to_le32(0), > + }; > + > + mbox_cmd = (struct cxl_mbox_cmd) { > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, > + .payload_in = &get_dc_extent, > + .size_in = sizeof(get_dc_extent), > + .size_out = sizeof(dc_extents), > + .payload_out = &dc_extents, > + .min_out = 1, > + }; > + > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > + if (rc < 0) > + return rc; > + > + count = le32_to_cpu(dc_extents.total_extent_cnt); > + *extent_gen_num = le32_to_cpu(dc_extents.extent_list_num); > + > + return count; > +} > + > +static int cxl_dev_get_dc_extents(struct cxl_endpoint_decoder *cxled, > + unsigned int start_gen_num, > + unsigned int exp_cnt) > +{ > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > + unsigned int start_index, total_read; > + struct device *dev = mds->cxlds.dev; > + struct cxl_mbox_cmd mbox_cmd; > + > + struct cxl_mbox_get_dc_extent_out *dc_extents __free(kfree) = > + kvmalloc(mds->payload_size, GFP_KERNEL); > + if (!dc_extents) > + return -ENOMEM; > + > + total_read = 0; > + start_index = 0; > + do { > + unsigned int nr_ext, total_extent_cnt, gen_num; > + struct cxl_mbox_get_dc_extent_in get_dc_extent; > + int rc; > + > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { > + .extent_cnt = cpu_to_le32(exp_cnt - start_index), > + .start_extent_index = cpu_to_le32(start_index), > + }; > + > + mbox_cmd = (struct cxl_mbox_cmd) { > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, > + .payload_in = &get_dc_extent, > + .size_in = sizeof(get_dc_extent), > + .size_out = mds->payload_size, > + .payload_out = dc_extents, > + .min_out = 1, > + }; > + > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > + if (rc < 0) > + return rc; > + > + nr_ext = le32_to_cpu(dc_extents->ret_extent_cnt); > + total_read += nr_ext; > + total_extent_cnt = le32_to_cpu(dc_extents->total_extent_cnt); > + gen_num = le32_to_cpu(dc_extents->extent_list_num); > + > + dev_dbg(dev, "Get extent list count:%d generation Num:%d\n", > + total_extent_cnt, gen_num); > + > + if (gen_num != start_gen_num || exp_cnt != total_extent_cnt) { > + dev_err(dev, "Possible incomplete extent list; gen %u != %u : cnt %u != %u\n", > + gen_num, start_gen_num, exp_cnt, total_extent_cnt); > + return -EIO; > + } > + > + for (int i = 0; i < nr_ext ; i++) { > + dev_dbg(dev, "Processing extent %d/%d\n", > + start_index + i, exp_cnt); > + rc = cxl_validate_extent(mds, &dc_extents->extent[i]); > + if (rc) > + continue; > + if (!cxl_dc_extent_in_ed(cxled, &dc_extents->extent[i])) > + continue; > + rc = cxl_ed_add_one_extent(cxled, &dc_extents->extent[i]); > + if (rc) > + return rc; > + } > + > + start_index += nr_ext; > + } while (exp_cnt > total_read); > + > + return 0; > +} > + > +/** > + * cxl_read_dc_extents() - Read any existing extents > + * @cxled: Endpoint decoder which is part of a region > + * > + * Issue the Get Dynamic Capacity Extent List command to the device > + * and add any existing extents found which belong to this decoder. > + * > + * Return: 0 if command was executed successfully, -ERRNO on error. > + */ > +int cxl_read_dc_extents(struct cxl_endpoint_decoder *cxled) > +{ > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > + struct device *dev = mds->cxlds.dev; > + unsigned int extent_gen_num; > + int rc; > + > + if (!cxl_dcd_supported(mds)) { > + dev_dbg(dev, "DCD unsupported\n"); > + return 0; > + } > + > + rc = cxl_dev_get_dc_extent_cnt(mds, &extent_gen_num); > + dev_dbg(mds->cxlds.dev, "Extent count: %d Generation Num: %d\n", > + rc, extent_gen_num); > + if (rc <= 0) /* 0 == no records found */ > + return rc; > + > + return cxl_dev_get_dc_extents(cxled, extent_gen_num, rc); > +} > +EXPORT_SYMBOL_NS_GPL(cxl_read_dc_extents, CXL); > + > static int add_dpa_res(struct device *dev, struct resource *parent, > struct resource *res, resource_size_t start, > resource_size_t size, const char *type) > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c > index 0d7b09a49dcf..3e563ab29afe 100644 > --- a/drivers/cxl/core/region.c > +++ b/drivers/cxl/core/region.c > @@ -1450,6 +1450,13 @@ static int cxl_region_validate_position(struct cxl_region *cxlr, > return 0; > } > > +/* Callers are expected to ensure cxled has been attached to a region */ > +int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *dc_extent) > +{ > + return 0; > +} > + > static int cxl_region_attach_position(struct cxl_region *cxlr, > struct cxl_root_decoder *cxlrd, > struct cxl_endpoint_decoder *cxled, > @@ -2773,6 +2780,22 @@ static int devm_cxl_add_pmem_region(struct cxl_region *cxlr) > return rc; > } > > +static int cxl_region_read_extents(struct cxl_region *cxlr) > +{ > + struct cxl_region_params *p = &cxlr->params; > + int i; > + > + for (i = 0; i < p->nr_targets; i++) { > + int rc; > + > + rc = cxl_read_dc_extents(p->targets[i]); > + if (rc) > + return rc; > + } > + > + return 0; > +} > + > static void cxlr_dax_unregister(void *_cxlr_dax) > { > struct cxl_dax_region *cxlr_dax = _cxlr_dax; > @@ -2807,6 +2830,12 @@ static int devm_cxl_add_dax_region(struct cxl_region *cxlr) > dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), > dev_name(dev)); > > + if (cxlr->mode == CXL_REGION_DC) { > + rc = cxl_region_read_extents(cxlr); > + if (rc) > + goto err; > + } > + > return devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister, > cxlr_dax); > err: > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > index 01bee6eedff3..8f2d8944d334 100644 > --- a/drivers/cxl/cxlmem.h > +++ b/drivers/cxl/cxlmem.h > @@ -604,6 +604,54 @@ enum cxl_opcode { > UUID_INIT(0xe1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19, \ > 0x40, 0x3d, 0x86) > > +/* > + * Add Dynamic Capacity Response > + * CXL rev 3.1 section 8.2.9.9.9.3; Table 8-168 & Table 8-169 > + */ > +struct cxl_mbox_dc_response { > + __le32 extent_list_size; > + u8 flags; > + u8 reserved[3]; > + struct updated_extent_list { > + __le64 dpa_start; > + __le64 length; > + u8 reserved[8]; > + } __packed extent_list[]; > +} __packed; > + > +/* > + * CXL rev 3.1 section 8.2.9.2.1.6; Table 8-51 > + */ > +#define CXL_DC_EXTENT_TAG_LEN 0x10 > +struct cxl_dc_extent { > + __le64 start_dpa; > + __le64 length; > + u8 tag[CXL_DC_EXTENT_TAG_LEN]; > + __le16 shared_extn_seq; > + u8 reserved[6]; > +} __packed; > + > +/* > + * Get Dynamic Capacity Extent List; Input Payload > + * CXL rev 3.1 section 8.2.9.9.9.2; Table 8-166 > + */ > +struct cxl_mbox_get_dc_extent_in { > + __le32 extent_cnt; > + __le32 start_extent_index; > +} __packed; > + > +/* > + * Get Dynamic Capacity Extent List; Output Payload > + * CXL rev 3.1 section 8.2.9.9.9.2; Table 8-167 > + */ > +struct cxl_mbox_get_dc_extent_out { > + __le32 ret_extent_cnt; > + __le32 total_extent_cnt; > + __le32 extent_list_num; > + u8 rsvd[4]; > + struct cxl_dc_extent extent[]; > +} __packed; > + > struct cxl_mbox_get_supported_logs { > __le16 entries; > u8 rsvd[6]; > @@ -879,6 +927,7 @@ int cxl_internal_send_cmd(struct cxl_memdev_state *mds, > struct cxl_mbox_cmd *cmd); > int cxl_dev_state_identify(struct cxl_memdev_state *mds); > int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds); > +int cxl_read_dc_extents(struct cxl_endpoint_decoder *cxled); > int cxl_await_media_ready(struct cxl_dev_state *cxlds); > int cxl_enumerate_cmds(struct cxl_memdev_state *mds); > int cxl_mem_create_range_info(struct cxl_memdev_state *mds); > > -- > 2.44.0 >
On Sun, Mar 24, 2024 at 04:18:17PM -0700, ira.weiny@intel.com wrote: > From: Navneet Singh <navneet.singh@intel.com> > > Dynamic capacity device extents may be left in an accepted state on a > device due to an unexpected host crash. In this case creation of a new > region on top of the DC partition (region) is expected to expose those > extents for continued use. > > Once all endpoint decoders are part of a region and the region is being > realized read the device extent list. For ease of review, this patch > stops after reading the extent list and leaves realization of the region > extents to a future patch. > > Signed-off-by: Navneet Singh <navneet.singh@intel.com> > Co-developed-by: Ira Weiny <ira.weiny@intel.com> > Signed-off-by: Ira Weiny <ira.weiny@intel.com> > > --- > Changes for v1: > [iweiny: remove extent list xarray] > [iweiny: Update spec references to 3.1] > [iweiny: use struct range in extents] > [iweiny: remove all reference tracking and let regions track extents > through the extent devices.] > [djbw/Jonathan/Fan: move extent tracking to endpoint decoders] > --- > drivers/cxl/core/core.h | 9 +++ > drivers/cxl/core/mbox.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++ > drivers/cxl/core/region.c | 29 +++++++ > drivers/cxl/cxlmem.h | 49 ++++++++++++ > 4 files changed, 279 insertions(+) > > diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h > index 91abeffbe985..119b12362977 100644 > --- a/drivers/cxl/core/core.h > +++ b/drivers/cxl/core/core.h > @@ -4,6 +4,8 @@ > #ifndef __CXL_CORE_H__ > #define __CXL_CORE_H__ > > +#include <cxlmem.h> > + > extern const struct device_type cxl_nvdimm_bridge_type; > extern const struct device_type cxl_nvdimm_type; > extern const struct device_type cxl_pmu_type; > @@ -28,6 +30,8 @@ void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled); > int cxl_region_init(void); > void cxl_region_exit(void); > int cxl_get_poison_by_endpoint(struct cxl_port *port); > +int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *dc_extent); > #else > static inline int cxl_get_poison_by_endpoint(struct cxl_port *port) > { > @@ -43,6 +47,11 @@ static inline int cxl_region_init(void) > static inline void cxl_region_exit(void) > { > } > +static inline int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *dc_extent) > +{ > + return 0; > +} > #define CXL_REGION_ATTR(x) NULL > #define CXL_REGION_TYPE(x) NULL > #define SET_CXL_REGION_ATTR(x) > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > index 58b31fa47b93..9e33a0976828 100644 > --- a/drivers/cxl/core/mbox.c > +++ b/drivers/cxl/core/mbox.c > @@ -870,6 +870,53 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds) > } > EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL); > > +static int cxl_validate_extent(struct cxl_memdev_state *mds, > + struct cxl_dc_extent *dc_extent) > +{ > + struct device *dev = mds->cxlds.dev; > + uint64_t start, len; > + > + start = le64_to_cpu(dc_extent->start_dpa); > + len = le64_to_cpu(dc_extent->length); > + > + /* Extents must not cross region boundary's */ > + for (int i = 0; i < mds->nr_dc_region; i++) { > + struct cxl_dc_region_info *dcr = &mds->dc_region[i]; > + > + if (dcr->base <= start && > + (start + len) <= (dcr->base + dcr->decode_len)) { > + dev_dbg(dev, "DC extent DPA %#llx - %#llx (DCR:%d:%#llx)\n", > + start, start + len - 1, i, start - dcr->base); > + return 0; > + } > + } > + > + dev_err_ratelimited(dev, > + "DC extent DPA %#llx - %#llx is not in any DC region\n", > + start, start + len - 1); > + return -EINVAL; > +} > + > +static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *extent) > +{ > + uint64_t start = le64_to_cpu(extent->start_dpa); > + uint64_t length = le64_to_cpu(extent->length); > + struct range ext_range = (struct range){ > + .start = start, > + .end = start + length - 1, > + }; > + struct range ed_range = (struct range) { > + .start = cxled->dpa_res->start, > + .end = cxled->dpa_res->end, > + }; > + > + dev_dbg(&cxled->cxld.dev, "Checking ED (%pr) for extent DPA:%#llx LEN:%#llx\n", > + cxled->dpa_res, start, length); > + > + return range_contains(&ed_range, &ext_range); > +} > + > void cxl_event_trace_record(const struct cxl_memdev *cxlmd, > enum cxl_event_log_type type, > enum cxl_event_type event_type, > @@ -973,6 +1020,15 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds, > return rc; > } > > +static struct cxl_memdev_state * > +cxled_to_mds(struct cxl_endpoint_decoder *cxled) > +{ > + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > + > + return container_of(cxlds, struct cxl_memdev_state, cxlds); > +} > + > static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, > enum cxl_event_log_type type) > { > @@ -1406,6 +1462,142 @@ int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) > } > EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); > > +static int cxl_dev_get_dc_extent_cnt(struct cxl_memdev_state *mds, > + unsigned int *extent_gen_num) > +{ > + struct cxl_mbox_get_dc_extent_in get_dc_extent; > + struct cxl_mbox_get_dc_extent_out dc_extents; > + struct cxl_mbox_cmd mbox_cmd; > + unsigned int count; > + int rc; > + > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { > + .extent_cnt = cpu_to_le32(0), > + .start_extent_index = cpu_to_le32(0), > + }; > + > + mbox_cmd = (struct cxl_mbox_cmd) { > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, > + .payload_in = &get_dc_extent, > + .size_in = sizeof(get_dc_extent), > + .size_out = sizeof(dc_extents), > + .payload_out = &dc_extents, > + .min_out = 1, > + }; > + > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > + if (rc < 0) > + return rc; > + > + count = le32_to_cpu(dc_extents.total_extent_cnt); > + *extent_gen_num = le32_to_cpu(dc_extents.extent_list_num); > + > + return count; > +} > + > +static int cxl_dev_get_dc_extents(struct cxl_endpoint_decoder *cxled, > + unsigned int start_gen_num, > + unsigned int exp_cnt) > +{ > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > + unsigned int start_index, total_read; > + struct device *dev = mds->cxlds.dev; > + struct cxl_mbox_cmd mbox_cmd; > + > + struct cxl_mbox_get_dc_extent_out *dc_extents __free(kfree) = > + kvmalloc(mds->payload_size, GFP_KERNEL); > + if (!dc_extents) > + return -ENOMEM; > + > + total_read = 0; > + start_index = 0; > + do { > + unsigned int nr_ext, total_extent_cnt, gen_num; > + struct cxl_mbox_get_dc_extent_in get_dc_extent; > + int rc; > + > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { > + .extent_cnt = cpu_to_le32(exp_cnt - start_index), > + .start_extent_index = cpu_to_le32(start_index), > + }; > + > + mbox_cmd = (struct cxl_mbox_cmd) { > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, > + .payload_in = &get_dc_extent, > + .size_in = sizeof(get_dc_extent), > + .size_out = mds->payload_size, > + .payload_out = dc_extents, > + .min_out = 1, > + }; > + > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > + if (rc < 0) > + return rc; > + > + nr_ext = le32_to_cpu(dc_extents->ret_extent_cnt); > + total_read += nr_ext; > + total_extent_cnt = le32_to_cpu(dc_extents->total_extent_cnt); > + gen_num = le32_to_cpu(dc_extents->extent_list_num); > + > + dev_dbg(dev, "Get extent list count:%d generation Num:%d\n", > + total_extent_cnt, gen_num); > + > + if (gen_num != start_gen_num || exp_cnt != total_extent_cnt) { > + dev_err(dev, "Possible incomplete extent list; gen %u != %u : cnt %u != %u\n", > + gen_num, start_gen_num, exp_cnt, total_extent_cnt); > + return -EIO; > + } > + > + for (int i = 0; i < nr_ext ; i++) { > + dev_dbg(dev, "Processing extent %d/%d\n", > + start_index + i, exp_cnt); > + rc = cxl_validate_extent(mds, &dc_extents->extent[i]); > + if (rc) > + continue; > + if (!cxl_dc_extent_in_ed(cxled, &dc_extents->extent[i])) > + continue; > + rc = cxl_ed_add_one_extent(cxled, &dc_extents->extent[i]); > + if (rc) > + return rc; > + } > + > + start_index += nr_ext; > + } while (exp_cnt > total_read); > + > + return 0; > +} > + > +/** > + * cxl_read_dc_extents() - Read any existing extents > + * @cxled: Endpoint decoder which is part of a region > + * > + * Issue the Get Dynamic Capacity Extent List command to the device > + * and add any existing extents found which belong to this decoder. > + * > + * Return: 0 if command was executed successfully, -ERRNO on error. > + */ > +int cxl_read_dc_extents(struct cxl_endpoint_decoder *cxled) > +{ > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > + struct device *dev = mds->cxlds.dev; > + unsigned int extent_gen_num; > + int rc; > + > + if (!cxl_dcd_supported(mds)) { > + dev_dbg(dev, "DCD unsupported\n"); > + return 0; > + } > + > + rc = cxl_dev_get_dc_extent_cnt(mds, &extent_gen_num); > + dev_dbg(mds->cxlds.dev, "Extent count: %d Generation Num: %d\n", > + rc, extent_gen_num); > + if (rc <= 0) /* 0 == no records found */ > + return rc; > + > + return cxl_dev_get_dc_extents(cxled, extent_gen_num, rc); Not sure about the behaviour here. From the cxl_dev_get_dc_extents implementation below, if gen_num changed or the expected extent count changed, it will return error. If I understand it correctly, if the above two values change, it means the extent list has been updated due to extent add/release since last time we read the extent list info (cxl_dev_get_dc_extent_cnt), do we need to fail the operation or try again? Fan > +} > +EXPORT_SYMBOL_NS_GPL(cxl_read_dc_extents, CXL); > + > static int add_dpa_res(struct device *dev, struct resource *parent, > struct resource *res, resource_size_t start, > resource_size_t size, const char *type) > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c > index 0d7b09a49dcf..3e563ab29afe 100644 > --- a/drivers/cxl/core/region.c > +++ b/drivers/cxl/core/region.c > @@ -1450,6 +1450,13 @@ static int cxl_region_validate_position(struct cxl_region *cxlr, > return 0; > } > > +/* Callers are expected to ensure cxled has been attached to a region */ > +int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *dc_extent) > +{ > + return 0; > +} > + > static int cxl_region_attach_position(struct cxl_region *cxlr, > struct cxl_root_decoder *cxlrd, > struct cxl_endpoint_decoder *cxled, > @@ -2773,6 +2780,22 @@ static int devm_cxl_add_pmem_region(struct cxl_region *cxlr) > return rc; > } > > +static int cxl_region_read_extents(struct cxl_region *cxlr) > +{ > + struct cxl_region_params *p = &cxlr->params; > + int i; > + > + for (i = 0; i < p->nr_targets; i++) { > + int rc; > + > + rc = cxl_read_dc_extents(p->targets[i]); > + if (rc) > + return rc; > + } > + > + return 0; > +} > + > static void cxlr_dax_unregister(void *_cxlr_dax) > { > struct cxl_dax_region *cxlr_dax = _cxlr_dax; > @@ -2807,6 +2830,12 @@ static int devm_cxl_add_dax_region(struct cxl_region *cxlr) > dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), > dev_name(dev)); > > + if (cxlr->mode == CXL_REGION_DC) { > + rc = cxl_region_read_extents(cxlr); > + if (rc) > + goto err; > + } > + > return devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister, > cxlr_dax); > err: > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > index 01bee6eedff3..8f2d8944d334 100644 > --- a/drivers/cxl/cxlmem.h > +++ b/drivers/cxl/cxlmem.h > @@ -604,6 +604,54 @@ enum cxl_opcode { > UUID_INIT(0xe1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19, \ > 0x40, 0x3d, 0x86) > > +/* > + * Add Dynamic Capacity Response > + * CXL rev 3.1 section 8.2.9.9.9.3; Table 8-168 & Table 8-169 > + */ > +struct cxl_mbox_dc_response { > + __le32 extent_list_size; > + u8 flags; > + u8 reserved[3]; > + struct updated_extent_list { > + __le64 dpa_start; > + __le64 length; > + u8 reserved[8]; > + } __packed extent_list[]; > +} __packed; > + > +/* > + * CXL rev 3.1 section 8.2.9.2.1.6; Table 8-51 > + */ > +#define CXL_DC_EXTENT_TAG_LEN 0x10 > +struct cxl_dc_extent { > + __le64 start_dpa; > + __le64 length; > + u8 tag[CXL_DC_EXTENT_TAG_LEN]; > + __le16 shared_extn_seq; > + u8 reserved[6]; > +} __packed; > + > +/* > + * Get Dynamic Capacity Extent List; Input Payload > + * CXL rev 3.1 section 8.2.9.9.9.2; Table 8-166 > + */ > +struct cxl_mbox_get_dc_extent_in { > + __le32 extent_cnt; > + __le32 start_extent_index; > +} __packed; > + > +/* > + * Get Dynamic Capacity Extent List; Output Payload > + * CXL rev 3.1 section 8.2.9.9.9.2; Table 8-167 > + */ > +struct cxl_mbox_get_dc_extent_out { > + __le32 ret_extent_cnt; > + __le32 total_extent_cnt; > + __le32 extent_list_num; > + u8 rsvd[4]; > + struct cxl_dc_extent extent[]; > +} __packed; > + > struct cxl_mbox_get_supported_logs { > __le16 entries; > u8 rsvd[6]; > @@ -879,6 +927,7 @@ int cxl_internal_send_cmd(struct cxl_memdev_state *mds, > struct cxl_mbox_cmd *cmd); > int cxl_dev_state_identify(struct cxl_memdev_state *mds); > int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds); > +int cxl_read_dc_extents(struct cxl_endpoint_decoder *cxled); > int cxl_await_media_ready(struct cxl_dev_state *cxlds); > int cxl_enumerate_cmds(struct cxl_memdev_state *mds); > int cxl_mem_create_range_info(struct cxl_memdev_state *mds); > > -- > 2.44.0 >
On 3/24/24 4:18 PM, ira.weiny@intel.com wrote: > From: Navneet Singh <navneet.singh@intel.com> > > Dynamic capacity device extents may be left in an accepted state on a > device due to an unexpected host crash. In this case creation of a new > region on top of the DC partition (region) is expected to expose those > extents for continued use. > > Once all endpoint decoders are part of a region and the region is being > realized read the device extent list. For ease of review, this patch > stops after reading the extent list and leaves realization of the region > extents to a future patch. > > Signed-off-by: Navneet Singh <navneet.singh@intel.com> > Co-developed-by: Ira Weiny <ira.weiny@intel.com> > Signed-off-by: Ira Weiny <ira.weiny@intel.com> > > --- > Changes for v1: > [iweiny: remove extent list xarray] > [iweiny: Update spec references to 3.1] > [iweiny: use struct range in extents] > [iweiny: remove all reference tracking and let regions track extents > through the extent devices.] > [djbw/Jonathan/Fan: move extent tracking to endpoint decoders] > --- > drivers/cxl/core/core.h | 9 +++ > drivers/cxl/core/mbox.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++ > drivers/cxl/core/region.c | 29 +++++++ > drivers/cxl/cxlmem.h | 49 ++++++++++++ > 4 files changed, 279 insertions(+) > > diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h > index 91abeffbe985..119b12362977 100644 > --- a/drivers/cxl/core/core.h > +++ b/drivers/cxl/core/core.h > @@ -4,6 +4,8 @@ > #ifndef __CXL_CORE_H__ > #define __CXL_CORE_H__ > > +#include <cxlmem.h> > + > extern const struct device_type cxl_nvdimm_bridge_type; > extern const struct device_type cxl_nvdimm_type; > extern const struct device_type cxl_pmu_type; > @@ -28,6 +30,8 @@ void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled); > int cxl_region_init(void); > void cxl_region_exit(void); > int cxl_get_poison_by_endpoint(struct cxl_port *port); > +int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *dc_extent); > #else > static inline int cxl_get_poison_by_endpoint(struct cxl_port *port) > { > @@ -43,6 +47,11 @@ static inline int cxl_region_init(void) > static inline void cxl_region_exit(void) > { > } > +static inline int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *dc_extent) > +{ > + return 0; > +} > #define CXL_REGION_ATTR(x) NULL > #define CXL_REGION_TYPE(x) NULL > #define SET_CXL_REGION_ATTR(x) > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > index 58b31fa47b93..9e33a0976828 100644 > --- a/drivers/cxl/core/mbox.c > +++ b/drivers/cxl/core/mbox.c > @@ -870,6 +870,53 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds) > } > EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL); > > +static int cxl_validate_extent(struct cxl_memdev_state *mds, > + struct cxl_dc_extent *dc_extent) > +{ > + struct device *dev = mds->cxlds.dev; > + uint64_t start, len; u64 > + > + start = le64_to_cpu(dc_extent->start_dpa); > + len = le64_to_cpu(dc_extent->length); > + > + /* Extents must not cross region boundary's */ > + for (int i = 0; i < mds->nr_dc_region; i++) { > + struct cxl_dc_region_info *dcr = &mds->dc_region[i]; > + > + if (dcr->base <= start && > + (start + len) <= (dcr->base + dcr->decode_len)) { Can range_contains() be used here as well? > + dev_dbg(dev, "DC extent DPA %#llx - %#llx (DCR:%d:%#llx)\n", > + start, start + len - 1, i, start - dcr->base); > + return 0; > + } > + } > + > + dev_err_ratelimited(dev, > + "DC extent DPA %#llx - %#llx is not in any DC region\n", > + start, start + len - 1); > + return -EINVAL; > +} > + > +static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled, cxl_dc_extent_in_endpoint_decoder() is more readable > + struct cxl_dc_extent *extent) > +{ > + uint64_t start = le64_to_cpu(extent->start_dpa); > + uint64_t length = le64_to_cpu(extent->length); u64 > + struct range ext_range = (struct range){ > + .start = start, > + .end = start + length - 1, > + }; > + struct range ed_range = (struct range) { > + .start = cxled->dpa_res->start, > + .end = cxled->dpa_res->end, > + }; > + > + dev_dbg(&cxled->cxld.dev, "Checking ED (%pr) for extent DPA:%#llx LEN:%#llx\n", > + cxled->dpa_res, start, length); > + > + return range_contains(&ed_range, &ext_range); > +} > + > void cxl_event_trace_record(const struct cxl_memdev *cxlmd, > enum cxl_event_log_type type, > enum cxl_event_type event_type, > @@ -973,6 +1020,15 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds, > return rc; > } > > +static struct cxl_memdev_state * > +cxled_to_mds(struct cxl_endpoint_decoder *cxled) > +{ > + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > + > + return container_of(cxlds, struct cxl_memdev_state, cxlds); > +} > + > static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, > enum cxl_event_log_type type) > { > @@ -1406,6 +1462,142 @@ int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) > } > EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); > > +static int cxl_dev_get_dc_extent_cnt(struct cxl_memdev_state *mds, cxl_dev_get_dc_extent_generation()? or spell out count DJ > + unsigned int *extent_gen_num) > +{ > + struct cxl_mbox_get_dc_extent_in get_dc_extent; > + struct cxl_mbox_get_dc_extent_out dc_extents; > + struct cxl_mbox_cmd mbox_cmd; > + unsigned int count; > + int rc; > + > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { > + .extent_cnt = cpu_to_le32(0), > + .start_extent_index = cpu_to_le32(0), > + }; > + > + mbox_cmd = (struct cxl_mbox_cmd) { > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, > + .payload_in = &get_dc_extent, > + .size_in = sizeof(get_dc_extent), > + .size_out = sizeof(dc_extents), > + .payload_out = &dc_extents, > + .min_out = 1, > + }; > + > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > + if (rc < 0) > + return rc; > + > + count = le32_to_cpu(dc_extents.total_extent_cnt); > + *extent_gen_num = le32_to_cpu(dc_extents.extent_list_num); > + > + return count; > +} > + > +static int cxl_dev_get_dc_extents(struct cxl_endpoint_decoder *cxled, > + unsigned int start_gen_num, > + unsigned int exp_cnt) > +{ > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > + unsigned int start_index, total_read; > + struct device *dev = mds->cxlds.dev; > + struct cxl_mbox_cmd mbox_cmd; > + > + struct cxl_mbox_get_dc_extent_out *dc_extents __free(kfree) = > + kvmalloc(mds->payload_size, GFP_KERNEL); > + if (!dc_extents) > + return -ENOMEM; > + > + total_read = 0; > + start_index = 0; > + do { > + unsigned int nr_ext, total_extent_cnt, gen_num; > + struct cxl_mbox_get_dc_extent_in get_dc_extent; > + int rc; > + > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { > + .extent_cnt = cpu_to_le32(exp_cnt - start_index), > + .start_extent_index = cpu_to_le32(start_index), > + }; > + > + mbox_cmd = (struct cxl_mbox_cmd) { > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, > + .payload_in = &get_dc_extent, > + .size_in = sizeof(get_dc_extent), > + .size_out = mds->payload_size, > + .payload_out = dc_extents, > + .min_out = 1, > + }; > + > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > + if (rc < 0) > + return rc; > + > + nr_ext = le32_to_cpu(dc_extents->ret_extent_cnt); > + total_read += nr_ext; > + total_extent_cnt = le32_to_cpu(dc_extents->total_extent_cnt); > + gen_num = le32_to_cpu(dc_extents->extent_list_num); > + > + dev_dbg(dev, "Get extent list count:%d generation Num:%d\n", > + total_extent_cnt, gen_num); > + > + if (gen_num != start_gen_num || exp_cnt != total_extent_cnt) { > + dev_err(dev, "Possible incomplete extent list; gen %u != %u : cnt %u != %u\n", > + gen_num, start_gen_num, exp_cnt, total_extent_cnt); > + return -EIO; > + } > + > + for (int i = 0; i < nr_ext ; i++) { > + dev_dbg(dev, "Processing extent %d/%d\n", > + start_index + i, exp_cnt); > + rc = cxl_validate_extent(mds, &dc_extents->extent[i]); > + if (rc) > + continue; > + if (!cxl_dc_extent_in_ed(cxled, &dc_extents->extent[i])) > + continue; > + rc = cxl_ed_add_one_extent(cxled, &dc_extents->extent[i]); > + if (rc) > + return rc; > + } > + > + start_index += nr_ext; > + } while (exp_cnt > total_read); > + > + return 0; > +} > + > +/** > + * cxl_read_dc_extents() - Read any existing extents > + * @cxled: Endpoint decoder which is part of a region > + * > + * Issue the Get Dynamic Capacity Extent List command to the device > + * and add any existing extents found which belong to this decoder. > + * > + * Return: 0 if command was executed successfully, -ERRNO on error. > + */ > +int cxl_read_dc_extents(struct cxl_endpoint_decoder *cxled) > +{ > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > + struct device *dev = mds->cxlds.dev; > + unsigned int extent_gen_num; > + int rc; > + > + if (!cxl_dcd_supported(mds)) { > + dev_dbg(dev, "DCD unsupported\n"); > + return 0; > + } > + > + rc = cxl_dev_get_dc_extent_cnt(mds, &extent_gen_num); > + dev_dbg(mds->cxlds.dev, "Extent count: %d Generation Num: %d\n", > + rc, extent_gen_num); > + if (rc <= 0) /* 0 == no records found */ > + return rc; > + > + return cxl_dev_get_dc_extents(cxled, extent_gen_num, rc); > +} > +EXPORT_SYMBOL_NS_GPL(cxl_read_dc_extents, CXL); > + > static int add_dpa_res(struct device *dev, struct resource *parent, > struct resource *res, resource_size_t start, > resource_size_t size, const char *type) > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c > index 0d7b09a49dcf..3e563ab29afe 100644 > --- a/drivers/cxl/core/region.c > +++ b/drivers/cxl/core/region.c > @@ -1450,6 +1450,13 @@ static int cxl_region_validate_position(struct cxl_region *cxlr, > return 0; > } > > +/* Callers are expected to ensure cxled has been attached to a region */ > +int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *dc_extent) > +{ > + return 0; > +} > + > static int cxl_region_attach_position(struct cxl_region *cxlr, > struct cxl_root_decoder *cxlrd, > struct cxl_endpoint_decoder *cxled, > @@ -2773,6 +2780,22 @@ static int devm_cxl_add_pmem_region(struct cxl_region *cxlr) > return rc; > } > > +static int cxl_region_read_extents(struct cxl_region *cxlr) > +{ > + struct cxl_region_params *p = &cxlr->params; > + int i; > + > + for (i = 0; i < p->nr_targets; i++) { > + int rc; > + > + rc = cxl_read_dc_extents(p->targets[i]); > + if (rc) > + return rc; > + } > + > + return 0; > +} > + > static void cxlr_dax_unregister(void *_cxlr_dax) > { > struct cxl_dax_region *cxlr_dax = _cxlr_dax; > @@ -2807,6 +2830,12 @@ static int devm_cxl_add_dax_region(struct cxl_region *cxlr) > dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), > dev_name(dev)); > > + if (cxlr->mode == CXL_REGION_DC) { > + rc = cxl_region_read_extents(cxlr); > + if (rc) > + goto err; > + } > + > return devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister, > cxlr_dax); > err: > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > index 01bee6eedff3..8f2d8944d334 100644 > --- a/drivers/cxl/cxlmem.h > +++ b/drivers/cxl/cxlmem.h > @@ -604,6 +604,54 @@ enum cxl_opcode { > UUID_INIT(0xe1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19, \ > 0x40, 0x3d, 0x86) > > +/* > + * Add Dynamic Capacity Response > + * CXL rev 3.1 section 8.2.9.9.9.3; Table 8-168 & Table 8-169 > + */ > +struct cxl_mbox_dc_response { > + __le32 extent_list_size; > + u8 flags; > + u8 reserved[3]; > + struct updated_extent_list { > + __le64 dpa_start; > + __le64 length; > + u8 reserved[8]; > + } __packed extent_list[]; > +} __packed; > + > +/* > + * CXL rev 3.1 section 8.2.9.2.1.6; Table 8-51 > + */ > +#define CXL_DC_EXTENT_TAG_LEN 0x10 > +struct cxl_dc_extent { > + __le64 start_dpa; > + __le64 length; > + u8 tag[CXL_DC_EXTENT_TAG_LEN]; > + __le16 shared_extn_seq; > + u8 reserved[6]; > +} __packed; > + > +/* > + * Get Dynamic Capacity Extent List; Input Payload > + * CXL rev 3.1 section 8.2.9.9.9.2; Table 8-166 > + */ > +struct cxl_mbox_get_dc_extent_in { > + __le32 extent_cnt; > + __le32 start_extent_index; > +} __packed; > + > +/* > + * Get Dynamic Capacity Extent List; Output Payload > + * CXL rev 3.1 section 8.2.9.9.9.2; Table 8-167 > + */ > +struct cxl_mbox_get_dc_extent_out { > + __le32 ret_extent_cnt; > + __le32 total_extent_cnt; > + __le32 extent_list_num; > + u8 rsvd[4]; > + struct cxl_dc_extent extent[]; > +} __packed; > + > struct cxl_mbox_get_supported_logs { > __le16 entries; > u8 rsvd[6]; > @@ -879,6 +927,7 @@ int cxl_internal_send_cmd(struct cxl_memdev_state *mds, > struct cxl_mbox_cmd *cmd); > int cxl_dev_state_identify(struct cxl_memdev_state *mds); > int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds); > +int cxl_read_dc_extents(struct cxl_endpoint_decoder *cxled); > int cxl_await_media_ready(struct cxl_dev_state *cxlds); > int cxl_enumerate_cmds(struct cxl_memdev_state *mds); > int cxl_mem_create_range_info(struct cxl_memdev_state *mds); >
On 3/25/24 00:18, ira.weiny@intel.com wrote: > From: Navneet Singh <navneet.singh@intel.com> > > Dynamic capacity device extents may be left in an accepted state on a > device due to an unexpected host crash. In this case creation of a new > region on top of the DC partition (region) is expected to expose those > extents for continued use. > > Once all endpoint decoders are part of a region and the region is being > realized read the device extent list. For ease of review, this patch > stops after reading the extent list and leaves realization of the region > extents to a future patch. > > Signed-off-by: Navneet Singh <navneet.singh@intel.com> > Co-developed-by: Ira Weiny <ira.weiny@intel.com> > Signed-off-by: Ira Weiny <ira.weiny@intel.com> > > --- > Changes for v1: > [iweiny: remove extent list xarray] > [iweiny: Update spec references to 3.1] > [iweiny: use struct range in extents] > [iweiny: remove all reference tracking and let regions track extents > through the extent devices.] > [djbw/Jonathan/Fan: move extent tracking to endpoint decoders] > --- > drivers/cxl/core/core.h | 9 +++ > drivers/cxl/core/mbox.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++ > drivers/cxl/core/region.c | 29 +++++++ > drivers/cxl/cxlmem.h | 49 ++++++++++++ > 4 files changed, 279 insertions(+) > > diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h > index 91abeffbe985..119b12362977 100644 > --- a/drivers/cxl/core/core.h > +++ b/drivers/cxl/core/core.h > @@ -4,6 +4,8 @@ > #ifndef __CXL_CORE_H__ > #define __CXL_CORE_H__ > > +#include <cxlmem.h> > + > extern const struct device_type cxl_nvdimm_bridge_type; > extern const struct device_type cxl_nvdimm_type; > extern const struct device_type cxl_pmu_type; > @@ -28,6 +30,8 @@ void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled); > int cxl_region_init(void); > void cxl_region_exit(void); > int cxl_get_poison_by_endpoint(struct cxl_port *port); > +int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *dc_extent); > #else > static inline int cxl_get_poison_by_endpoint(struct cxl_port *port) > { > @@ -43,6 +47,11 @@ static inline int cxl_region_init(void) > static inline void cxl_region_exit(void) > { > } > +static inline int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *dc_extent) > +{ > + return 0; > +} > #define CXL_REGION_ATTR(x) NULL > #define CXL_REGION_TYPE(x) NULL > #define SET_CXL_REGION_ATTR(x) > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > index 58b31fa47b93..9e33a0976828 100644 > --- a/drivers/cxl/core/mbox.c > +++ b/drivers/cxl/core/mbox.c > @@ -870,6 +870,53 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds) > } > EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL); > > +static int cxl_validate_extent(struct cxl_memdev_state *mds, > + struct cxl_dc_extent *dc_extent) > +{ > + struct device *dev = mds->cxlds.dev; > + uint64_t start, len; > + > + start = le64_to_cpu(dc_extent->start_dpa); > + len = le64_to_cpu(dc_extent->length); > + > + /* Extents must not cross region boundary's */ > + for (int i = 0; i < mds->nr_dc_region; i++) { > + struct cxl_dc_region_info *dcr = &mds->dc_region[i]; > + > + if (dcr->base <= start && > + (start + len) <= (dcr->base + dcr->decode_len)) { > + dev_dbg(dev, "DC extent DPA %#llx - %#llx (DCR:%d:%#llx)\n", > + start, start + len - 1, i, start - dcr->base); > + return 0; > + } > + } > + > + dev_err_ratelimited(dev, > + "DC extent DPA %#llx - %#llx is not in any DC region\n", > + start, start + len - 1); > + return -EINVAL; > +} > + > +static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *extent) > +{ > + uint64_t start = le64_to_cpu(extent->start_dpa); > + uint64_t length = le64_to_cpu(extent->length); > + struct range ext_range = (struct range){ > + .start = start, > + .end = start + length - 1, > + }; > + struct range ed_range = (struct range) { > + .start = cxled->dpa_res->start, > + .end = cxled->dpa_res->end, > + }; > + > + dev_dbg(&cxled->cxld.dev, "Checking ED (%pr) for extent DPA:%#llx LEN:%#llx\n", > + cxled->dpa_res, start, length); > + > + return range_contains(&ed_range, &ext_range); > +} > + > void cxl_event_trace_record(const struct cxl_memdev *cxlmd, > enum cxl_event_log_type type, > enum cxl_event_type event_type, > @@ -973,6 +1020,15 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds, > return rc; > } > > +static struct cxl_memdev_state * > +cxled_to_mds(struct cxl_endpoint_decoder *cxled) > +{ > + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > + > + return container_of(cxlds, struct cxl_memdev_state, cxlds); > +} > + > static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, > enum cxl_event_log_type type) > { > @@ -1406,6 +1462,142 @@ int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) > } > EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); > > +static int cxl_dev_get_dc_extent_cnt(struct cxl_memdev_state *mds, > + unsigned int *extent_gen_num) > +{ > + struct cxl_mbox_get_dc_extent_in get_dc_extent; > + struct cxl_mbox_get_dc_extent_out dc_extents; > + struct cxl_mbox_cmd mbox_cmd; > + unsigned int count; > + int rc; > + > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { > + .extent_cnt = cpu_to_le32(0), > + .start_extent_index = cpu_to_le32(0), > + }; > + > + mbox_cmd = (struct cxl_mbox_cmd) { > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, > + .payload_in = &get_dc_extent, > + .size_in = sizeof(get_dc_extent), > + .size_out = sizeof(dc_extents), > + .payload_out = &dc_extents, > + .min_out = 1, > + }; > + > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > + if (rc < 0) > + return rc; > + > + count = le32_to_cpu(dc_extents.total_extent_cnt); > + *extent_gen_num = le32_to_cpu(dc_extents.extent_list_num); > + > + return count; > +} > + > +static int cxl_dev_get_dc_extents(struct cxl_endpoint_decoder *cxled, > + unsigned int start_gen_num, > + unsigned int exp_cnt) > +{ > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > + unsigned int start_index, total_read; > + struct device *dev = mds->cxlds.dev; > + struct cxl_mbox_cmd mbox_cmd; > + > + struct cxl_mbox_get_dc_extent_out *dc_extents __free(kfree) = > + kvmalloc(mds->payload_size, GFP_KERNEL); > + if (!dc_extents) > + return -ENOMEM; > + > + total_read = 0; > + start_index = 0; > + do { > + unsigned int nr_ext, total_extent_cnt, gen_num; > + struct cxl_mbox_get_dc_extent_in get_dc_extent; > + int rc; > + > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { > + .extent_cnt = cpu_to_le32(exp_cnt - start_index), > + .start_extent_index = cpu_to_le32(start_index), > + }; > + > + mbox_cmd = (struct cxl_mbox_cmd) { > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, > + .payload_in = &get_dc_extent, > + .size_in = sizeof(get_dc_extent), > + .size_out = mds->payload_size, > + .payload_out = dc_extents, > + .min_out = 1, > + }; > + > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > + if (rc < 0) > + return rc; > + > + nr_ext = le32_to_cpu(dc_extents->ret_extent_cnt); > + total_read += nr_ext; > + total_extent_cnt = le32_to_cpu(dc_extents->total_extent_cnt); > + gen_num = le32_to_cpu(dc_extents->extent_list_num); > + > + dev_dbg(dev, "Get extent list count:%d generation Num:%d\n", > + total_extent_cnt, gen_num); > + > + if (gen_num != start_gen_num || exp_cnt != total_extent_cnt) { > + dev_err(dev, "Possible incomplete extent list; gen %u != %u : cnt %u != %u\n", > + gen_num, start_gen_num, exp_cnt, total_extent_cnt); > + return -EIO; > + } > + > + for (int i = 0; i < nr_ext ; i++) { > + dev_dbg(dev, "Processing extent %d/%d\n", > + start_index + i, exp_cnt); > + rc = cxl_validate_extent(mds, &dc_extents->extent[i]); > + if (rc) > + continue; > + if (!cxl_dc_extent_in_ed(cxled, &dc_extents->extent[i])) > + continue; > + rc = cxl_ed_add_one_extent(cxled, &dc_extents->extent[i]); > + if (rc) > + return rc; > + } > + > + start_index += nr_ext; > + } while (exp_cnt > total_read); > + > + return 0; > +} > + > +/** > + * cxl_read_dc_extents() - Read any existing extents > + * @cxled: Endpoint decoder which is part of a region > + * > + * Issue the Get Dynamic Capacity Extent List command to the device > + * and add any existing extents found which belong to this decoder. > + * > + * Return: 0 if command was executed successfully, -ERRNO on error. > + */ > +int cxl_read_dc_extents(struct cxl_endpoint_decoder *cxled) > +{ > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > + struct device *dev = mds->cxlds.dev; > + unsigned int extent_gen_num; > + int rc; > + > + if (!cxl_dcd_supported(mds)) { > + dev_dbg(dev, "DCD unsupported\n"); > + return 0; > + } > + > + rc = cxl_dev_get_dc_extent_cnt(mds, &extent_gen_num); > + dev_dbg(mds->cxlds.dev, "Extent count: %d Generation Num: %d\n", > + rc, extent_gen_num); > + if (rc <= 0) /* 0 == no records found */ > + return rc; > + > + return cxl_dev_get_dc_extents(cxled, extent_gen_num, rc); Is it necessary to spend a device interaction to get the generation number? Couldn't cxl_dev_get_dc_extents obtain that as part of the first call to the device, and then use it to ensure the consistency of any remaining calls, if any are necessary? Thanks, Jørgen > +} > +EXPORT_SYMBOL_NS_GPL(cxl_read_dc_extents, CXL); > + > static int add_dpa_res(struct device *dev, struct resource *parent, > struct resource *res, resource_size_t start, > resource_size_t size, const char *type) > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c > index 0d7b09a49dcf..3e563ab29afe 100644 > --- a/drivers/cxl/core/region.c > +++ b/drivers/cxl/core/region.c > @@ -1450,6 +1450,13 @@ static int cxl_region_validate_position(struct cxl_region *cxlr, > return 0; > } > > +/* Callers are expected to ensure cxled has been attached to a region */ > +int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *dc_extent) > +{ > + return 0; > +} > + > static int cxl_region_attach_position(struct cxl_region *cxlr, > struct cxl_root_decoder *cxlrd, > struct cxl_endpoint_decoder *cxled, > @@ -2773,6 +2780,22 @@ static int devm_cxl_add_pmem_region(struct cxl_region *cxlr) > return rc; > } > > +static int cxl_region_read_extents(struct cxl_region *cxlr) > +{ > + struct cxl_region_params *p = &cxlr->params; > + int i; > + > + for (i = 0; i < p->nr_targets; i++) { > + int rc; > + > + rc = cxl_read_dc_extents(p->targets[i]); > + if (rc) > + return rc; > + } > + > + return 0; > +} > + > static void cxlr_dax_unregister(void *_cxlr_dax) > { > struct cxl_dax_region *cxlr_dax = _cxlr_dax; > @@ -2807,6 +2830,12 @@ static int devm_cxl_add_dax_region(struct cxl_region *cxlr) > dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), > dev_name(dev)); > > + if (cxlr->mode == CXL_REGION_DC) { > + rc = cxl_region_read_extents(cxlr); > + if (rc) > + goto err; > + } > + > return devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister, > cxlr_dax); > err: > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > index 01bee6eedff3..8f2d8944d334 100644 > --- a/drivers/cxl/cxlmem.h > +++ b/drivers/cxl/cxlmem.h > @@ -604,6 +604,54 @@ enum cxl_opcode { > UUID_INIT(0xe1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19, \ > 0x40, 0x3d, 0x86) > > +/* > + * Add Dynamic Capacity Response > + * CXL rev 3.1 section 8.2.9.9.9.3; Table 8-168 & Table 8-169 > + */ > +struct cxl_mbox_dc_response { > + __le32 extent_list_size; > + u8 flags; > + u8 reserved[3]; > + struct updated_extent_list { > + __le64 dpa_start; > + __le64 length; > + u8 reserved[8]; > + } __packed extent_list[]; > +} __packed; > + > +/* > + * CXL rev 3.1 section 8.2.9.2.1.6; Table 8-51 > + */ > +#define CXL_DC_EXTENT_TAG_LEN 0x10 > +struct cxl_dc_extent { > + __le64 start_dpa; > + __le64 length; > + u8 tag[CXL_DC_EXTENT_TAG_LEN]; > + __le16 shared_extn_seq; > + u8 reserved[6]; > +} __packed; > + > +/* > + * Get Dynamic Capacity Extent List; Input Payload > + * CXL rev 3.1 section 8.2.9.9.9.2; Table 8-166 > + */ > +struct cxl_mbox_get_dc_extent_in { > + __le32 extent_cnt; > + __le32 start_extent_index; > +} __packed; > + > +/* > + * Get Dynamic Capacity Extent List; Output Payload > + * CXL rev 3.1 section 8.2.9.9.9.2; Table 8-167 > + */ > +struct cxl_mbox_get_dc_extent_out { > + __le32 ret_extent_cnt; > + __le32 total_extent_cnt; > + __le32 extent_list_num; > + u8 rsvd[4]; > + struct cxl_dc_extent extent[]; > +} __packed; > + > struct cxl_mbox_get_supported_logs { > __le16 entries; > u8 rsvd[6]; > @@ -879,6 +927,7 @@ int cxl_internal_send_cmd(struct cxl_memdev_state *mds, > struct cxl_mbox_cmd *cmd); > int cxl_dev_state_identify(struct cxl_memdev_state *mds); > int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds); > +int cxl_read_dc_extents(struct cxl_endpoint_decoder *cxled); > int cxl_await_media_ready(struct cxl_dev_state *cxlds); > int cxl_enumerate_cmds(struct cxl_memdev_state *mds); > int cxl_mem_create_range_info(struct cxl_memdev_state *mds); > > -- > 2.44.0 > >
On Sun, 24 Mar 2024 16:18:17 -0700 ira.weiny@intel.com wrote: > From: Navneet Singh <navneet.singh@intel.com> > > Dynamic capacity device extents may be left in an accepted state on a > device due to an unexpected host crash. In this case creation of a new > region on top of the DC partition (region) is expected to expose those > extents for continued use. > > Once all endpoint decoders are part of a region and the region is being > realized read the device extent list. For ease of review, this patch > stops after reading the extent list and leaves realization of the region > extents to a future patch. > > Signed-off-by: Navneet Singh <navneet.singh@intel.com> > Co-developed-by: Ira Weiny <ira.weiny@intel.com> > Signed-off-by: Ira Weiny <ira.weiny@intel.com> A few things inline. J > static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, > enum cxl_event_log_type type) > { > @@ -1406,6 +1462,142 @@ int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) > } > EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); > > +static int cxl_dev_get_dc_extent_cnt(struct cxl_memdev_state *mds, > + unsigned int *extent_gen_num) > +{ > + struct cxl_mbox_get_dc_extent_in get_dc_extent; > + struct cxl_mbox_get_dc_extent_out dc_extents; > + struct cxl_mbox_cmd mbox_cmd; > + unsigned int count; > + int rc; > + > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { > + .extent_cnt = cpu_to_le32(0), > + .start_extent_index = cpu_to_le32(0), > + }; > + > + mbox_cmd = (struct cxl_mbox_cmd) { > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, > + .payload_in = &get_dc_extent, > + .size_in = sizeof(get_dc_extent), > + .size_out = sizeof(dc_extents), > + .payload_out = &dc_extents, > + .min_out = 1, Why 1? > + }; > + > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > + if (rc < 0) > + return rc; > + > + count = le32_to_cpu(dc_extents.total_extent_cnt); > + *extent_gen_num = le32_to_cpu(dc_extents.extent_list_num); > + > + return count; > +} > + > +static int cxl_dev_get_dc_extents(struct cxl_endpoint_decoder *cxled, > + unsigned int start_gen_num, > + unsigned int exp_cnt) > +{ > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > + unsigned int start_index, total_read; > + struct device *dev = mds->cxlds.dev; > + struct cxl_mbox_cmd mbox_cmd; > + > + struct cxl_mbox_get_dc_extent_out *dc_extents __free(kfree) = > + kvmalloc(mds->payload_size, GFP_KERNEL); > + if (!dc_extents) > + return -ENOMEM; > + > + total_read = 0; > + start_index = 0; > + do { > + unsigned int nr_ext, total_extent_cnt, gen_num; > + struct cxl_mbox_get_dc_extent_in get_dc_extent; > + int rc; > + > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { > + .extent_cnt = cpu_to_le32(exp_cnt - start_index), > + .start_extent_index = cpu_to_le32(start_index), > + }; > + > + mbox_cmd = (struct cxl_mbox_cmd) { > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, > + .payload_in = &get_dc_extent, > + .size_in = sizeof(get_dc_extent), > + .size_out = mds->payload_size, > + .payload_out = dc_extents, > + .min_out = 1, Why 1? > + }; > + > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > + if (rc < 0) > + return rc; > + > + nr_ext = le32_to_cpu(dc_extents->ret_extent_cnt); > + total_read += nr_ext; > + total_extent_cnt = le32_to_cpu(dc_extents->total_extent_cnt); > + gen_num = le32_to_cpu(dc_extents->extent_list_num); > + > + dev_dbg(dev, "Get extent list count:%d generation Num:%d\n", > + total_extent_cnt, gen_num); > + > + if (gen_num != start_gen_num || exp_cnt != total_extent_cnt) { > + dev_err(dev, "Possible incomplete extent list; gen %u != %u : cnt %u != %u\n", > + gen_num, start_gen_num, exp_cnt, total_extent_cnt); > + return -EIO; > + } > + > + for (int i = 0; i < nr_ext ; i++) { > + dev_dbg(dev, "Processing extent %d/%d\n", > + start_index + i, exp_cnt); > + rc = cxl_validate_extent(mds, &dc_extents->extent[i]); > + if (rc) > + continue; A blank line here > + if (!cxl_dc_extent_in_ed(cxled, &dc_extents->extent[i])) > + continue; and here would make this more readable I think. > + rc = cxl_ed_add_one_extent(cxled, &dc_extents->extent[i]); > + if (rc) > + return rc; > + } > + > + start_index += nr_ext; > + } while (exp_cnt > total_read); > + > + return 0; > +} > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c > index 0d7b09a49dcf..3e563ab29afe 100644 > --- a/drivers/cxl/core/region.c > +++ b/drivers/cxl/core/region.c > > +static int cxl_region_read_extents(struct cxl_region *cxlr) > +{ > + struct cxl_region_params *p = &cxlr->params; > + int i; > + > + for (i = 0; i < p->nr_targets; i++) { > + int rc; Maybe worth giving up early if we see nr_targets > 1? If nothing else it saves people trying to figure out what happens if we reboot into an older kernel that doesn't support interleave (from one that does) > + > + rc = cxl_read_dc_extents(p->targets[i]); > + if (rc) > + return rc; > + } > + > + return 0; > +} > + > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > index 01bee6eedff3..8f2d8944d334 100644 > --- a/drivers/cxl/cxlmem.h > +++ b/drivers/cxl/cxlmem.h > +/* > + * CXL rev 3.1 section 8.2.9.2.1.6; Table 8-51 Throw a table name or section name into the reference so people can find it in CXL rNext. > + */ > +#define CXL_DC_EXTENT_TAG_LEN 0x10 > +struct cxl_dc_extent { > + __le64 start_dpa; > + __le64 length; > + u8 tag[CXL_DC_EXTENT_TAG_LEN]; > + __le16 shared_extn_seq; > + u8 reserved[6]; > +} __packed; > + > +/* > + * Get Dynamic Capacity Extent List; Output Payload > + * CXL rev 3.1 section 8.2.9.9.9.2; Table 8-167 > + */ > +struct cxl_mbox_get_dc_extent_out { > + __le32 ret_extent_cnt; > + __le32 total_extent_cnt; > + __le32 extent_list_num; Naming isn't that clear given generation bit missing. > + u8 rsvd[4]; > + struct cxl_dc_extent extent[]; > +} __packed; > +
> +static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *extent) > +{ > + uint64_t start = le64_to_cpu(extent->start_dpa); > + uint64_t length = le64_to_cpu(extent->length); > + struct range ext_range = (struct range){ space ) { > + .start = start, > + .end = start + length - 1, > + }; > + struct range ed_range = (struct range) { > + .start = cxled->dpa_res->start, > + .end = cxled->dpa_res->end, > + };
fan wrote: > On Sun, Mar 24, 2024 at 04:18:17PM -0700, ira.weiny@intel.com wrote: > > From: Navneet Singh <navneet.singh@intel.com> > > [snip] > > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > > index 58b31fa47b93..9e33a0976828 100644 > > --- a/drivers/cxl/core/mbox.c > > +++ b/drivers/cxl/core/mbox.c > > @@ -870,6 +870,53 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds) > > } > > EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL); > > > > +static int cxl_validate_extent(struct cxl_memdev_state *mds, > > + struct cxl_dc_extent *dc_extent) > > +{ > > + struct device *dev = mds->cxlds.dev; > > + uint64_t start, len; > > + > > + start = le64_to_cpu(dc_extent->start_dpa); > > + len = le64_to_cpu(dc_extent->length); > > + > > + /* Extents must not cross region boundary's */ > > + for (int i = 0; i < mds->nr_dc_region; i++) { > > + struct cxl_dc_region_info *dcr = &mds->dc_region[i]; > > + > > + if (dcr->base <= start && > > + (start + len) <= (dcr->base + dcr->decode_len)) { > > Why not use range_contains here as below? Because when I initially wrote this I (or perhaps Navneet, I can't remember) we were not using ranges. This version I tried to convert to ranges and I missed this one. Good catch! Ira
Dave Jiang wrote: > > > On 3/24/24 4:18 PM, ira.weiny@intel.com wrote: > > From: Navneet Singh <navneet.singh@intel.com> > > [snip] > > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > > index 58b31fa47b93..9e33a0976828 100644 > > --- a/drivers/cxl/core/mbox.c > > +++ b/drivers/cxl/core/mbox.c > > @@ -870,6 +870,53 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds) > > } > > EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL); > > > > +static int cxl_validate_extent(struct cxl_memdev_state *mds, > > + struct cxl_dc_extent *dc_extent) > > +{ > > + struct device *dev = mds->cxlds.dev; > > + uint64_t start, len; > u64 > Yep > > > + > > + start = le64_to_cpu(dc_extent->start_dpa); > > + len = le64_to_cpu(dc_extent->length); > > + > > + /* Extents must not cross region boundary's */ > > + for (int i = 0; i < mds->nr_dc_region; i++) { > > + struct cxl_dc_region_info *dcr = &mds->dc_region[i]; > > + > > + if (dcr->base <= start && > > + (start + len) <= (dcr->base + dcr->decode_len)) { > > Can range_contains() be used here as well? Yep and done. > > > + dev_dbg(dev, "DC extent DPA %#llx - %#llx (DCR:%d:%#llx)\n", > > + start, start + len - 1, i, start - dcr->base); > > + return 0; > > + } > > + } > > + > > + dev_err_ratelimited(dev, > > + "DC extent DPA %#llx - %#llx is not in any DC region\n", > > + start, start + len - 1); > > + return -EINVAL; > > +} > > + > > +static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled, > > cxl_dc_extent_in_endpoint_decoder() is more readable Sure but we are getting awful close to Java naming there... j/k ;-) Changed. > > > + struct cxl_dc_extent *extent) > > +{ > > + uint64_t start = le64_to_cpu(extent->start_dpa); > > + uint64_t length = le64_to_cpu(extent->length); > u64 > Yep [snip] > > > > +static struct cxl_memdev_state * > > +cxled_to_mds(struct cxl_endpoint_decoder *cxled) > > +{ > > + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > > + > > + return container_of(cxlds, struct cxl_memdev_state, cxlds); > > +} > > + > > static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, > > enum cxl_event_log_type type) > > { > > @@ -1406,6 +1462,142 @@ int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) > > } > > EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); > > > > +static int cxl_dev_get_dc_extent_cnt(struct cxl_memdev_state *mds, > > cxl_dev_get_dc_extent_generation()? or spell out count I'll spell out count because that is the primary goal. The generation number is just to be able to check each query to ensure the list does not change whilst reading. Ira
fan wrote: > On Sun, Mar 24, 2024 at 04:18:17PM -0700, ira.weiny@intel.com wrote: > > From: Navneet Singh <navneet.singh@intel.com> > > [snip] > > + > > +/** > > + * cxl_read_dc_extents() - Read any existing extents > > + * @cxled: Endpoint decoder which is part of a region > > + * > > + * Issue the Get Dynamic Capacity Extent List command to the device > > + * and add any existing extents found which belong to this decoder. > > + * > > + * Return: 0 if command was executed successfully, -ERRNO on error. > > + */ > > +int cxl_read_dc_extents(struct cxl_endpoint_decoder *cxled) > > +{ > > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > > + struct device *dev = mds->cxlds.dev; > > + unsigned int extent_gen_num; > > + int rc; > > + > > + if (!cxl_dcd_supported(mds)) { > > + dev_dbg(dev, "DCD unsupported\n"); > > + return 0; > > + } > > + > > + rc = cxl_dev_get_dc_extent_cnt(mds, &extent_gen_num); > > + dev_dbg(mds->cxlds.dev, "Extent count: %d Generation Num: %d\n", > > + rc, extent_gen_num); > > + if (rc <= 0) /* 0 == no records found */ > > + return rc; > > + > > + return cxl_dev_get_dc_extents(cxled, extent_gen_num, rc); > > Not sure about the behaviour here. From the cxl_dev_get_dc_extents > implementation below, if gen_num changed or the expected extent count > changed, it will return error. yep. > If I understand it correctly, if the above two values change, it means > the extent list has been updated due to extent add/release since last > time we read the extent list info (cxl_dev_get_dc_extent_cnt), do we > need to fail the operation or try again? The original series was safe to fail the operation because the list was read on memory device driver load and not when the regions were created. This is an oversight with the new architecture. Now that regions query for the list independent of other regions being active the list could indeed change during this operation. :-/ So a retry is necessary. Let me work on the retry because some of the extents may have been surfaced during the list processing which means a re-read of the list will need to properly ignore those already found. Or some other tracking needs to be put in place. Ira
Jørgen Hansen wrote: > On 3/25/24 00:18, ira.weiny@intel.com wrote: > > From: Navneet Singh <navneet.singh@intel.com> > > > > Dynamic capacity device extents may be left in an accepted state on a > > device due to an unexpected host crash. In this case creation of a new > > region on top of the DC partition (region) is expected to expose those > > extents for continued use. > > > > Once all endpoint decoders are part of a region and the region is being > > realized read the device extent list. For ease of review, this patch > > stops after reading the extent list and leaves realization of the region > > extents to a future patch. > > > > Signed-off-by: Navneet Singh <navneet.singh@intel.com> > > Co-developed-by: Ira Weiny <ira.weiny@intel.com> > > Signed-off-by: Ira Weiny <ira.weiny@intel.com> > > > > --- > > Changes for v1: > > [iweiny: remove extent list xarray] > > [iweiny: Update spec references to 3.1] > > [iweiny: use struct range in extents] > > [iweiny: remove all reference tracking and let regions track extents > > through the extent devices.] > > [djbw/Jonathan/Fan: move extent tracking to endpoint decoders] > > --- > > drivers/cxl/core/core.h | 9 +++ > > drivers/cxl/core/mbox.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++ > > drivers/cxl/core/region.c | 29 +++++++ > > drivers/cxl/cxlmem.h | 49 ++++++++++++ > > 4 files changed, 279 insertions(+) > > > > diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h > > index 91abeffbe985..119b12362977 100644 > > --- a/drivers/cxl/core/core.h > > +++ b/drivers/cxl/core/core.h > > @@ -4,6 +4,8 @@ > > #ifndef __CXL_CORE_H__ > > #define __CXL_CORE_H__ > > > > +#include <cxlmem.h> > > + > > extern const struct device_type cxl_nvdimm_bridge_type; > > extern const struct device_type cxl_nvdimm_type; > > extern const struct device_type cxl_pmu_type; > > @@ -28,6 +30,8 @@ void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled); > > int cxl_region_init(void); > > void cxl_region_exit(void); > > int cxl_get_poison_by_endpoint(struct cxl_port *port); > > +int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > > + struct cxl_dc_extent *dc_extent); > > #else > > static inline int cxl_get_poison_by_endpoint(struct cxl_port *port) > > { > > @@ -43,6 +47,11 @@ static inline int cxl_region_init(void) > > static inline void cxl_region_exit(void) > > { > > } > > +static inline int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > > + struct cxl_dc_extent *dc_extent) > > +{ > > + return 0; > > +} > > #define CXL_REGION_ATTR(x) NULL > > #define CXL_REGION_TYPE(x) NULL > > #define SET_CXL_REGION_ATTR(x) > > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > > index 58b31fa47b93..9e33a0976828 100644 > > --- a/drivers/cxl/core/mbox.c > > +++ b/drivers/cxl/core/mbox.c > > @@ -870,6 +870,53 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds) > > } > > EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL); > > > > +static int cxl_validate_extent(struct cxl_memdev_state *mds, > > + struct cxl_dc_extent *dc_extent) > > +{ > > + struct device *dev = mds->cxlds.dev; > > + uint64_t start, len; > > + > > + start = le64_to_cpu(dc_extent->start_dpa); > > + len = le64_to_cpu(dc_extent->length); > > + > > + /* Extents must not cross region boundary's */ > > + for (int i = 0; i < mds->nr_dc_region; i++) { > > + struct cxl_dc_region_info *dcr = &mds->dc_region[i]; > > + > > + if (dcr->base <= start && > > + (start + len) <= (dcr->base + dcr->decode_len)) { > > + dev_dbg(dev, "DC extent DPA %#llx - %#llx (DCR:%d:%#llx)\n", > > + start, start + len - 1, i, start - dcr->base); > > + return 0; > > + } > > + } > > + > > + dev_err_ratelimited(dev, > > + "DC extent DPA %#llx - %#llx is not in any DC region\n", > > + start, start + len - 1); > > + return -EINVAL; > > +} > > + > > +static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled, > > + struct cxl_dc_extent *extent) > > +{ > > + uint64_t start = le64_to_cpu(extent->start_dpa); > > + uint64_t length = le64_to_cpu(extent->length); > > + struct range ext_range = (struct range){ > > + .start = start, > > + .end = start + length - 1, > > + }; > > + struct range ed_range = (struct range) { > > + .start = cxled->dpa_res->start, > > + .end = cxled->dpa_res->end, > > + }; > > + > > + dev_dbg(&cxled->cxld.dev, "Checking ED (%pr) for extent DPA:%#llx LEN:%#llx\n", > > + cxled->dpa_res, start, length); > > + > > + return range_contains(&ed_range, &ext_range); > > +} > > + > > void cxl_event_trace_record(const struct cxl_memdev *cxlmd, > > enum cxl_event_log_type type, > > enum cxl_event_type event_type, > > @@ -973,6 +1020,15 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds, > > return rc; > > } > > > > +static struct cxl_memdev_state * > > +cxled_to_mds(struct cxl_endpoint_decoder *cxled) > > +{ > > + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > > + > > + return container_of(cxlds, struct cxl_memdev_state, cxlds); > > +} > > + > > static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, > > enum cxl_event_log_type type) > > { > > @@ -1406,6 +1462,142 @@ int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) > > } > > EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); > > > > +static int cxl_dev_get_dc_extent_cnt(struct cxl_memdev_state *mds, > > + unsigned int *extent_gen_num) > > +{ > > + struct cxl_mbox_get_dc_extent_in get_dc_extent; > > + struct cxl_mbox_get_dc_extent_out dc_extents; > > + struct cxl_mbox_cmd mbox_cmd; > > + unsigned int count; > > + int rc; > > + > > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { > > + .extent_cnt = cpu_to_le32(0), > > + .start_extent_index = cpu_to_le32(0), > > + }; > > + > > + mbox_cmd = (struct cxl_mbox_cmd) { > > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, > > + .payload_in = &get_dc_extent, > > + .size_in = sizeof(get_dc_extent), > > + .size_out = sizeof(dc_extents), > > + .payload_out = &dc_extents, > > + .min_out = 1, > > + }; > > + > > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > > + if (rc < 0) > > + return rc; > > + > > + count = le32_to_cpu(dc_extents.total_extent_cnt); > > + *extent_gen_num = le32_to_cpu(dc_extents.extent_list_num); > > + > > + return count; > > +} > > + > > +static int cxl_dev_get_dc_extents(struct cxl_endpoint_decoder *cxled, > > + unsigned int start_gen_num, > > + unsigned int exp_cnt) > > +{ > > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > > + unsigned int start_index, total_read; > > + struct device *dev = mds->cxlds.dev; > > + struct cxl_mbox_cmd mbox_cmd; > > + > > + struct cxl_mbox_get_dc_extent_out *dc_extents __free(kfree) = > > + kvmalloc(mds->payload_size, GFP_KERNEL); > > + if (!dc_extents) > > + return -ENOMEM; > > + > > + total_read = 0; > > + start_index = 0; > > + do { > > + unsigned int nr_ext, total_extent_cnt, gen_num; > > + struct cxl_mbox_get_dc_extent_in get_dc_extent; > > + int rc; > > + > > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { > > + .extent_cnt = cpu_to_le32(exp_cnt - start_index), > > + .start_extent_index = cpu_to_le32(start_index), > > + }; > > + > > + mbox_cmd = (struct cxl_mbox_cmd) { > > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, > > + .payload_in = &get_dc_extent, > > + .size_in = sizeof(get_dc_extent), > > + .size_out = mds->payload_size, > > + .payload_out = dc_extents, > > + .min_out = 1, > > + }; > > + > > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > > + if (rc < 0) > > + return rc; > > + > > + nr_ext = le32_to_cpu(dc_extents->ret_extent_cnt); > > + total_read += nr_ext; > > + total_extent_cnt = le32_to_cpu(dc_extents->total_extent_cnt); > > + gen_num = le32_to_cpu(dc_extents->extent_list_num); > > + > > + dev_dbg(dev, "Get extent list count:%d generation Num:%d\n", > > + total_extent_cnt, gen_num); > > + > > + if (gen_num != start_gen_num || exp_cnt != total_extent_cnt) { > > + dev_err(dev, "Possible incomplete extent list; gen %u != %u : cnt %u != %u\n", > > + gen_num, start_gen_num, exp_cnt, total_extent_cnt); > > + return -EIO; > > + } > > + > > + for (int i = 0; i < nr_ext ; i++) { > > + dev_dbg(dev, "Processing extent %d/%d\n", > > + start_index + i, exp_cnt); > > + rc = cxl_validate_extent(mds, &dc_extents->extent[i]); > > + if (rc) > > + continue; > > + if (!cxl_dc_extent_in_ed(cxled, &dc_extents->extent[i])) > > + continue; > > + rc = cxl_ed_add_one_extent(cxled, &dc_extents->extent[i]); > > + if (rc) > > + return rc; > > + } > > + > > + start_index += nr_ext; > > + } while (exp_cnt > total_read); > > + > > + return 0; > > +} > > + > > +/** > > + * cxl_read_dc_extents() - Read any existing extents > > + * @cxled: Endpoint decoder which is part of a region > > + * > > + * Issue the Get Dynamic Capacity Extent List command to the device > > + * and add any existing extents found which belong to this decoder. > > + * > > + * Return: 0 if command was executed successfully, -ERRNO on error. > > + */ > > +int cxl_read_dc_extents(struct cxl_endpoint_decoder *cxled) > > +{ > > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > > + struct device *dev = mds->cxlds.dev; > > + unsigned int extent_gen_num; > > + int rc; > > + > > + if (!cxl_dcd_supported(mds)) { > > + dev_dbg(dev, "DCD unsupported\n"); > > + return 0; > > + } > > + > > + rc = cxl_dev_get_dc_extent_cnt(mds, &extent_gen_num); > > + dev_dbg(mds->cxlds.dev, "Extent count: %d Generation Num: %d\n", > > + rc, extent_gen_num); > > + if (rc <= 0) /* 0 == no records found */ > > + return rc; > > + > > + return cxl_dev_get_dc_extents(cxled, extent_gen_num, rc); > > Is it necessary to spend a device interaction to get the generation > number? Not completely necessary no. > Couldn't cxl_dev_get_dc_extents obtain that as part of the first > call to the device, and then use it to ensure the consistency of any > remaining calls, if any are necessary? ... However, this is not a critical path and the extra query to hardware makes the code a bit easier to follow IMO. There are 2 distinct steps. 1) get expected number of extents and the current generation number 2) query for that number whilst checking that the gen number is stable Doing what you suggest results in special casing the first query within the loop which is kind of ugly IMO. That said, with the new retry requirement Fan pointed out I'll consider this in that new algorithm context. Ira
On Sun, Mar 24, 2024 at 04:18:17PM -0700, Ira Weiny wrote: > From: Navneet Singh <navneet.singh@intel.com> > > Dynamic capacity device extents may be left in an accepted state on a > device due to an unexpected host crash. In this case creation of a new > region on top of the DC partition (region) is expected to expose those > extents for continued use. > > Once all endpoint decoders are part of a region and the region is being > realized read the device extent list. For ease of review, this patch > stops after reading the extent list and leaves realization of the region > extents to a future patch. > > Signed-off-by: Navneet Singh <navneet.singh@intel.com> > Co-developed-by: Ira Weiny <ira.weiny@intel.com> > Signed-off-by: Ira Weiny <ira.weiny@intel.com> > > --- > Changes for v1: > [iweiny: remove extent list xarray] > [iweiny: Update spec references to 3.1] > [iweiny: use struct range in extents] > [iweiny: remove all reference tracking and let regions track extents > through the extent devices.] > [djbw/Jonathan/Fan: move extent tracking to endpoint decoders] > --- > drivers/cxl/core/core.h | 9 +++ > drivers/cxl/core/mbox.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++ > drivers/cxl/core/region.c | 29 +++++++ > drivers/cxl/cxlmem.h | 49 ++++++++++++ > 4 files changed, 279 insertions(+) snip > > +static int cxl_validate_extent(struct cxl_memdev_state *mds, > + struct cxl_dc_extent *dc_extent) > +{ > + struct device *dev = mds->cxlds.dev; > + uint64_t start, len; > + > + start = le64_to_cpu(dc_extent->start_dpa); > + len = le64_to_cpu(dc_extent->length); > + > + /* Extents must not cross region boundary's */ > + for (int i = 0; i < mds->nr_dc_region; i++) { > + struct cxl_dc_region_info *dcr = &mds->dc_region[i]; > + I think you already got range_contains suggestion > + if (dcr->base <= start && > + (start + len) <= (dcr->base + dcr->decode_len)) { > + dev_dbg(dev, "DC extent DPA %#llx - %#llx (DCR:%d:%#llx)\n", > + start, start + len - 1, i, start - dcr->base); > + return 0; > + } > + } > + > + dev_err_ratelimited(dev, > + "DC extent DPA %#llx - %#llx is not in any DC region\n", > + start, start + len - 1); Need some clarification. Isn't this checking that the extent is fully contained within a region? And then, it dev_err's if not fully contained. There is not actually a check and an error message about crossing region boundary's as the comment suggests. Maybe update the comment to reflect the work.. like: /* Extent must be fully contained in a region */ > + return -EINVAL; > +} > + > +static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *extent) > +{ > + uint64_t start = le64_to_cpu(extent->start_dpa); > + uint64_t length = le64_to_cpu(extent->length); u64 here (and in other places too) > + struct range ext_range = (struct range){ > + .start = start, > + .end = start + length - 1, > + }; > + struct range ed_range = (struct range) { > + .start = cxled->dpa_res->start, > + .end = cxled->dpa_res->end, > + }; > + > + dev_dbg(&cxled->cxld.dev, "Checking ED (%pr) for extent DPA:%#llx LEN:%#llx\n", > + cxled->dpa_res, start, length); > + > + return range_contains(&ed_range, &ext_range); > +} > + > void cxl_event_trace_record(const struct cxl_memdev *cxlmd, > enum cxl_event_log_type type, > enum cxl_event_type event_type, > @@ -973,6 +1020,15 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds, > return rc; > } > > +static struct cxl_memdev_state * > +cxled_to_mds(struct cxl_endpoint_decoder *cxled) > +{ > + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > + > + return container_of(cxlds, struct cxl_memdev_state, cxlds); > +} > + That's nice! > static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, > enum cxl_event_log_type type) > { > @@ -1406,6 +1462,142 @@ int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) > } > EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); > > +static int cxl_dev_get_dc_extent_cnt(struct cxl_memdev_state *mds, Perhaps drop the _dev_ from this (and other, like below) function names. > +static int cxl_dev_get_dc_extents(struct cxl_endpoint_decoder *cxled, snip > +/** > + * cxl_read_dc_extents() - Read any existing extents > + * @cxled: Endpoint decoder which is part of a region > + * > + * Issue the Get Dynamic Capacity Extent List command to the device > + * and add any existing extents found which belong to this decoder. > + * > + * Return: 0 if command was executed successfully, -ERRNO on error. > + */ > +int cxl_read_dc_extents(struct cxl_endpoint_decoder *cxled) > +{ > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > + struct device *dev = mds->cxlds.dev; > + unsigned int extent_gen_num; > + int rc; > + > + if (!cxl_dcd_supported(mds)) { > + dev_dbg(dev, "DCD unsupported\n"); > + return 0; > + } > + > + rc = cxl_dev_get_dc_extent_cnt(mds, &extent_gen_num); > + dev_dbg(mds->cxlds.dev, "Extent count: %d Generation Num: %d\n", Either use the *dev defined in both dev_dbg()'s or get rid of it use mds->cxlds.dev. > + rc, extent_gen_num); > + if (rc <= 0) /* 0 == no records found */ > + return rc; > + > + return cxl_dev_get_dc_extents(cxled, extent_gen_num, rc); > +} > +EXPORT_SYMBOL_NS_GPL(cxl_read_dc_extents, CXL); > + snip > > +static int cxl_region_read_extents(struct cxl_region *cxlr) > +{ How about: static int cxl_region_read_extents(struct cxl_region_params *p) > + struct cxl_region_params *p = &cxlr->params; > + int i; > + > + for (i = 0; i < p->nr_targets; i++) { > + int rc; > + > + rc = cxl_read_dc_extents(p->targets[i]); > + if (rc) > + return rc; > + } > + > + return 0; > +} snip to end
ira.weiny@ wrote: > From: Navneet Singh <navneet.singh@intel.com> > > Dynamic capacity device extents may be left in an accepted state on a > device due to an unexpected host crash. In this case creation of a new > region on top of the DC partition (region) is expected to expose those > extents for continued use. > > Once all endpoint decoders are part of a region and the region is being > realized read the device extent list. For ease of review, this patch > stops after reading the extent list and leaves realization of the region > extents to a future patch. > > Signed-off-by: Navneet Singh <navneet.singh@intel.com> > Co-developed-by: Ira Weiny <ira.weiny@intel.com> > Signed-off-by: Ira Weiny <ira.weiny@intel.com> > > --- > Changes for v1: > [iweiny: remove extent list xarray] > [iweiny: Update spec references to 3.1] > [iweiny: use struct range in extents] > [iweiny: remove all reference tracking and let regions track extents > through the extent devices.] > [djbw/Jonathan/Fan: move extent tracking to endpoint decoders] > --- > drivers/cxl/core/core.h | 9 +++ > drivers/cxl/core/mbox.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++ > drivers/cxl/core/region.c | 29 +++++++ > drivers/cxl/cxlmem.h | 49 ++++++++++++ > 4 files changed, 279 insertions(+) > > diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h > index 91abeffbe985..119b12362977 100644 > --- a/drivers/cxl/core/core.h > +++ b/drivers/cxl/core/core.h > @@ -4,6 +4,8 @@ > #ifndef __CXL_CORE_H__ > #define __CXL_CORE_H__ > > +#include <cxlmem.h> > + > extern const struct device_type cxl_nvdimm_bridge_type; > extern const struct device_type cxl_nvdimm_type; > extern const struct device_type cxl_pmu_type; > @@ -28,6 +30,8 @@ void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled); > int cxl_region_init(void); > void cxl_region_exit(void); > int cxl_get_poison_by_endpoint(struct cxl_port *port); > +int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *dc_extent); There are already functions called "cxled_", so lets not invent the "cxl_ed_" prefix. [..] > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > index 58b31fa47b93..9e33a0976828 100644 > --- a/drivers/cxl/core/mbox.c > +++ b/drivers/cxl/core/mbox.c > @@ -870,6 +870,53 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds) > } > EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL); > > +static int cxl_validate_extent(struct cxl_memdev_state *mds, > + struct cxl_dc_extent *dc_extent) > +{ > + struct device *dev = mds->cxlds.dev; > + uint64_t start, len; > + > + start = le64_to_cpu(dc_extent->start_dpa); > + len = le64_to_cpu(dc_extent->length); > + > + /* Extents must not cross region boundary's */ > + for (int i = 0; i < mds->nr_dc_region; i++) { > + struct cxl_dc_region_info *dcr = &mds->dc_region[i]; > + > + if (dcr->base <= start && > + (start + len) <= (dcr->base + dcr->decode_len)) { > + dev_dbg(dev, "DC extent DPA %#llx - %#llx (DCR:%d:%#llx)\n", > + start, start + len - 1, i, start - dcr->base); > + return 0; > + } > + } > + > + dev_err_ratelimited(dev, > + "DC extent DPA %#llx - %#llx is not in any DC region\n", > + start, start + len - 1); If the goal is give the admin an answer to the question "hey what happened to the capacity I was expecting?", then this should include the tag. Also, this is a warning, not an error, right? I.e. the driver continues with the validated extents. > + return -EINVAL; This value is not returned up the stack, however, I expect EINVAL on user input errors. For misaligned device-internal addressing, ENXIO is more appropriate. > +} > + > +static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *extent) How about cxled_contains_extent()? There's no other "extents" besides "dc_extents" in the driver, and once a symbol name goes over 2 underscores it starts to be too many tokens. > +{ > + uint64_t start = le64_to_cpu(extent->start_dpa); > + uint64_t length = le64_to_cpu(extent->length); > + struct range ext_range = (struct range){ > + .start = start, > + .end = start + length - 1, > + }; > + struct range ed_range = (struct range) { > + .start = cxled->dpa_res->start, > + .end = cxled->dpa_res->end, > + }; > + > + dev_dbg(&cxled->cxld.dev, "Checking ED (%pr) for extent DPA:%#llx LEN:%#llx\n", > + cxled->dpa_res, start, length); ED is not a standalone abbreviation anywhere else, it's either "cxled" or "endpoint decoder". I am open to renames, but not mixed names. For this one the decoder name is already in the printout, so no real need to redundantly mention "ED". Lastly, I think continued use of 'struct range' is begging for a new enlightened format specifier. I am thinking "%par" since these things are usually some kind of physical address, and I do not see an easy way to extend the existing "%pr/%pR" to accommodate ranges. > + > + return range_contains(&ed_range, &ext_range); > +} > + > void cxl_event_trace_record(const struct cxl_memdev *cxlmd, > enum cxl_event_log_type type, > enum cxl_event_type event_type, > @@ -973,6 +1020,15 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds, > return rc; > } > > +static struct cxl_memdev_state * > +cxled_to_mds(struct cxl_endpoint_decoder *cxled) > +{ > + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > + > + return container_of(cxlds, struct cxl_memdev_state, cxlds); > +} Looks good, makes me wonder if a cxled_to_devstate() would be a net positive reduction in code. I think most of the current cxled_to_memdev(), just do cxlmd->cxlds with the result. > static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, > enum cxl_event_log_type type) > { > @@ -1406,6 +1462,142 @@ int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) > } > EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); > > +static int cxl_dev_get_dc_extent_cnt(struct cxl_memdev_state *mds, > + unsigned int *extent_gen_num) I know the spec has this behavior where asking for zero extents returns the total pending, but that does not really justify having this extra step before retrieving extents. > +{ > + struct cxl_mbox_get_dc_extent_in get_dc_extent; > + struct cxl_mbox_get_dc_extent_out dc_extents; The more I look at these patches the more I think a s/dc_extent/extent/ change is warranted to cut down on the visual token parsing reading this code. > + struct cxl_mbox_cmd mbox_cmd; > + unsigned int count; > + int rc; > + > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { > + .extent_cnt = cpu_to_le32(0), > + .start_extent_index = cpu_to_le32(0), > + }; > + > + mbox_cmd = (struct cxl_mbox_cmd) { > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, > + .payload_in = &get_dc_extent, > + .size_in = sizeof(get_dc_extent), > + .size_out = sizeof(dc_extents), > + .payload_out = &dc_extents, > + .min_out = 1, > + }; > + > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > + if (rc < 0) > + return rc; > + > + count = le32_to_cpu(dc_extents.total_extent_cnt); > + *extent_gen_num = le32_to_cpu(dc_extents.extent_list_num); Setting aside that this function likely serves no incremental purpose, why is the number of extents stored in a variable called "gen_num"? > + > + return count; > +} > + > +static int cxl_dev_get_dc_extents(struct cxl_endpoint_decoder *cxled, > + unsigned int start_gen_num, > + unsigned int exp_cnt) > +{ > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > + unsigned int start_index, total_read; > + struct device *dev = mds->cxlds.dev; > + struct cxl_mbox_cmd mbox_cmd; > + > + struct cxl_mbox_get_dc_extent_out *dc_extents __free(kfree) = > + kvmalloc(mds->payload_size, GFP_KERNEL); > + if (!dc_extents) > + return -ENOMEM; > + > + total_read = 0; > + start_index = 0; > + do { > + unsigned int nr_ext, total_extent_cnt, gen_num; > + struct cxl_mbox_get_dc_extent_in get_dc_extent; > + int rc; > + > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { > + .extent_cnt = cpu_to_le32(exp_cnt - start_index), Shouldn't this be something like: .extent_cnt = cpu_to_le32(start_index ? remaining : 1), ...where @remaining is initialized at the end of the first iteration? > + .start_extent_index = cpu_to_le32(start_index), > + }; > + > + mbox_cmd = (struct cxl_mbox_cmd) { > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, > + .payload_in = &get_dc_extent, > + .size_in = sizeof(get_dc_extent), > + .size_out = mds->payload_size, > + .payload_out = dc_extents, > + .min_out = 1, > + }; > + > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > + if (rc < 0) > + return rc; > + > + nr_ext = le32_to_cpu(dc_extents->ret_extent_cnt); It occurs to me that usage of "nr_" outnumbers "_cnt" in the driver, lets stick to the predominate style and just use "nr_" for symbol names that represent counts and just call this nr_returned, or similar. > + total_read += nr_ext; > + total_extent_cnt = le32_to_cpu(dc_extents->total_extent_cnt); > + gen_num = le32_to_cpu(dc_extents->extent_list_num); > + > + dev_dbg(dev, "Get extent list count:%d generation Num:%d\n", > + total_extent_cnt, gen_num); > + > + if (gen_num != start_gen_num || exp_cnt != total_extent_cnt) { > + dev_err(dev, "Possible incomplete extent list; gen %u != %u : cnt %u != %u\n", > + gen_num, start_gen_num, exp_cnt, total_extent_cnt); > + return -EIO; Why fail? When the generation number has changed I would only hope that means that the number of extents in the list has gone up, not that previously retrieved extents have been invalidated. So a generation number change event likely just means to retry the retrieval starting from the end of the last generation. > + } > + > + for (int i = 0; i < nr_ext ; i++) { > + dev_dbg(dev, "Processing extent %d/%d\n", > + start_index + i, exp_cnt); > + rc = cxl_validate_extent(mds, &dc_extents->extent[i]); > + if (rc) > + continue; > + if (!cxl_dc_extent_in_ed(cxled, &dc_extents->extent[i])) > + continue; > + rc = cxl_ed_add_one_extent(cxled, &dc_extents->extent[i]); > + if (rc) > + return rc; I would rather this patch just claim to only validate all present extents rather than pretend to add it. I.e. defer cxl_ed_add_one_extent() to be defined and called later. When it comes back a name with less tokens like cxled_add_extent() would be nice. "one" is already assumed by non-plural "extent". > + } > + > + start_index += nr_ext; > + } while (exp_cnt > total_read); > + > + return 0; > +} > + > +/** > + * cxl_read_dc_extents() - Read any existing extents > + * @cxled: Endpoint decoder which is part of a region > + * > + * Issue the Get Dynamic Capacity Extent List command to the device > + * and add any existing extents found which belong to this decoder. > + * > + * Return: 0 if command was executed successfully, -ERRNO on error. > + */ > +int cxl_read_dc_extents(struct cxl_endpoint_decoder *cxled) > +{ > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > + struct device *dev = mds->cxlds.dev; > + unsigned int extent_gen_num; > + int rc; > + > + if (!cxl_dcd_supported(mds)) { Why is "dcd_supported" being checked again so deep in the stack? How does an upper layer get this far into the driver without something already noticing that dcd support is not present? [..] > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c > index 0d7b09a49dcf..3e563ab29afe 100644 > --- a/drivers/cxl/core/region.c > +++ b/drivers/cxl/core/region.c > @@ -1450,6 +1450,13 @@ static int cxl_region_validate_position(struct cxl_region *cxlr, > return 0; > } > > +/* Callers are expected to ensure cxled has been attached to a region */ > +int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *dc_extent) > +{ > + return 0; > +} > + > static int cxl_region_attach_position(struct cxl_region *cxlr, > struct cxl_root_decoder *cxlrd, > struct cxl_endpoint_decoder *cxled, > @@ -2773,6 +2780,22 @@ static int devm_cxl_add_pmem_region(struct cxl_region *cxlr) > return rc; > } > > +static int cxl_region_read_extents(struct cxl_region *cxlr) > +{ > + struct cxl_region_params *p = &cxlr->params; > + int i; > + > + for (i = 0; i < p->nr_targets; i++) { > + int rc; > + > + rc = cxl_read_dc_extents(p->targets[i]); Per comment above, the targets should have already been checked for dcd support before being added to the region. > + if (rc) > + return rc; > + } > + > + return 0; > +} > + > static void cxlr_dax_unregister(void *_cxlr_dax) > { > struct cxl_dax_region *cxlr_dax = _cxlr_dax; > @@ -2807,6 +2830,12 @@ static int devm_cxl_add_dax_region(struct cxl_region *cxlr) > dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), > dev_name(dev)); > > + if (cxlr->mode == CXL_REGION_DC) { > + rc = cxl_region_read_extents(cxlr); devm_cxl_add_dax_region() happens way after the region parameters have been validated. I would have expected that initial extent list validation happens earlier during region attach. This reorganization also more naturally fits the interleave case where there will need be cross device-validation before cxl_region_probe() runs. [..]
Dan Williams wrote: > ira.weiny@ wrote: > > From: Navneet Singh <navneet.singh@intel.com> > > I seemed to have missed some of the comments/questions in this review. [snip] > > > > +#include <cxlmem.h> > > + > > extern const struct device_type cxl_nvdimm_bridge_type; > > extern const struct device_type cxl_nvdimm_type; > > extern const struct device_type cxl_pmu_type; > > @@ -28,6 +30,8 @@ void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled); > > int cxl_region_init(void); > > void cxl_region_exit(void); > > int cxl_get_poison_by_endpoint(struct cxl_port *port); > > +int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > > + struct cxl_dc_extent *dc_extent); > > There are already functions called "cxled_", so lets not invent the > "cxl_ed_" prefix. Done. > > [..] > > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > > index 58b31fa47b93..9e33a0976828 100644 > > --- a/drivers/cxl/core/mbox.c > > +++ b/drivers/cxl/core/mbox.c > > @@ -870,6 +870,53 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds) > > } > > EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL); > > > > +static int cxl_validate_extent(struct cxl_memdev_state *mds, > > + struct cxl_dc_extent *dc_extent) > > +{ > > + struct device *dev = mds->cxlds.dev; > > + uint64_t start, len; > > + > > + start = le64_to_cpu(dc_extent->start_dpa); > > + len = le64_to_cpu(dc_extent->length); > > + > > + /* Extents must not cross region boundary's */ > > + for (int i = 0; i < mds->nr_dc_region; i++) { > > + struct cxl_dc_region_info *dcr = &mds->dc_region[i]; > > + > > + if (dcr->base <= start && > > + (start + len) <= (dcr->base + dcr->decode_len)) { > > + dev_dbg(dev, "DC extent DPA %#llx - %#llx (DCR:%d:%#llx)\n", > > + start, start + len - 1, i, start - dcr->base); > > + return 0; > > + } > > + } > > + > > + dev_err_ratelimited(dev, > > + "DC extent DPA %#llx - %#llx is not in any DC region\n", > > + start, start + len - 1); > > If the goal is give the admin an answer to the question "hey what > happened to the capacity I was expecting?", then this should include the > tag. Also, this is a warning, not an error, right? I.e. the driver > continues with the validated extents. Done. > > > + return -EINVAL; > > This value is not returned up the stack, however, I expect EINVAL on > user input errors. For misaligned device-internal addressing, ENXIO is > more appropriate. Done. > > > +} > > + > > +static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled, > > + struct cxl_dc_extent *extent) > > How about cxled_contains_extent()? > > There's no other "extents" besides "dc_extents" in the driver, and once > a symbol name goes over 2 underscores it starts to be too many tokens. Done. > > > +{ > > + uint64_t start = le64_to_cpu(extent->start_dpa); > > + uint64_t length = le64_to_cpu(extent->length); > > + struct range ext_range = (struct range){ > > + .start = start, > > + .end = start + length - 1, > > + }; > > + struct range ed_range = (struct range) { > > + .start = cxled->dpa_res->start, > > + .end = cxled->dpa_res->end, > > + }; > > + > > + dev_dbg(&cxled->cxld.dev, "Checking ED (%pr) for extent DPA:%#llx LEN:%#llx\n", > > + cxled->dpa_res, start, length); > > ED is not a standalone abbreviation anywhere else, it's either "cxled" or > "endpoint decoder". I am open to renames, but not mixed names. Done. > > For this one the decoder name is already in the printout, so no real > need to redundantly mention "ED". > > Lastly, I think continued use of 'struct range' is begging for a new > enlightened format specifier. I am thinking "%par" since these things > are usually some kind of physical address, and I do not see an easy way > to extend the existing "%pr/%pR" to accommodate ranges. I've made a patch to add '%pn' Not sure if '%par' would be better but using a single modifier to %p seems to be more standard for this. > > > + > > + return range_contains(&ed_range, &ext_range); > > +} > > + > > void cxl_event_trace_record(const struct cxl_memdev *cxlmd, > > enum cxl_event_log_type type, > > enum cxl_event_type event_type, > > @@ -973,6 +1020,15 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds, > > return rc; > > } > > > > +static struct cxl_memdev_state * > > +cxled_to_mds(struct cxl_endpoint_decoder *cxled) > > +{ > > + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > > + > > + return container_of(cxlds, struct cxl_memdev_state, cxlds); > > +} > > Looks good, makes me wonder if a cxled_to_devstate() would be a net positive > reduction in code. I think most of the current cxled_to_memdev(), just > do cxlmd->cxlds with the result. > > > static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, > > enum cxl_event_log_type type) > > { > > @@ -1406,6 +1462,142 @@ int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) > > } > > EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); > > > > +static int cxl_dev_get_dc_extent_cnt(struct cxl_memdev_state *mds, > > + unsigned int *extent_gen_num) > > I know the spec has this behavior where asking for zero extents returns > the total pending, but that does not really justify having this extra > step before retrieving extents. Ok > > > +{ > > + struct cxl_mbox_get_dc_extent_in get_dc_extent; > > + struct cxl_mbox_get_dc_extent_out dc_extents; > > The more I look at these patches the more I think a s/dc_extent/extent/ > change is warranted to cut down on the visual token parsing reading this > code. I'll try and clean it up. I have gone back and forth on the number of objects which represent an 'extent' and so I want to make it clear what 'extent thing' is being worked on. This does get pretty verbose though. > > > + struct cxl_mbox_cmd mbox_cmd; > > + unsigned int count; > > + int rc; > > + > > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { > > + .extent_cnt = cpu_to_le32(0), > > + .start_extent_index = cpu_to_le32(0), > > + }; > > + > > + mbox_cmd = (struct cxl_mbox_cmd) { > > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, > > + .payload_in = &get_dc_extent, > > + .size_in = sizeof(get_dc_extent), > > + .size_out = sizeof(dc_extents), > > + .payload_out = &dc_extents, > > + .min_out = 1, > > + }; > > + > > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > > + if (rc < 0) > > + return rc; > > + > > + count = le32_to_cpu(dc_extents.total_extent_cnt); > > + *extent_gen_num = le32_to_cpu(dc_extents.extent_list_num); > > Setting aside that this function likely serves no incremental purpose, > why is the number of extents stored in a variable called "gen_num"? It is not. extent_list_num is the generation number. That is a poor name for the structure not this code. I've removed this function per your's and other feedback and have changed the structure to be 'generation num' which is much clearer. > > > + > > + return count; > > +} > > + > > +static int cxl_dev_get_dc_extents(struct cxl_endpoint_decoder *cxled, > > + unsigned int start_gen_num, > > + unsigned int exp_cnt) > > +{ > > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > > + unsigned int start_index, total_read; > > + struct device *dev = mds->cxlds.dev; > > + struct cxl_mbox_cmd mbox_cmd; > > + > > + struct cxl_mbox_get_dc_extent_out *dc_extents __free(kfree) = > > + kvmalloc(mds->payload_size, GFP_KERNEL); > > + if (!dc_extents) > > + return -ENOMEM; > > + > > + total_read = 0; > > + start_index = 0; > > + do { > > + unsigned int nr_ext, total_extent_cnt, gen_num; > > + struct cxl_mbox_get_dc_extent_in get_dc_extent; > > + int rc; > > + > > + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { > > + .extent_cnt = cpu_to_le32(exp_cnt - start_index), > > Shouldn't this be something like: > > .extent_cnt = cpu_to_le32(start_index ? remaining : 1), > > ...where @remaining is initialized at the end of the first iteration? That could work but the math is such that we have to have the start_index anyway. So this calculation is 'remaining'. The algorithm in this patch is to query for exp_cnt ahead of time for exactly this reason. But without that initial call one must do: .extent_cnt = max(max_extent_count, cpu_to_le32(expected_total - current_start)), Where max_extended_count is calculated per the payload_size. The lack of a max was a bug in this patch. So it is good to have questioned this. But the remaining calculation was correct. > > > + .start_extent_index = cpu_to_le32(start_index), > > + }; > > + > > + mbox_cmd = (struct cxl_mbox_cmd) { > > + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, > > + .payload_in = &get_dc_extent, > > + .size_in = sizeof(get_dc_extent), > > + .size_out = mds->payload_size, > > + .payload_out = dc_extents, > > + .min_out = 1, > > + }; > > + > > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > > + if (rc < 0) > > + return rc; > > + > > + nr_ext = le32_to_cpu(dc_extents->ret_extent_cnt); > > It occurs to me that usage of "nr_" outnumbers "_cnt" in the driver, lets > stick to the predominate style and just use "nr_" for symbol names that > represent counts and just call this nr_returned, or similar. 'cnt' removed. > > > + total_read += nr_ext; > > + total_extent_cnt = le32_to_cpu(dc_extents->total_extent_cnt); > > + gen_num = le32_to_cpu(dc_extents->extent_list_num); > > + > > + dev_dbg(dev, "Get extent list count:%d generation Num:%d\n", > > + total_extent_cnt, gen_num); > > + > > + if (gen_num != start_gen_num || exp_cnt != total_extent_cnt) { > > + dev_err(dev, "Possible incomplete extent list; gen %u != %u : cnt %u != %u\n", > > + gen_num, start_gen_num, exp_cnt, total_extent_cnt); > > + return -EIO; > > Why fail? When the generation number has changed I would only hope that > means that the number of extents in the list has gone up, not that > previously retrieved extents have been invalidated. Actually it could be that previous extents were removed. But it could be that they were removed for other regions which had nothing to do with the current region being onlined. All the generation number means is that the list changed. How it changed is up to the host to resolve. > > So a generation number change event likely just means to retry the > retrieval starting from the end of the last generation. I've added the retry per Jonathan's request already. But this does bring up a good point. If extents are removed _as_ the region is being onlined I think there is a race here. I'm going to punt on this ATM because it seems unlikely to be real. But the list could change at any time if other live regions are changing. > > > + } > > + > > + for (int i = 0; i < nr_ext ; i++) { > > + dev_dbg(dev, "Processing extent %d/%d\n", > > + start_index + i, exp_cnt); > > + rc = cxl_validate_extent(mds, &dc_extents->extent[i]); > > + if (rc) > > + continue; > > + if (!cxl_dc_extent_in_ed(cxled, &dc_extents->extent[i])) > > + continue; > > + rc = cxl_ed_add_one_extent(cxled, &dc_extents->extent[i]); > > + if (rc) > > + return rc; > > I would rather this patch just claim to only validate all present > extents rather than pretend to add it. I.e. defer > cxl_ed_add_one_extent() to be defined and called later. When it comes > back a name with less tokens like cxled_add_extent() would be nice. > "one" is already assumed by non-plural "extent". This code has changed a bit. I've avoided the stubbed out calls in the next version per your feedback on the patch which added the implementation. > > > + } > > + > > + start_index += nr_ext; > > + } while (exp_cnt > total_read); > > + > > + return 0; > > +} > > + > > +/** > > + * cxl_read_dc_extents() - Read any existing extents > > + * @cxled: Endpoint decoder which is part of a region > > + * > > + * Issue the Get Dynamic Capacity Extent List command to the device > > + * and add any existing extents found which belong to this decoder. > > + * > > + * Return: 0 if command was executed successfully, -ERRNO on error. > > + */ > > +int cxl_read_dc_extents(struct cxl_endpoint_decoder *cxled) > > +{ > > + struct cxl_memdev_state *mds = cxled_to_mds(cxled); > > + struct device *dev = mds->cxlds.dev; > > + unsigned int extent_gen_num; > > + int rc; > > + > > + if (!cxl_dcd_supported(mds)) { > > Why is "dcd_supported" being checked again so deep in the stack? How > does an upper layer get this far into the driver without something > already noticing that dcd support is not present? This is the first check of this type. I think the real issue is that a DC region was allowed to be created in the first place. Although I thought that was something I tested. I'll double check these flows. > > [..] > > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c > > index 0d7b09a49dcf..3e563ab29afe 100644 > > --- a/drivers/cxl/core/region.c > > +++ b/drivers/cxl/core/region.c > > @@ -1450,6 +1450,13 @@ static int cxl_region_validate_position(struct cxl_region *cxlr, > > return 0; > > } > > > > +/* Callers are expected to ensure cxled has been attached to a region */ > > +int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > > + struct cxl_dc_extent *dc_extent) > > +{ > > + return 0; > > +} > > + > > static int cxl_region_attach_position(struct cxl_region *cxlr, > > struct cxl_root_decoder *cxlrd, > > struct cxl_endpoint_decoder *cxled, > > @@ -2773,6 +2780,22 @@ static int devm_cxl_add_pmem_region(struct cxl_region *cxlr) > > return rc; > > } > > > > +static int cxl_region_read_extents(struct cxl_region *cxlr) > > +{ > > + struct cxl_region_params *p = &cxlr->params; > > + int i; > > + > > + for (i = 0; i < p->nr_targets; i++) { > > + int rc; > > + > > + rc = cxl_read_dc_extents(p->targets[i]); > > Per comment above, the targets should have already been checked for dcd > support before being added to the region. > Yes. > > > + if (rc) > > + return rc; > > + } > > + > > + return 0; > > +} > > + > > static void cxlr_dax_unregister(void *_cxlr_dax) > > { > > struct cxl_dax_region *cxlr_dax = _cxlr_dax; > > @@ -2807,6 +2830,12 @@ static int devm_cxl_add_dax_region(struct cxl_region *cxlr) > > dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), > > dev_name(dev)); > > > > + if (cxlr->mode == CXL_REGION_DC) { > > + rc = cxl_region_read_extents(cxlr); > > devm_cxl_add_dax_region() happens way after the region parameters have > been validated. I don't follow this. The purpose of this patch is to read and surface any extents which existed during a system crash and were not released by the device. > > I would have expected that initial extent list > validation happens earlier during region attach. The validation is just to keep the host software consistent and to ignore any extents which are not part of the region being onlined. Obviously I have not make the purpose of this patch clear. I will try and do better in the next version. > This reorganization > also more naturally fits the interleave case where there will need be > cross device-validation before cxl_region_probe() runs. I don't follow this either. What cross device validation must occur? An extent can be offered on any 1 device in the interleave set. The driver must ignore that until the other devices offer their matching extents. This is why I have said in the past that the driver will need to track the extents in the endpoint decoders not just in the region. Because until all the EDs for the interleave have a matching extent the region extent can't be surfaced. Furthermore, this call was placed here after the large discussion at plumbers last year where it was insisted that the driver not accept any extents until the region is created. It is true that extents could be read for each ED when that ED is attached to the region. But why bother if the region is never fully realized? Ira
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 91abeffbe985..119b12362977 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -4,6 +4,8 @@ #ifndef __CXL_CORE_H__ #define __CXL_CORE_H__ +#include <cxlmem.h> + extern const struct device_type cxl_nvdimm_bridge_type; extern const struct device_type cxl_nvdimm_type; extern const struct device_type cxl_pmu_type; @@ -28,6 +30,8 @@ void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled); int cxl_region_init(void); void cxl_region_exit(void); int cxl_get_poison_by_endpoint(struct cxl_port *port); +int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, + struct cxl_dc_extent *dc_extent); #else static inline int cxl_get_poison_by_endpoint(struct cxl_port *port) { @@ -43,6 +47,11 @@ static inline int cxl_region_init(void) static inline void cxl_region_exit(void) { } +static inline int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, + struct cxl_dc_extent *dc_extent) +{ + return 0; +} #define CXL_REGION_ATTR(x) NULL #define CXL_REGION_TYPE(x) NULL #define SET_CXL_REGION_ATTR(x) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 58b31fa47b93..9e33a0976828 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -870,6 +870,53 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds) } EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL); +static int cxl_validate_extent(struct cxl_memdev_state *mds, + struct cxl_dc_extent *dc_extent) +{ + struct device *dev = mds->cxlds.dev; + uint64_t start, len; + + start = le64_to_cpu(dc_extent->start_dpa); + len = le64_to_cpu(dc_extent->length); + + /* Extents must not cross region boundary's */ + for (int i = 0; i < mds->nr_dc_region; i++) { + struct cxl_dc_region_info *dcr = &mds->dc_region[i]; + + if (dcr->base <= start && + (start + len) <= (dcr->base + dcr->decode_len)) { + dev_dbg(dev, "DC extent DPA %#llx - %#llx (DCR:%d:%#llx)\n", + start, start + len - 1, i, start - dcr->base); + return 0; + } + } + + dev_err_ratelimited(dev, + "DC extent DPA %#llx - %#llx is not in any DC region\n", + start, start + len - 1); + return -EINVAL; +} + +static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled, + struct cxl_dc_extent *extent) +{ + uint64_t start = le64_to_cpu(extent->start_dpa); + uint64_t length = le64_to_cpu(extent->length); + struct range ext_range = (struct range){ + .start = start, + .end = start + length - 1, + }; + struct range ed_range = (struct range) { + .start = cxled->dpa_res->start, + .end = cxled->dpa_res->end, + }; + + dev_dbg(&cxled->cxld.dev, "Checking ED (%pr) for extent DPA:%#llx LEN:%#llx\n", + cxled->dpa_res, start, length); + + return range_contains(&ed_range, &ext_range); +} + void cxl_event_trace_record(const struct cxl_memdev *cxlmd, enum cxl_event_log_type type, enum cxl_event_type event_type, @@ -973,6 +1020,15 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds, return rc; } +static struct cxl_memdev_state * +cxled_to_mds(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + return container_of(cxlds, struct cxl_memdev_state, cxlds); +} + static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, enum cxl_event_log_type type) { @@ -1406,6 +1462,142 @@ int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) } EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); +static int cxl_dev_get_dc_extent_cnt(struct cxl_memdev_state *mds, + unsigned int *extent_gen_num) +{ + struct cxl_mbox_get_dc_extent_in get_dc_extent; + struct cxl_mbox_get_dc_extent_out dc_extents; + struct cxl_mbox_cmd mbox_cmd; + unsigned int count; + int rc; + + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { + .extent_cnt = cpu_to_le32(0), + .start_extent_index = cpu_to_le32(0), + }; + + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, + .payload_in = &get_dc_extent, + .size_in = sizeof(get_dc_extent), + .size_out = sizeof(dc_extents), + .payload_out = &dc_extents, + .min_out = 1, + }; + + rc = cxl_internal_send_cmd(mds, &mbox_cmd); + if (rc < 0) + return rc; + + count = le32_to_cpu(dc_extents.total_extent_cnt); + *extent_gen_num = le32_to_cpu(dc_extents.extent_list_num); + + return count; +} + +static int cxl_dev_get_dc_extents(struct cxl_endpoint_decoder *cxled, + unsigned int start_gen_num, + unsigned int exp_cnt) +{ + struct cxl_memdev_state *mds = cxled_to_mds(cxled); + unsigned int start_index, total_read; + struct device *dev = mds->cxlds.dev; + struct cxl_mbox_cmd mbox_cmd; + + struct cxl_mbox_get_dc_extent_out *dc_extents __free(kfree) = + kvmalloc(mds->payload_size, GFP_KERNEL); + if (!dc_extents) + return -ENOMEM; + + total_read = 0; + start_index = 0; + do { + unsigned int nr_ext, total_extent_cnt, gen_num; + struct cxl_mbox_get_dc_extent_in get_dc_extent; + int rc; + + get_dc_extent = (struct cxl_mbox_get_dc_extent_in) { + .extent_cnt = cpu_to_le32(exp_cnt - start_index), + .start_extent_index = cpu_to_le32(start_index), + }; + + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_GET_DC_EXTENT_LIST, + .payload_in = &get_dc_extent, + .size_in = sizeof(get_dc_extent), + .size_out = mds->payload_size, + .payload_out = dc_extents, + .min_out = 1, + }; + + rc = cxl_internal_send_cmd(mds, &mbox_cmd); + if (rc < 0) + return rc; + + nr_ext = le32_to_cpu(dc_extents->ret_extent_cnt); + total_read += nr_ext; + total_extent_cnt = le32_to_cpu(dc_extents->total_extent_cnt); + gen_num = le32_to_cpu(dc_extents->extent_list_num); + + dev_dbg(dev, "Get extent list count:%d generation Num:%d\n", + total_extent_cnt, gen_num); + + if (gen_num != start_gen_num || exp_cnt != total_extent_cnt) { + dev_err(dev, "Possible incomplete extent list; gen %u != %u : cnt %u != %u\n", + gen_num, start_gen_num, exp_cnt, total_extent_cnt); + return -EIO; + } + + for (int i = 0; i < nr_ext ; i++) { + dev_dbg(dev, "Processing extent %d/%d\n", + start_index + i, exp_cnt); + rc = cxl_validate_extent(mds, &dc_extents->extent[i]); + if (rc) + continue; + if (!cxl_dc_extent_in_ed(cxled, &dc_extents->extent[i])) + continue; + rc = cxl_ed_add_one_extent(cxled, &dc_extents->extent[i]); + if (rc) + return rc; + } + + start_index += nr_ext; + } while (exp_cnt > total_read); + + return 0; +} + +/** + * cxl_read_dc_extents() - Read any existing extents + * @cxled: Endpoint decoder which is part of a region + * + * Issue the Get Dynamic Capacity Extent List command to the device + * and add any existing extents found which belong to this decoder. + * + * Return: 0 if command was executed successfully, -ERRNO on error. + */ +int cxl_read_dc_extents(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_memdev_state *mds = cxled_to_mds(cxled); + struct device *dev = mds->cxlds.dev; + unsigned int extent_gen_num; + int rc; + + if (!cxl_dcd_supported(mds)) { + dev_dbg(dev, "DCD unsupported\n"); + return 0; + } + + rc = cxl_dev_get_dc_extent_cnt(mds, &extent_gen_num); + dev_dbg(mds->cxlds.dev, "Extent count: %d Generation Num: %d\n", + rc, extent_gen_num); + if (rc <= 0) /* 0 == no records found */ + return rc; + + return cxl_dev_get_dc_extents(cxled, extent_gen_num, rc); +} +EXPORT_SYMBOL_NS_GPL(cxl_read_dc_extents, CXL); + static int add_dpa_res(struct device *dev, struct resource *parent, struct resource *res, resource_size_t start, resource_size_t size, const char *type) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 0d7b09a49dcf..3e563ab29afe 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1450,6 +1450,13 @@ static int cxl_region_validate_position(struct cxl_region *cxlr, return 0; } +/* Callers are expected to ensure cxled has been attached to a region */ +int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, + struct cxl_dc_extent *dc_extent) +{ + return 0; +} + static int cxl_region_attach_position(struct cxl_region *cxlr, struct cxl_root_decoder *cxlrd, struct cxl_endpoint_decoder *cxled, @@ -2773,6 +2780,22 @@ static int devm_cxl_add_pmem_region(struct cxl_region *cxlr) return rc; } +static int cxl_region_read_extents(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + int i; + + for (i = 0; i < p->nr_targets; i++) { + int rc; + + rc = cxl_read_dc_extents(p->targets[i]); + if (rc) + return rc; + } + + return 0; +} + static void cxlr_dax_unregister(void *_cxlr_dax) { struct cxl_dax_region *cxlr_dax = _cxlr_dax; @@ -2807,6 +2830,12 @@ static int devm_cxl_add_dax_region(struct cxl_region *cxlr) dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), dev_name(dev)); + if (cxlr->mode == CXL_REGION_DC) { + rc = cxl_region_read_extents(cxlr); + if (rc) + goto err; + } + return devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister, cxlr_dax); err: diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 01bee6eedff3..8f2d8944d334 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -604,6 +604,54 @@ enum cxl_opcode { UUID_INIT(0xe1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19, \ 0x40, 0x3d, 0x86) +/* + * Add Dynamic Capacity Response + * CXL rev 3.1 section 8.2.9.9.9.3; Table 8-168 & Table 8-169 + */ +struct cxl_mbox_dc_response { + __le32 extent_list_size; + u8 flags; + u8 reserved[3]; + struct updated_extent_list { + __le64 dpa_start; + __le64 length; + u8 reserved[8]; + } __packed extent_list[]; +} __packed; + +/* + * CXL rev 3.1 section 8.2.9.2.1.6; Table 8-51 + */ +#define CXL_DC_EXTENT_TAG_LEN 0x10 +struct cxl_dc_extent { + __le64 start_dpa; + __le64 length; + u8 tag[CXL_DC_EXTENT_TAG_LEN]; + __le16 shared_extn_seq; + u8 reserved[6]; +} __packed; + +/* + * Get Dynamic Capacity Extent List; Input Payload + * CXL rev 3.1 section 8.2.9.9.9.2; Table 8-166 + */ +struct cxl_mbox_get_dc_extent_in { + __le32 extent_cnt; + __le32 start_extent_index; +} __packed; + +/* + * Get Dynamic Capacity Extent List; Output Payload + * CXL rev 3.1 section 8.2.9.9.9.2; Table 8-167 + */ +struct cxl_mbox_get_dc_extent_out { + __le32 ret_extent_cnt; + __le32 total_extent_cnt; + __le32 extent_list_num; + u8 rsvd[4]; + struct cxl_dc_extent extent[]; +} __packed; + struct cxl_mbox_get_supported_logs { __le16 entries; u8 rsvd[6]; @@ -879,6 +927,7 @@ int cxl_internal_send_cmd(struct cxl_memdev_state *mds, struct cxl_mbox_cmd *cmd); int cxl_dev_state_identify(struct cxl_memdev_state *mds); int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds); +int cxl_read_dc_extents(struct cxl_endpoint_decoder *cxled); int cxl_await_media_ready(struct cxl_dev_state *cxlds); int cxl_enumerate_cmds(struct cxl_memdev_state *mds); int cxl_mem_create_range_info(struct cxl_memdev_state *mds);