Message ID | 20240324-dcd-type2-upstream-v1-3-b7b00d623625@intel.com |
---|---|
State | Changes Requested |
Headers | show |
Series | DCD: Add support for Dynamic Capacity Devices (DCD) | expand |
On Sun, 24 Mar 2024 16:18:06 -0700 ira.weiny@intel.com wrote: > From: Navneet Singh <navneet.singh@intel.com> > > Devices can optionally support Dynamic Capacity (DC). These devices are > known as Dynamic Capacity Devices (DCD). > > Implement the DC mailbox commands as specified in CXL 3.1 section > 8.2.9.9.9 (opcodes 48XXh). Read the DC configuration and store the DC > region information in the device state. > > Signed-off-by: Navneet Singh <navneet.singh@intel.com> > Co-developed-by: Ira Weiny <ira.weiny@intel.com> > Signed-off-by: Ira Weiny <ira.weiny@intel.com> A few minor things inline, Jonathan > > --- > Changes for v1 > [Jørgen: ensure CXL 2.0 device support by removing dc_event_log_size] > [iweiny/Jørgen: use get DC config command to signal DCD support] > [djiang: fix subject] > [Fan: add additional region configuration checks] > [Jonathan/djiang: split out region mode changes] > [Jonathan: fix up comments/kdoc] > [Jonathan: s/cxl_get_dc_id/cxl_get_dc_config/] > [Jonathan: use __free() in identify call] > [Jonathan: remove unneeded formatting changes] > [Jonathan: s/cxl_mbox_dynamic_capacity/cxl_mbox_get_dc_config_out/] > [Jonathan: s/cxl_mbox_get_dc_config/cxl_mbox_get_dc_config_in/] > [iweiny: remove type2 work dependancy/rebase on master] > [iweiny: fix 0day build issues] > --- > drivers/cxl/core/mbox.c | 184 +++++++++++++++++++++++++++++++++++++++++++++++- > drivers/cxl/cxlmem.h | 49 +++++++++++++ > drivers/cxl/pci.c | 4 ++ > 3 files changed, 236 insertions(+), 1 deletion(-) > > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > index ed4131c6f50b..14e8a7528a8b 100644 > --- a/drivers/cxl/core/mbox.c > +++ b/drivers/cxl/core/mbox.c > @@ -1123,7 +1123,7 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds) > if (rc < 0) > return rc; > > - mds->total_bytes = > + mds->static_cap = > le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER; > mds->volatile_only_bytes = > le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER; > @@ -1230,6 +1230,175 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd) > return rc; > } > > +static int cxl_dc_save_region_info(struct cxl_memdev_state *mds, u8 index, > + struct cxl_dc_region_config *region_config) > +{ > + struct cxl_dc_region_info *dcr = &mds->dc_region[index]; > + struct device *dev = mds->cxlds.dev; > + > + dcr->base = le64_to_cpu(region_config->region_base); > + dcr->decode_len = le64_to_cpu(region_config->region_decode_length); > + dcr->decode_len *= CXL_CAPACITY_MULTIPLIER; > + dcr->len = le64_to_cpu(region_config->region_length); > + dcr->blk_size = le64_to_cpu(region_config->region_block_size); > + dcr->dsmad_handle = le32_to_cpu(region_config->region_dsmad_handle); > + dcr->flags = region_config->flags; > + snprintf(dcr->name, CXL_DC_REGION_STRLEN, "dc%d", index); > + > + /* Check regions are in increasing DPA order */ > + if (index > 0) { > + struct cxl_dc_region_info *prev_dcr = &mds->dc_region[index - 1]; > + > + if ((prev_dcr->base + prev_dcr->decode_len) > dcr->base) { > + dev_err(dev, > + "DPA ordering violation for DC region %d and %d\n", > + index - 1, index); > + return -EINVAL; > + } > + } > + > + if (!IS_ALIGNED(dcr->base, SZ_256M) || > + !IS_ALIGNED(dcr->base, dcr->blk_size)) { > + dev_err(dev, "DC region %d invalid base %#llx blk size %#llx\n", index, Odd choice of line wrap. I'd drag index onto the line below. > + dcr->base, dcr->blk_size); > + return -EINVAL; > + } > + > + if (dcr->decode_len == 0 || dcr->len == 0 || dcr->decode_len < dcr->len || > + !IS_ALIGNED(dcr->len, dcr->blk_size)) { > + dev_err(dev, "DC region %d invalid length; decode %#llx len %#llx blk size %#llx\n", > + index, dcr->decode_len, dcr->len, dcr->blk_size); > + return -EINVAL; > + } > + > + if (dcr->blk_size == 0 || dcr->blk_size % 0x40 || Hmm. I thought we had a define for CXL 'cacheline' size, but can't find it now. If not we should add one (and find a better name than that). > + !is_power_of_2(dcr->blk_size)) { > + dev_err(dev, "DC region %d invalid block size; %#llx\n", > + index, dcr->blk_size); > + return -EINVAL; > + } > + > + dev_dbg(dev, > + "DC region %s DPA: %#llx LEN: %#llx BLKSZ: %#llx\n", > + dcr->name, dcr->base, dcr->decode_len, dcr->blk_size); > + > + return 0; > +} > + > +/* Returns the number of regions in dc_resp or -ERRNO */ > +static int cxl_get_dc_config(struct cxl_memdev_state *mds, u8 start_region, > + struct cxl_mbox_get_dc_config_out *dc_resp, > + size_t dc_resp_size) > +{ > + struct cxl_mbox_get_dc_config_in get_dc = (struct cxl_mbox_get_dc_config_in) { > + .region_count = CXL_MAX_DC_REGION, > + .start_region_index = start_region, > + }; > + struct cxl_mbox_cmd mbox_cmd = (struct cxl_mbox_cmd) { > + .opcode = CXL_MBOX_OP_GET_DC_CONFIG, > + .payload_in = &get_dc, > + .size_in = sizeof(get_dc), > + .size_out = dc_resp_size, > + .payload_out = dc_resp, > + .min_out = 1, > + }; > + struct device *dev = mds->cxlds.dev; > + int rc; > + > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > + if (rc < 0) > + return rc; > + > + rc = dc_resp->avail_region_count - start_region; > + > + /* > + * The number of regions in the payload may have been truncated due to > + * payload_size limits; if so adjust the returned count to match. > + */ > + if (mbox_cmd.size_out < sizeof(*dc_resp)) > + rc = CXL_REGIONS_RETURNED(mbox_cmd.size_out); Why not always return this? If there was space, doesn't it equal the value set above anyway? > + > + dev_dbg(dev, "Read %d/%d DC regions\n", rc, dc_resp->avail_region_count); > + > + return rc; > +} > +/** > + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity > + * information from the device. > + * @mds: The memory device state > + * > + * Read Dynamic Capacity information from the device and populate the state > + * structures for later use. > + * > + * Return: 0 if identify was executed successfully, -ERRNO on error. > + */ > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) > +{ > + size_t dc_resp_size = mds->payload_size; > + struct device *dev = mds->cxlds.dev; > + u8 start_region, i; > + int rc = 0; Is this used before being set? > + > + for (i = 0; i < CXL_MAX_DC_REGION; i++) > + snprintf(mds->dc_region[i].name, CXL_DC_REGION_STRLEN, "<nil>"); > + > + /* Check GET_DC_CONFIG is supported by device */ > + if (!cxl_dcd_supported(mds)) { > + dev_dbg(dev, "DCD not supported\n"); > + return 0; > + } > + > + struct cxl_mbox_get_dc_config_out *dc_resp __free(kfree) = > + kvmalloc(dc_resp_size, GFP_KERNEL); > + if (!dc_resp) > + return -ENOMEM; > + > + start_region = 0; > + do { > + int j; > + > + rc = cxl_get_dc_config(mds, start_region, dc_resp, dc_resp_size); > + if (rc < 0) { > + dev_dbg(dev, "Failed to get DC config: %d\n", rc); > + return rc; > + } > + > + mds->nr_dc_region += rc; > + > + if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) { > + dev_err(dev, "Invalid num of dynamic capacity regions %d\n", > + mds->nr_dc_region); > + return -EINVAL; > + } > + > + for (i = start_region, j = 0; i < mds->nr_dc_region; i++, j++) { > + rc = cxl_dc_save_region_info(mds, i, &dc_resp->region[j]); > + if (rc) { > + dev_dbg(dev, "Failed to save region info: %d\n", rc); > + return rc; > + } > + } > + > + start_region = mds->nr_dc_region; > + > + } while (mds->nr_dc_region < dc_resp->avail_region_count); > + > + mds->dynamic_cap = > + mds->dc_region[mds->nr_dc_region - 1].base + > + mds->dc_region[mds->nr_dc_region - 1].decode_len - > + mds->dc_region[0].base; > + dev_dbg(dev, "Total dynamic capacity: %#llx\n", mds->dynamic_cap); > + > + return 0; > +} > +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > index 79a67cff9143..4624cf612c1e 100644 > --- a/drivers/cxl/cxlmem.h > +++ b/drivers/cxl/cxlmem.h > /** > * struct cxl_memdev_state - Generic Type-3 Memory Device Class driver data > * > @@ -467,6 +482,8 @@ struct cxl_dev_state { > * @enabled_cmds: Hardware commands found enabled in CEL. > * @exclusive_cmds: Commands that are kernel-internal only > * @total_bytes: sum of all possible capacities > + * @static_cap: Sum of static RAM and PMEM capacities > + * @dynamic_cap: Complete DPA range occupied by DC regions > * @volatile_only_bytes: hard volatile capacity > * @persistent_only_bytes: hard persistent capacity > * @partition_align_bytes: alignment size for partition-able capacity > @@ -474,6 +491,8 @@ struct cxl_dev_state { > * @active_persistent_bytes: sum of hard + soft persistent > * @next_volatile_bytes: volatile capacity change pending device reset > * @next_persistent_bytes: persistent capacity change pending device reset Looks like we have some ordering issues ram_perf and pmem_perf (at least) that we should fix up as a precursor. I sent a reply to the QoS patch that added these. > + * @nr_dc_region: number of DC regions implemented in the memory device > + * @dc_region: array containing info about the DC regions > * @event: event log driver state > * @poison: poison driver state info > * @security: security driver state info > @@ -494,7 +513,10 @@ struct cxl_memdev_state { > DECLARE_BITMAP(dcd_cmds, CXL_DCD_ENABLED_MAX); > DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX); > DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX); > + Trivial but this is an unrelated change and shouldn't be in this patch. > u64 total_bytes; > + u64 static_cap; > + u64 dynamic_cap; > u64 volatile_only_bytes; > u64 persistent_only_bytes; > u64 partition_align_bytes; > @@ -506,6 +528,9 @@ struct cxl_memdev_state { > struct cxl_dpa_perf ram_perf; > struct cxl_dpa_perf pmem_perf; > > + u8 nr_dc_region; > + struct cxl_dc_region_info dc_region[CXL_MAX_DC_REGION]; > + > struct cxl_event_state event; > struct cxl_poison_state poison; > struct cxl_security_state security; > + > +/* See CXL 3.0 Table 125 get dynamic capacity config Output Payload */ > +struct cxl_mbox_get_dc_config_out { > + u8 avail_region_count; > + u8 rsvd[7]; > + struct cxl_dc_region_config { > + __le64 region_base; > + __le64 region_decode_length; > + __le64 region_length; > + __le64 region_block_size; > + __le32 region_dsmad_handle; > + u8 flags; > + u8 rsvd[3]; > + } __packed region[]; > +} __packed; > +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0) > +#define CXL_REGIONS_RETURNED(size_out) \ > + ((size_out - 8) / sizeof(struct cxl_dc_region_config)) Can we make that 8 self documenting? offsetof(struct cxl_dc_region_config, region) perhaps? > +
On Sun, Mar 24, 2024 at 04:18:06PM -0700, ira.weiny@intel.com wrote: > From: Navneet Singh <navneet.singh@intel.com> > > Devices can optionally support Dynamic Capacity (DC). These devices are > known as Dynamic Capacity Devices (DCD). > > Implement the DC mailbox commands as specified in CXL 3.1 section > 8.2.9.9.9 (opcodes 48XXh). Read the DC configuration and store the DC > region information in the device state. > > Signed-off-by: Navneet Singh <navneet.singh@intel.com> > Co-developed-by: Ira Weiny <ira.weiny@intel.com> > Signed-off-by: Ira Weiny <ira.weiny@intel.com> > > --- > Changes for v1 > [Jørgen: ensure CXL 2.0 device support by removing dc_event_log_size] > [iweiny/Jørgen: use get DC config command to signal DCD support] > [djiang: fix subject] > [Fan: add additional region configuration checks] > [Jonathan/djiang: split out region mode changes] > [Jonathan: fix up comments/kdoc] > [Jonathan: s/cxl_get_dc_id/cxl_get_dc_config/] > [Jonathan: use __free() in identify call] > [Jonathan: remove unneeded formatting changes] > [Jonathan: s/cxl_mbox_dynamic_capacity/cxl_mbox_get_dc_config_out/] > [Jonathan: s/cxl_mbox_get_dc_config/cxl_mbox_get_dc_config_in/] > [iweiny: remove type2 work dependancy/rebase on master] > [iweiny: fix 0day build issues] > --- > drivers/cxl/core/mbox.c | 184 +++++++++++++++++++++++++++++++++++++++++++++++- > drivers/cxl/cxlmem.h | 49 +++++++++++++ > drivers/cxl/pci.c | 4 ++ > 3 files changed, 236 insertions(+), 1 deletion(-) > > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > index ed4131c6f50b..14e8a7528a8b 100644 > --- a/drivers/cxl/core/mbox.c > +++ b/drivers/cxl/core/mbox.c > @@ -1123,7 +1123,7 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds) > if (rc < 0) > return rc; > > - mds->total_bytes = > + mds->static_cap = > le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER; > mds->volatile_only_bytes = > le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER; > @@ -1230,6 +1230,175 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd) > return rc; > } > > +static int cxl_dc_save_region_info(struct cxl_memdev_state *mds, u8 index, > + struct cxl_dc_region_config *region_config) > +{ > + struct cxl_dc_region_info *dcr = &mds->dc_region[index]; > + struct device *dev = mds->cxlds.dev; > + > + dcr->base = le64_to_cpu(region_config->region_base); > + dcr->decode_len = le64_to_cpu(region_config->region_decode_length); > + dcr->decode_len *= CXL_CAPACITY_MULTIPLIER; > + dcr->len = le64_to_cpu(region_config->region_length); > + dcr->blk_size = le64_to_cpu(region_config->region_block_size); > + dcr->dsmad_handle = le32_to_cpu(region_config->region_dsmad_handle); > + dcr->flags = region_config->flags; > + snprintf(dcr->name, CXL_DC_REGION_STRLEN, "dc%d", index); > + > + /* Check regions are in increasing DPA order */ > + if (index > 0) { > + struct cxl_dc_region_info *prev_dcr = &mds->dc_region[index - 1]; > + > + if ((prev_dcr->base + prev_dcr->decode_len) > dcr->base) { > + dev_err(dev, > + "DPA ordering violation for DC region %d and %d\n", > + index - 1, index); > + return -EINVAL; > + } > + } > + > + if (!IS_ALIGNED(dcr->base, SZ_256M) || > + !IS_ALIGNED(dcr->base, dcr->blk_size)) { > + dev_err(dev, "DC region %d invalid base %#llx blk size %#llx\n", index, > + dcr->base, dcr->blk_size); > + return -EINVAL; > + } > + > + if (dcr->decode_len == 0 || dcr->len == 0 || dcr->decode_len < dcr->len || > + !IS_ALIGNED(dcr->len, dcr->blk_size)) { > + dev_err(dev, "DC region %d invalid length; decode %#llx len %#llx blk size %#llx\n", > + index, dcr->decode_len, dcr->len, dcr->blk_size); > + return -EINVAL; > + } > + > + if (dcr->blk_size == 0 || dcr->blk_size % 0x40 || > + !is_power_of_2(dcr->blk_size)) { > + dev_err(dev, "DC region %d invalid block size; %#llx\n", > + index, dcr->blk_size); > + return -EINVAL; > + } > + > + dev_dbg(dev, > + "DC region %s DPA: %#llx LEN: %#llx BLKSZ: %#llx\n", > + dcr->name, dcr->base, dcr->decode_len, dcr->blk_size); > + > + return 0; > +} > + > +/* Returns the number of regions in dc_resp or -ERRNO */ > +static int cxl_get_dc_config(struct cxl_memdev_state *mds, u8 start_region, > + struct cxl_mbox_get_dc_config_out *dc_resp, > + size_t dc_resp_size) > +{ > + struct cxl_mbox_get_dc_config_in get_dc = (struct cxl_mbox_get_dc_config_in) { > + .region_count = CXL_MAX_DC_REGION, > + .start_region_index = start_region, > + }; > + struct cxl_mbox_cmd mbox_cmd = (struct cxl_mbox_cmd) { > + .opcode = CXL_MBOX_OP_GET_DC_CONFIG, > + .payload_in = &get_dc, > + .size_in = sizeof(get_dc), > + .size_out = dc_resp_size, > + .payload_out = dc_resp, > + .min_out = 1, > + }; > + struct device *dev = mds->cxlds.dev; > + int rc; > + > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > + if (rc < 0) > + return rc; > + > + rc = dc_resp->avail_region_count - start_region; > + > + /* > + * The number of regions in the payload may have been truncated due to > + * payload_size limits; if so adjust the returned count to match. > + */ > + if (mbox_cmd.size_out < sizeof(*dc_resp)) > + rc = CXL_REGIONS_RETURNED(mbox_cmd.size_out); > + > + dev_dbg(dev, "Read %d/%d DC regions\n", rc, dc_resp->avail_region_count); > + > + return rc; > +} > + > +static bool cxl_dcd_supported(struct cxl_memdev_state *mds) > +{ > + return test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds); > +} > + > +/** > + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity > + * information from the device. > + * @mds: The memory device state > + * > + * Read Dynamic Capacity information from the device and populate the state > + * structures for later use. > + * > + * Return: 0 if identify was executed successfully, -ERRNO on error. > + */ > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) > +{ > + size_t dc_resp_size = mds->payload_size; > + struct device *dev = mds->cxlds.dev; > + u8 start_region, i; > + int rc = 0; > + > + for (i = 0; i < CXL_MAX_DC_REGION; i++) > + snprintf(mds->dc_region[i].name, CXL_DC_REGION_STRLEN, "<nil>"); > + > + /* Check GET_DC_CONFIG is supported by device */ > + if (!cxl_dcd_supported(mds)) { > + dev_dbg(dev, "DCD not supported\n"); > + return 0; > + } > + > + struct cxl_mbox_get_dc_config_out *dc_resp __free(kfree) = > + kvmalloc(dc_resp_size, GFP_KERNEL); > + if (!dc_resp) > + return -ENOMEM; > + > + start_region = 0; > + do { > + int j; > + > + rc = cxl_get_dc_config(mds, start_region, dc_resp, dc_resp_size); > + if (rc < 0) { > + dev_dbg(dev, "Failed to get DC config: %d\n", rc); > + return rc; > + } > + > + mds->nr_dc_region += rc; > + > + if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) { > + dev_err(dev, "Invalid num of dynamic capacity regions %d\n", > + mds->nr_dc_region); > + return -EINVAL; > + } > + > + for (i = start_region, j = 0; i < mds->nr_dc_region; i++, j++) { > + rc = cxl_dc_save_region_info(mds, i, &dc_resp->region[j]); > + if (rc) { > + dev_dbg(dev, "Failed to save region info: %d\n", rc); > + return rc; > + } > + } > + > + start_region = mds->nr_dc_region; > + > + } while (mds->nr_dc_region < dc_resp->avail_region_count); > + > + mds->dynamic_cap = > + mds->dc_region[mds->nr_dc_region - 1].base + > + mds->dc_region[mds->nr_dc_region - 1].decode_len - > + mds->dc_region[0].base; > + dev_dbg(dev, "Total dynamic capacity: %#llx\n", mds->dynamic_cap); > + > + return 0; > +} > +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); > + > static int add_dpa_res(struct device *dev, struct resource *parent, > struct resource *res, resource_size_t start, > resource_size_t size, const char *type) > @@ -1260,8 +1429,12 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds) > { > struct cxl_dev_state *cxlds = &mds->cxlds; > struct device *dev = cxlds->dev; > + size_t untenanted_mem; > int rc; > > + untenanted_mem = mds->dc_region[0].base - mds->static_cap; > + mds->total_bytes = mds->static_cap + untenanted_mem + mds->dynamic_cap; > + > if (!cxlds->media_ready) { > cxlds->dpa_res = DEFINE_RES_MEM(0, 0); > cxlds->ram_res = DEFINE_RES_MEM(0, 0); > @@ -1271,6 +1444,15 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds) > > cxlds->dpa_res = DEFINE_RES_MEM(0, mds->total_bytes); > > + for (int i = 0; i < mds->nr_dc_region; i++) { > + struct cxl_dc_region_info *dcr = &mds->dc_region[i]; > + > + rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i], > + dcr->base, dcr->decode_len, dcr->name); > + if (rc) > + return rc; > + } > + > if (mds->partition_align_bytes == 0) { > rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0, > mds->volatile_only_bytes, "ram"); > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > index 79a67cff9143..4624cf612c1e 100644 > --- a/drivers/cxl/cxlmem.h > +++ b/drivers/cxl/cxlmem.h > @@ -402,6 +402,7 @@ enum cxl_devtype { > CXL_DEVTYPE_CLASSMEM, > }; > > +#define CXL_MAX_DC_REGION 8 > /** > * struct cxl_dpa_perf - DPA performance property entry > * @dpa_range - range for DPA address > @@ -431,6 +432,8 @@ struct cxl_dpa_perf { > * @dpa_res: Overall DPA resource tree for the device > * @pmem_res: Active Persistent memory capacity configuration > * @ram_res: Active Volatile memory capacity configuration > + * @dc_res: Active Dynamic Capacity memory configuration for each possible > + * region > * @serial: PCIe Device Serial Number > * @type: Generic Memory Class device or Vendor Specific Memory device > */ > @@ -445,10 +448,22 @@ struct cxl_dev_state { > struct resource dpa_res; > struct resource pmem_res; > struct resource ram_res; > + struct resource dc_res[CXL_MAX_DC_REGION]; > u64 serial; > enum cxl_devtype type; > }; > > +#define CXL_DC_REGION_STRLEN 8 > +struct cxl_dc_region_info { > + u64 base; > + u64 decode_len; > + u64 len; > + u64 blk_size; > + u32 dsmad_handle; > + u8 flags; > + u8 name[CXL_DC_REGION_STRLEN]; > +}; > + > /** > * struct cxl_memdev_state - Generic Type-3 Memory Device Class driver data > * > @@ -467,6 +482,8 @@ struct cxl_dev_state { > * @enabled_cmds: Hardware commands found enabled in CEL. > * @exclusive_cmds: Commands that are kernel-internal only > * @total_bytes: sum of all possible capacities > + * @static_cap: Sum of static RAM and PMEM capacities > + * @dynamic_cap: Complete DPA range occupied by DC regions > * @volatile_only_bytes: hard volatile capacity > * @persistent_only_bytes: hard persistent capacity > * @partition_align_bytes: alignment size for partition-able capacity > @@ -474,6 +491,8 @@ struct cxl_dev_state { > * @active_persistent_bytes: sum of hard + soft persistent > * @next_volatile_bytes: volatile capacity change pending device reset > * @next_persistent_bytes: persistent capacity change pending device reset > + * @nr_dc_region: number of DC regions implemented in the memory device > + * @dc_region: array containing info about the DC regions > * @event: event log driver state > * @poison: poison driver state info > * @security: security driver state info > @@ -494,7 +513,10 @@ struct cxl_memdev_state { > DECLARE_BITMAP(dcd_cmds, CXL_DCD_ENABLED_MAX); > DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX); > DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX); > + > u64 total_bytes; > + u64 static_cap; > + u64 dynamic_cap; > u64 volatile_only_bytes; > u64 persistent_only_bytes; > u64 partition_align_bytes; > @@ -506,6 +528,9 @@ struct cxl_memdev_state { > struct cxl_dpa_perf ram_perf; > struct cxl_dpa_perf pmem_perf; > > + u8 nr_dc_region; > + struct cxl_dc_region_info dc_region[CXL_MAX_DC_REGION]; > + > struct cxl_event_state event; > struct cxl_poison_state poison; > struct cxl_security_state security; > @@ -705,6 +730,29 @@ struct cxl_mbox_set_partition_info { > > #define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0) > > +struct cxl_mbox_get_dc_config_in { > + u8 region_count; > + u8 start_region_index; > +} __packed; > + > +/* See CXL 3.0 Table 125 get dynamic capacity config Output Payload */ > +struct cxl_mbox_get_dc_config_out { > + u8 avail_region_count; > + u8 rsvd[7]; > + struct cxl_dc_region_config { > + __le64 region_base; > + __le64 region_decode_length; > + __le64 region_length; > + __le64 region_block_size; > + __le32 region_dsmad_handle; > + u8 flags; > + u8 rsvd[3]; > + } __packed region[]; > +} __packed; > +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0) > +#define CXL_REGIONS_RETURNED(size_out) \ > + ((size_out - 8) / sizeof(struct cxl_dc_region_config)) Although the result may be unchanged, but in cxl spec r3.1, there are four fields after the region configuration structure. Fan > + > /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */ > struct cxl_mbox_set_timestamp_in { > __le64 timestamp; > @@ -828,6 +876,7 @@ enum { > int cxl_internal_send_cmd(struct cxl_memdev_state *mds, > struct cxl_mbox_cmd *cmd); > int cxl_dev_state_identify(struct cxl_memdev_state *mds); > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds); > int cxl_await_media_ready(struct cxl_dev_state *cxlds); > int cxl_enumerate_cmds(struct cxl_memdev_state *mds); > int cxl_mem_create_range_info(struct cxl_memdev_state *mds); > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c > index 2ff361e756d6..216881455364 100644 > --- a/drivers/cxl/pci.c > +++ b/drivers/cxl/pci.c > @@ -874,6 +874,10 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > if (rc) > return rc; > > + rc = cxl_dev_dynamic_capacity_identify(mds); > + if (rc) > + return rc; > + > rc = cxl_mem_create_range_info(mds); > if (rc) > return rc; > > -- > 2.44.0 >
On 3/25/24 00:18, ira.weiny@intel.com wrote: > From: Navneet Singh <navneet.singh@intel.com> > > Devices can optionally support Dynamic Capacity (DC). These devices are > known as Dynamic Capacity Devices (DCD). > > Implement the DC mailbox commands as specified in CXL 3.1 section > 8.2.9.9.9 (opcodes 48XXh). Read the DC configuration and store the DC > region information in the device state. > > Signed-off-by: Navneet Singh <navneet.singh@intel.com> > Co-developed-by: Ira Weiny <ira.weiny@intel.com> > Signed-off-by: Ira Weiny <ira.weiny@intel.com> > > --- > Changes for v1 > [Jørgen: ensure CXL 2.0 device support by removing dc_event_log_size] > [iweiny/Jørgen: use get DC config command to signal DCD support] > [djiang: fix subject] > [Fan: add additional region configuration checks] > [Jonathan/djiang: split out region mode changes] > [Jonathan: fix up comments/kdoc] > [Jonathan: s/cxl_get_dc_id/cxl_get_dc_config/] > [Jonathan: use __free() in identify call] > [Jonathan: remove unneeded formatting changes] > [Jonathan: s/cxl_mbox_dynamic_capacity/cxl_mbox_get_dc_config_out/] > [Jonathan: s/cxl_mbox_get_dc_config/cxl_mbox_get_dc_config_in/] > [iweiny: remove type2 work dependancy/rebase on master] > [iweiny: fix 0day build issues] > --- > drivers/cxl/core/mbox.c | 184 +++++++++++++++++++++++++++++++++++++++++++++++- > drivers/cxl/cxlmem.h | 49 +++++++++++++ > drivers/cxl/pci.c | 4 ++ > 3 files changed, 236 insertions(+), 1 deletion(-) > > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > index ed4131c6f50b..14e8a7528a8b 100644 > --- a/drivers/cxl/core/mbox.c > +++ b/drivers/cxl/core/mbox.c > @@ -1123,7 +1123,7 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds) > if (rc < 0) > return rc; > > - mds->total_bytes = > + mds->static_cap = > le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER; > mds->volatile_only_bytes = > le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER; > @@ -1230,6 +1230,175 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd) > return rc; > } > > +static int cxl_dc_save_region_info(struct cxl_memdev_state *mds, u8 index, > + struct cxl_dc_region_config *region_config) > +{ > + struct cxl_dc_region_info *dcr = &mds->dc_region[index]; > + struct device *dev = mds->cxlds.dev; > + > + dcr->base = le64_to_cpu(region_config->region_base); > + dcr->decode_len = le64_to_cpu(region_config->region_decode_length); > + dcr->decode_len *= CXL_CAPACITY_MULTIPLIER; > + dcr->len = le64_to_cpu(region_config->region_length); > + dcr->blk_size = le64_to_cpu(region_config->region_block_size); > + dcr->dsmad_handle = le32_to_cpu(region_config->region_dsmad_handle); > + dcr->flags = region_config->flags; > + snprintf(dcr->name, CXL_DC_REGION_STRLEN, "dc%d", index); > + > + /* Check regions are in increasing DPA order */ > + if (index > 0) { > + struct cxl_dc_region_info *prev_dcr = &mds->dc_region[index - 1]; > + > + if ((prev_dcr->base + prev_dcr->decode_len) > dcr->base) { > + dev_err(dev, > + "DPA ordering violation for DC region %d and %d\n", > + index - 1, index); > + return -EINVAL; > + } > + } > + > + if (!IS_ALIGNED(dcr->base, SZ_256M) || > + !IS_ALIGNED(dcr->base, dcr->blk_size)) { > + dev_err(dev, "DC region %d invalid base %#llx blk size %#llx\n", index, > + dcr->base, dcr->blk_size); > + return -EINVAL; > + } > + > + if (dcr->decode_len == 0 || dcr->len == 0 || dcr->decode_len < dcr->len || > + !IS_ALIGNED(dcr->len, dcr->blk_size)) { > + dev_err(dev, "DC region %d invalid length; decode %#llx len %#llx blk size %#llx\n", > + index, dcr->decode_len, dcr->len, dcr->blk_size); > + return -EINVAL; > + } > + > + if (dcr->blk_size == 0 || dcr->blk_size % 0x40 || > + !is_power_of_2(dcr->blk_size)) { > + dev_err(dev, "DC region %d invalid block size; %#llx\n", > + index, dcr->blk_size); > + return -EINVAL; > + } > + > + dev_dbg(dev, > + "DC region %s DPA: %#llx LEN: %#llx BLKSZ: %#llx\n", > + dcr->name, dcr->base, dcr->decode_len, dcr->blk_size); > + > + return 0; > +} > + > +/* Returns the number of regions in dc_resp or -ERRNO */ > +static int cxl_get_dc_config(struct cxl_memdev_state *mds, u8 start_region, > + struct cxl_mbox_get_dc_config_out *dc_resp, > + size_t dc_resp_size) > +{ > + struct cxl_mbox_get_dc_config_in get_dc = (struct cxl_mbox_get_dc_config_in) { > + .region_count = CXL_MAX_DC_REGION, > + .start_region_index = start_region, > + }; > + struct cxl_mbox_cmd mbox_cmd = (struct cxl_mbox_cmd) { > + .opcode = CXL_MBOX_OP_GET_DC_CONFIG, > + .payload_in = &get_dc, > + .size_in = sizeof(get_dc), > + .size_out = dc_resp_size, > + .payload_out = dc_resp, > + .min_out = 1, > + }; > + struct device *dev = mds->cxlds.dev; > + int rc; > + > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > + if (rc < 0) > + return rc; > + > + rc = dc_resp->avail_region_count - start_region; > + > + /* > + * The number of regions in the payload may have been truncated due to > + * payload_size limits; if so adjust the returned count to match. > + */ > + if (mbox_cmd.size_out < sizeof(*dc_resp)) > + rc = CXL_REGIONS_RETURNED(mbox_cmd.size_out); > + > + dev_dbg(dev, "Read %d/%d DC regions\n", rc, dc_resp->avail_region_count); > + > + return rc; > +} > + > +static bool cxl_dcd_supported(struct cxl_memdev_state *mds) > +{ > + return test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds); > +} > + > +/** > + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity > + * information from the device. > + * @mds: The memory device state > + * > + * Read Dynamic Capacity information from the device and populate the state > + * structures for later use. > + * > + * Return: 0 if identify was executed successfully, -ERRNO on error. > + */ > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) > +{ > + size_t dc_resp_size = mds->payload_size; > + struct device *dev = mds->cxlds.dev; > + u8 start_region, i; > + int rc = 0; > + > + for (i = 0; i < CXL_MAX_DC_REGION; i++) > + snprintf(mds->dc_region[i].name, CXL_DC_REGION_STRLEN, "<nil>"); > + > + /* Check GET_DC_CONFIG is supported by device */ > + if (!cxl_dcd_supported(mds)) { > + dev_dbg(dev, "DCD not supported\n"); > + return 0; > + } > + > + struct cxl_mbox_get_dc_config_out *dc_resp __free(kfree) = > + kvmalloc(dc_resp_size, GFP_KERNEL); > + if (!dc_resp) > + return -ENOMEM; > + > + start_region = 0; > + do { > + int j; > + > + rc = cxl_get_dc_config(mds, start_region, dc_resp, dc_resp_size); > + if (rc < 0) { > + dev_dbg(dev, "Failed to get DC config: %d\n", rc); > + return rc; > + } > + > + mds->nr_dc_region += rc; > + > + if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) { > + dev_err(dev, "Invalid num of dynamic capacity regions %d\n", > + mds->nr_dc_region); > + return -EINVAL; > + } > + > + for (i = start_region, j = 0; i < mds->nr_dc_region; i++, j++) { > + rc = cxl_dc_save_region_info(mds, i, &dc_resp->region[j]); > + if (rc) { > + dev_dbg(dev, "Failed to save region info: %d\n", rc); > + return rc; > + } > + } > + > + start_region = mds->nr_dc_region; > + > + } while (mds->nr_dc_region < dc_resp->avail_region_count); > + > + mds->dynamic_cap = > + mds->dc_region[mds->nr_dc_region - 1].base + > + mds->dc_region[mds->nr_dc_region - 1].decode_len - > + mds->dc_region[0].base; > + dev_dbg(dev, "Total dynamic capacity: %#llx\n", mds->dynamic_cap); > + > + return 0; > +} > +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); > + > static int add_dpa_res(struct device *dev, struct resource *parent, > struct resource *res, resource_size_t start, > resource_size_t size, const char *type) > @@ -1260,8 +1429,12 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds) > { > struct cxl_dev_state *cxlds = &mds->cxlds; > struct device *dev = cxlds->dev; > + size_t untenanted_mem; > int rc; > > + untenanted_mem = mds->dc_region[0].base - mds->static_cap; > + mds->total_bytes = mds->static_cap + untenanted_mem + mds->dynamic_cap; > + > if (!cxlds->media_ready) { > cxlds->dpa_res = DEFINE_RES_MEM(0, 0); > cxlds->ram_res = DEFINE_RES_MEM(0, 0); > @@ -1271,6 +1444,15 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds) > > cxlds->dpa_res = DEFINE_RES_MEM(0, mds->total_bytes); > > + for (int i = 0; i < mds->nr_dc_region; i++) { > + struct cxl_dc_region_info *dcr = &mds->dc_region[i]; > + > + rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i], > + dcr->base, dcr->decode_len, dcr->name); > + if (rc) > + return rc; > + } > + > if (mds->partition_align_bytes == 0) { > rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0, > mds->volatile_only_bytes, "ram"); > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > index 79a67cff9143..4624cf612c1e 100644 > --- a/drivers/cxl/cxlmem.h > +++ b/drivers/cxl/cxlmem.h > @@ -402,6 +402,7 @@ enum cxl_devtype { > CXL_DEVTYPE_CLASSMEM, > }; > > +#define CXL_MAX_DC_REGION 8 > /** > * struct cxl_dpa_perf - DPA performance property entry > * @dpa_range - range for DPA address > @@ -431,6 +432,8 @@ struct cxl_dpa_perf { > * @dpa_res: Overall DPA resource tree for the device > * @pmem_res: Active Persistent memory capacity configuration > * @ram_res: Active Volatile memory capacity configuration > + * @dc_res: Active Dynamic Capacity memory configuration for each possible > + * region > * @serial: PCIe Device Serial Number > * @type: Generic Memory Class device or Vendor Specific Memory device > */ > @@ -445,10 +448,22 @@ struct cxl_dev_state { > struct resource dpa_res; > struct resource pmem_res; > struct resource ram_res; > + struct resource dc_res[CXL_MAX_DC_REGION]; > u64 serial; > enum cxl_devtype type; > }; > > +#define CXL_DC_REGION_STRLEN 8 > +struct cxl_dc_region_info { > + u64 base; > + u64 decode_len; > + u64 len; > + u64 blk_size; > + u32 dsmad_handle; > + u8 flags; > + u8 name[CXL_DC_REGION_STRLEN]; > +}; > + > /** > * struct cxl_memdev_state - Generic Type-3 Memory Device Class driver data > * > @@ -467,6 +482,8 @@ struct cxl_dev_state { > * @enabled_cmds: Hardware commands found enabled in CEL. > * @exclusive_cmds: Commands that are kernel-internal only > * @total_bytes: sum of all possible capacities > + * @static_cap: Sum of static RAM and PMEM capacities > + * @dynamic_cap: Complete DPA range occupied by DC regions How about naming these total_range, static_cap and dynamic_range to make it clear that the DPA range occupied by DC regions isn't necessarily usable capacity (as opposed to the static_cap where the spec defines it as usable capacity). Thanks, Jørgen > * @volatile_only_bytes: hard volatile capacity > * @persistent_only_bytes: hard persistent capacity > * @partition_align_bytes: alignment size for partition-able capacity > @@ -474,6 +491,8 @@ struct cxl_dev_state { > * @active_persistent_bytes: sum of hard + soft persistent > * @next_volatile_bytes: volatile capacity change pending device reset > * @next_persistent_bytes: persistent capacity change pending device reset > + * @nr_dc_region: number of DC regions implemented in the memory device > + * @dc_region: array containing info about the DC regions > * @event: event log driver state > * @poison: poison driver state info > * @security: security driver state info > @@ -494,7 +513,10 @@ struct cxl_memdev_state { > DECLARE_BITMAP(dcd_cmds, CXL_DCD_ENABLED_MAX); > DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX); > DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX); > + > u64 total_bytes; > + u64 static_cap; > + u64 dynamic_cap; > u64 volatile_only_bytes; > u64 persistent_only_bytes; > u64 partition_align_bytes; > @@ -506,6 +528,9 @@ struct cxl_memdev_state { > struct cxl_dpa_perf ram_perf; > struct cxl_dpa_perf pmem_perf; > > + u8 nr_dc_region; > + struct cxl_dc_region_info dc_region[CXL_MAX_DC_REGION]; > + > struct cxl_event_state event; > struct cxl_poison_state poison; > struct cxl_security_state security; > @@ -705,6 +730,29 @@ struct cxl_mbox_set_partition_info { > > #define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0) > > +struct cxl_mbox_get_dc_config_in { > + u8 region_count; > + u8 start_region_index; > +} __packed; > + > +/* See CXL 3.0 Table 125 get dynamic capacity config Output Payload */ > +struct cxl_mbox_get_dc_config_out { > + u8 avail_region_count; > + u8 rsvd[7]; > + struct cxl_dc_region_config { > + __le64 region_base; > + __le64 region_decode_length; > + __le64 region_length; > + __le64 region_block_size; > + __le32 region_dsmad_handle; > + u8 flags; > + u8 rsvd[3]; > + } __packed region[]; > +} __packed; > +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0) > +#define CXL_REGIONS_RETURNED(size_out) \ > + ((size_out - 8) / sizeof(struct cxl_dc_region_config)) > + > /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */ > struct cxl_mbox_set_timestamp_in { > __le64 timestamp; > @@ -828,6 +876,7 @@ enum { > int cxl_internal_send_cmd(struct cxl_memdev_state *mds, > struct cxl_mbox_cmd *cmd); > int cxl_dev_state_identify(struct cxl_memdev_state *mds); > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds); > int cxl_await_media_ready(struct cxl_dev_state *cxlds); > int cxl_enumerate_cmds(struct cxl_memdev_state *mds); > int cxl_mem_create_range_info(struct cxl_memdev_state *mds); > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c > index 2ff361e756d6..216881455364 100644 > --- a/drivers/cxl/pci.c > +++ b/drivers/cxl/pci.c > @@ -874,6 +874,10 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > if (rc) > return rc; > > + rc = cxl_dev_dynamic_capacity_identify(mds); > + if (rc) > + return rc; > + > rc = cxl_mem_create_range_info(mds); > if (rc) > return rc; > > -- > 2.44.0 > >
Jonathan Cameron wrote: > On Sun, 24 Mar 2024 16:18:06 -0700 > ira.weiny@intel.com wrote: > [snip] > > + > > + /* Check regions are in increasing DPA order */ > > + if (index > 0) { > > + struct cxl_dc_region_info *prev_dcr = &mds->dc_region[index - 1]; > > + > > + if ((prev_dcr->base + prev_dcr->decode_len) > dcr->base) { > > + dev_err(dev, > > + "DPA ordering violation for DC region %d and %d\n", > > + index - 1, index); > > + return -EINVAL; > > + } > > + } > > + > > + if (!IS_ALIGNED(dcr->base, SZ_256M) || > > + !IS_ALIGNED(dcr->base, dcr->blk_size)) { > > + dev_err(dev, "DC region %d invalid base %#llx blk size %#llx\n", index, > > Odd choice of line wrap. I'd drag index onto the line below. fixed. > > > + dcr->base, dcr->blk_size); > > + return -EINVAL; > > + } > > + > > + if (dcr->decode_len == 0 || dcr->len == 0 || dcr->decode_len < dcr->len || > > + !IS_ALIGNED(dcr->len, dcr->blk_size)) { > > + dev_err(dev, "DC region %d invalid length; decode %#llx len %#llx blk size %#llx\n", > > + index, dcr->decode_len, dcr->len, dcr->blk_size); > > + return -EINVAL; > > + } > > + > > + if (dcr->blk_size == 0 || dcr->blk_size % 0x40 || > > Hmm. I thought we had a define for CXL 'cacheline' size, but can't find it now. > If not we should add one (and find a better name than that). Asking me to add a define is fine... Asking me to name said define is... The issue... I am absolute rubbish at picking names... :-/ ;-) :-D > > > + !is_power_of_2(dcr->blk_size)) { > > + dev_err(dev, "DC region %d invalid block size; %#llx\n", > > + index, dcr->blk_size); > > + return -EINVAL; > > + } > > + > > + dev_dbg(dev, > > + "DC region %s DPA: %#llx LEN: %#llx BLKSZ: %#llx\n", > > + dcr->name, dcr->base, dcr->decode_len, dcr->blk_size); > > + > > + return 0; > > +} > > + > > +/* Returns the number of regions in dc_resp or -ERRNO */ > > +static int cxl_get_dc_config(struct cxl_memdev_state *mds, u8 start_region, > > + struct cxl_mbox_get_dc_config_out *dc_resp, > > + size_t dc_resp_size) > > +{ > > + struct cxl_mbox_get_dc_config_in get_dc = (struct cxl_mbox_get_dc_config_in) { > > + .region_count = CXL_MAX_DC_REGION, > > + .start_region_index = start_region, > > + }; > > + struct cxl_mbox_cmd mbox_cmd = (struct cxl_mbox_cmd) { > > + .opcode = CXL_MBOX_OP_GET_DC_CONFIG, > > + .payload_in = &get_dc, > > + .size_in = sizeof(get_dc), > > + .size_out = dc_resp_size, > > + .payload_out = dc_resp, > > + .min_out = 1, > > + }; > > + struct device *dev = mds->cxlds.dev; > > + int rc; > > + > > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > > + if (rc < 0) > > + return rc; > > + > > + rc = dc_resp->avail_region_count - start_region; > > + > > + /* > > + * The number of regions in the payload may have been truncated due to > > + * payload_size limits; if so adjust the returned count to match. > > + */ > > + if (mbox_cmd.size_out < sizeof(*dc_resp)) > > + rc = CXL_REGIONS_RETURNED(mbox_cmd.size_out); > > Why not always return this? If there was space, doesn't it equal > the value set above anyway? I've been looking at this more carefully and there is a bigger issue with this. I need to update this code to handle the regions_returned which was added in the errata and get rid of this macro. > > > + > > + dev_dbg(dev, "Read %d/%d DC regions\n", rc, dc_resp->avail_region_count); > > + > > + return rc; > > +} > > > +/** > > + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity > > + * information from the device. > > + * @mds: The memory device state > > + * > > + * Read Dynamic Capacity information from the device and populate the state > > + * structures for later use. > > + * > > + * Return: 0 if identify was executed successfully, -ERRNO on error. > > + */ > > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) > > +{ > > + size_t dc_resp_size = mds->payload_size; > > + struct device *dev = mds->cxlds.dev; > > + u8 start_region, i; > > + int rc = 0; > > Is this used before being set? nope... > > > + > > + for (i = 0; i < CXL_MAX_DC_REGION; i++) > > + snprintf(mds->dc_region[i].name, CXL_DC_REGION_STRLEN, "<nil>"); > > + > > + /* Check GET_DC_CONFIG is supported by device */ > > + if (!cxl_dcd_supported(mds)) { > > + dev_dbg(dev, "DCD not supported\n"); > > + return 0; > > + } > > + > > + struct cxl_mbox_get_dc_config_out *dc_resp __free(kfree) = > > + kvmalloc(dc_resp_size, GFP_KERNEL); > > + if (!dc_resp) > > + return -ENOMEM; > > + > > + start_region = 0; > > + do { > > + int j; > > + > > + rc = cxl_get_dc_config(mds, start_region, dc_resp, dc_resp_size); > > + if (rc < 0) { > > + dev_dbg(dev, "Failed to get DC config: %d\n", rc); > > + return rc; > > + } > > + > > + mds->nr_dc_region += rc; > > + > > + if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) { > > + dev_err(dev, "Invalid num of dynamic capacity regions %d\n", > > + mds->nr_dc_region); > > + return -EINVAL; > > + } > > + > > + for (i = start_region, j = 0; i < mds->nr_dc_region; i++, j++) { > > + rc = cxl_dc_save_region_info(mds, i, &dc_resp->region[j]); > > + if (rc) { > > + dev_dbg(dev, "Failed to save region info: %d\n", rc); > > + return rc; > > + } > > + } > > + > > + start_region = mds->nr_dc_region; > > + > > + } while (mds->nr_dc_region < dc_resp->avail_region_count); > > + > > + mds->dynamic_cap = > > + mds->dc_region[mds->nr_dc_region - 1].base + > > + mds->dc_region[mds->nr_dc_region - 1].decode_len - > > + mds->dc_region[0].base; > > + dev_dbg(dev, "Total dynamic capacity: %#llx\n", mds->dynamic_cap); > > + > > + return 0; > > +} > > +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); > > > > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > > index 79a67cff9143..4624cf612c1e 100644 > > --- a/drivers/cxl/cxlmem.h > > +++ b/drivers/cxl/cxlmem.h > > > /** > > * struct cxl_memdev_state - Generic Type-3 Memory Device Class driver data > > * > > @@ -467,6 +482,8 @@ struct cxl_dev_state { > > * @enabled_cmds: Hardware commands found enabled in CEL. > > * @exclusive_cmds: Commands that are kernel-internal only > > * @total_bytes: sum of all possible capacities > > + * @static_cap: Sum of static RAM and PMEM capacities > > + * @dynamic_cap: Complete DPA range occupied by DC regions > > * @volatile_only_bytes: hard volatile capacity > > * @persistent_only_bytes: hard persistent capacity > > * @partition_align_bytes: alignment size for partition-able capacity > > @@ -474,6 +491,8 @@ struct cxl_dev_state { > > * @active_persistent_bytes: sum of hard + soft persistent > > * @next_volatile_bytes: volatile capacity change pending device reset > > * @next_persistent_bytes: persistent capacity change pending device reset > > Looks like we have some ordering issues ram_perf and pmem_perf (at least) > that we should fix up as a precursor. I sent a reply to the QoS patch > that added these. I see. That will likely resolve out when I rebase. But seems nothing to be done for this patch and best left as a separate patch from this series. > > > + * @nr_dc_region: number of DC regions implemented in the memory device > > + * @dc_region: array containing info about the DC regions > > * @event: event log driver state > > * @poison: poison driver state info > > * @security: security driver state info > > @@ -494,7 +513,10 @@ struct cxl_memdev_state { > > DECLARE_BITMAP(dcd_cmds, CXL_DCD_ENABLED_MAX); > > DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX); > > DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX); > > + > Trivial but this is an unrelated change and shouldn't be in this patch. > > > u64 total_bytes; > > + u64 static_cap; > > + u64 dynamic_cap; > > u64 volatile_only_bytes; > > u64 persistent_only_bytes; > > u64 partition_align_bytes; > > @@ -506,6 +528,9 @@ struct cxl_memdev_state { > > struct cxl_dpa_perf ram_perf; > > struct cxl_dpa_perf pmem_perf; > > > > + u8 nr_dc_region; > > + struct cxl_dc_region_info dc_region[CXL_MAX_DC_REGION]; > > + > > struct cxl_event_state event; > > struct cxl_poison_state poison; > > struct cxl_security_state security; > > > + > > +/* See CXL 3.0 Table 125 get dynamic capacity config Output Payload */ > > +struct cxl_mbox_get_dc_config_out { > > + u8 avail_region_count; > > + u8 rsvd[7]; > > + struct cxl_dc_region_config { > > + __le64 region_base; > > + __le64 region_decode_length; > > + __le64 region_length; > > + __le64 region_block_size; > > + __le32 region_dsmad_handle; > > + u8 flags; > > + u8 rsvd[3]; > > + } __packed region[]; > > +} __packed; > > +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0) > > +#define CXL_REGIONS_RETURNED(size_out) \ > > + ((size_out - 8) / sizeof(struct cxl_dc_region_config)) > > Can we make that 8 self documenting? > offsetof(struct cxl_dc_region_config, region) perhaps? As I said above I think this macro is wrong I'm adjusting to remove it. Thanks, Ira
fan wrote: > On Sun, Mar 24, 2024 at 04:18:06PM -0700, ira.weiny@intel.com wrote: > > From: Navneet Singh <navneet.singh@intel.com> > > [snip] > > > > +struct cxl_mbox_get_dc_config_in { > > + u8 region_count; > > + u8 start_region_index; > > +} __packed; > > + > > +/* See CXL 3.0 Table 125 get dynamic capacity config Output Payload */ > > +struct cxl_mbox_get_dc_config_out { > > + u8 avail_region_count; > > + u8 rsvd[7]; > > + struct cxl_dc_region_config { > > + __le64 region_base; > > + __le64 region_decode_length; > > + __le64 region_length; > > + __le64 region_block_size; > > + __le32 region_dsmad_handle; > > + u8 flags; > > + u8 rsvd[3]; > > + } __packed region[]; > > +} __packed; > > +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0) > > +#define CXL_REGIONS_RETURNED(size_out) \ > > + ((size_out - 8) / sizeof(struct cxl_dc_region_config)) > > Although the result may be unchanged, but in cxl spec r3.1, there are four > fields after the region configuration structure. Yes. This macro is not needed. The fields after the structure are of little use to the host at this time. So I'm going to leave them out until a use can be found for them. Ira
Jørgen Hansen wrote: > On 3/25/24 00:18, ira.weiny@intel.com wrote: > > > From: Navneet Singh <navneet.singh@intel.com> > > [snip] > > /** > > * struct cxl_memdev_state - Generic Type-3 Memory Device Class driver data > > * > > @@ -467,6 +482,8 @@ struct cxl_dev_state { > > * @enabled_cmds: Hardware commands found enabled in CEL. > > * @exclusive_cmds: Commands that are kernel-internal only > > * @total_bytes: sum of all possible capacities > > + * @static_cap: Sum of static RAM and PMEM capacities > > + * @dynamic_cap: Complete DPA range occupied by DC regions > > How about naming these total_range, static_cap and dynamic_range to make > it clear that the DPA range occupied by DC regions isn't necessarily > usable capacity (as opposed to the static_cap where the spec defines it > as usable capacity). I thought this was a good idea but on second thought these are not range variables at all. They really represent the various lengths of the resources. For total_bytes the documentation already says 'sum of all __possible__ capacities'. I think you have a point for the new fields though. They should all be named in some consistent manner and documented as such. So I propose: diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 94531af018f8..9c18b229f69a 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -481,9 +481,9 @@ struct cxl_dc_region_info { * @dcd_cmds: List of DCD commands implemented by memory device * @enabled_cmds: Hardware commands found enabled in CEL. * @exclusive_cmds: Commands that are kernel-internal only - * @total_bytes: sum of all possible capacities - * @static_cap: Sum of static RAM and PMEM capacities - * @dynamic_cap: Complete DPA range occupied by DC regions + * @total_bytes: length of all possible capacities + * @static_bytes: length of possible static RAM and PMEM partitions + * @dynamic_bytes: length of possible DC partitions (DC Regions) * @volatile_only_bytes: hard volatile capacity * @persistent_only_bytes: hard persistent capacity * @partition_align_bytes: alignment size for partition-able capacity @@ -515,8 +515,8 @@ struct cxl_memdev_state { DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX); u64 total_bytes; - u64 static_cap; - u64 dynamic_cap; + u64 static_bytes; + u64 dynamic_bytes; u64 volatile_only_bytes; u64 persistent_only_bytes; u64 partition_align_bytes;
On Sun, Mar 24, 2024 at 04:18:06PM -0700, Ira Weiny wrote: > From: Navneet Singh <navneet.singh@intel.com> > > Devices can optionally support Dynamic Capacity (DC). These devices are > known as Dynamic Capacity Devices (DCD). > > Implement the DC mailbox commands as specified in CXL 3.1 section > 8.2.9.9.9 (opcodes 48XXh). Read the DC configuration and store the DC > region information in the device state. It seems worth mentioning that it validates against a bunch of alignment rules. Speaking of which... > > Signed-off-by: Navneet Singh <navneet.singh@intel.com> > Co-developed-by: Ira Weiny <ira.weiny@intel.com> > Signed-off-by: Ira Weiny <ira.weiny@intel.com> > > --- > Changes for v1 > [Jørgen: ensure CXL 2.0 device support by removing dc_event_log_size] > [iweiny/Jørgen: use get DC config command to signal DCD support] > [djiang: fix subject] > [Fan: add additional region configuration checks] > [Jonathan/djiang: split out region mode changes] > [Jonathan: fix up comments/kdoc] > [Jonathan: s/cxl_get_dc_id/cxl_get_dc_config/] > [Jonathan: use __free() in identify call] > [Jonathan: remove unneeded formatting changes] > [Jonathan: s/cxl_mbox_dynamic_capacity/cxl_mbox_get_dc_config_out/] > [Jonathan: s/cxl_mbox_get_dc_config/cxl_mbox_get_dc_config_in/] > [iweiny: remove type2 work dependancy/rebase on master] > [iweiny: fix 0day build issues] > --- > drivers/cxl/core/mbox.c | 184 +++++++++++++++++++++++++++++++++++++++++++++++- > drivers/cxl/cxlmem.h | 49 +++++++++++++ > drivers/cxl/pci.c | 4 ++ > 3 files changed, 236 insertions(+), 1 deletion(-) > > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > index ed4131c6f50b..14e8a7528a8b 100644 > --- a/drivers/cxl/core/mbox.c > +++ b/drivers/cxl/core/mbox.c > @@ -1123,7 +1123,7 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds) > if (rc < 0) > return rc; > > - mds->total_bytes = > + mds->static_cap = > le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER; > mds->volatile_only_bytes = > le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER; > @@ -1230,6 +1230,175 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd) > return rc; > } > > +static int cxl_dc_save_region_info(struct cxl_memdev_state *mds, u8 index, > + struct cxl_dc_region_config *region_config) > +{ > + struct cxl_dc_region_info *dcr = &mds->dc_region[index]; > + struct device *dev = mds->cxlds.dev; > + > + dcr->base = le64_to_cpu(region_config->region_base); > + dcr->decode_len = le64_to_cpu(region_config->region_decode_length); > + dcr->decode_len *= CXL_CAPACITY_MULTIPLIER; > + dcr->len = le64_to_cpu(region_config->region_length); > + dcr->blk_size = le64_to_cpu(region_config->region_block_size); > + dcr->dsmad_handle = le32_to_cpu(region_config->region_dsmad_handle); > + dcr->flags = region_config->flags; > + snprintf(dcr->name, CXL_DC_REGION_STRLEN, "dc%d", index); > + Below - where are these rules defined in CXL spec? Maybe one general comment referring to a CXL spec section if available? > + /* Check regions are in increasing DPA order */ Better to state the rule and who's rule it is: /* CXL spec mandates increasing DPA order */ > + /* Check regions are in increasing DPA order */ > + if (index > 0) { > + struct cxl_dc_region_info *prev_dcr = &mds->dc_region[index - 1]; > + > + if ((prev_dcr->base + prev_dcr->decode_len) > dcr->base) Is that allowing overlap at dcr->base? > + dev_err(dev, > + "DPA ordering violation for DC region %d and %d\n", > + index - 1, index); > + return -EINVAL; > + } > + } > + > + if (!IS_ALIGNED(dcr->base, SZ_256M) || > + !IS_ALIGNED(dcr->base, dcr->blk_size)) { > + dev_err(dev, "DC region %d invalid base %#llx blk size %#llx\n", index, > + dcr->base, dcr->blk_size); > + return -EINVAL; > + } > + > + if (dcr->decode_len == 0 || dcr->len == 0 || dcr->decode_len < dcr->len || > + !IS_ALIGNED(dcr->len, dcr->blk_size)) { > + dev_err(dev, "DC region %d invalid length; decode %#llx len %#llx blk size %#llx\n", > + index, dcr->decode_len, dcr->len, dcr->blk_size); > + return -EINVAL; > + } > + > + if (dcr->blk_size == 0 || dcr->blk_size % 0x40 || OK - I know 0x40 must be cache align, but only because I saw Jonathans comment. Please comment or macro. > + !is_power_of_2(dcr->blk_size)) { > + dev_err(dev, "DC region %d invalid block size; %#llx\n", > + index, dcr->blk_size); > + return -EINVAL; > + } > + > + dev_dbg(dev, > + "DC region %s DPA: %#llx LEN: %#llx BLKSZ: %#llx\n", > + dcr->name, dcr->base, dcr->decode_len, dcr->blk_size); > + > + return 0; > +} > + > +/* Returns the number of regions in dc_resp or -ERRNO */ > +static int cxl_get_dc_config(struct cxl_memdev_state *mds, u8 start_region, > + struct cxl_mbox_get_dc_config_out *dc_resp, > + size_t dc_resp_size) > +{ > + struct cxl_mbox_get_dc_config_in get_dc = (struct cxl_mbox_get_dc_config_in) { > + .region_count = CXL_MAX_DC_REGION, > + .start_region_index = start_region, > + }; > + struct cxl_mbox_cmd mbox_cmd = (struct cxl_mbox_cmd) { > + .opcode = CXL_MBOX_OP_GET_DC_CONFIG, > + .payload_in = &get_dc, > + .size_in = sizeof(get_dc), > + .size_out = dc_resp_size, > + .payload_out = dc_resp, > + .min_out = 1, > + }; > + struct device *dev = mds->cxlds.dev; > + int rc; > + > + rc = cxl_internal_send_cmd(mds, &mbox_cmd); > + if (rc < 0) > + return rc; > + > + rc = dc_resp->avail_region_count - start_region; > + > + /* > + * The number of regions in the payload may have been truncated due to > + * payload_size limits; if so adjust the returned count to match. > + */ > + if (mbox_cmd.size_out < sizeof(*dc_resp)) > + rc = CXL_REGIONS_RETURNED(mbox_cmd.size_out); > + > + dev_dbg(dev, "Read %d/%d DC regions\n", rc, dc_resp->avail_region_count); > + > + return rc; > +} > + > +static bool cxl_dcd_supported(struct cxl_memdev_state *mds) > +{ > + return test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds); > +} > + > +/** > + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity > + * information from the device. > + * @mds: The memory device state > + * > + * Read Dynamic Capacity information from the device and populate the state > + * structures for later use. > + * > + * Return: 0 if identify was executed successfully, -ERRNO on error. > + */ > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) > +{ > + size_t dc_resp_size = mds->payload_size; > + struct device *dev = mds->cxlds.dev; > + u8 start_region, i; > + int rc = 0; > + > + for (i = 0; i < CXL_MAX_DC_REGION; i++) > + snprintf(mds->dc_region[i].name, CXL_DC_REGION_STRLEN, "<nil>"); > + > + /* Check GET_DC_CONFIG is supported by device */ Needless comment above due to nicely named cxl_dcd_supported() below > + if (!cxl_dcd_supported(mds)) { > + dev_dbg(dev, "DCD not supported\n"); > + return 0; > + } > + > + struct cxl_mbox_get_dc_config_out *dc_resp __free(kfree) = > + kvmalloc(dc_resp_size, GFP_KERNEL); > + if (!dc_resp) > + return -ENOMEM; > + > + start_region = 0; > + do { > + int j; > + > + rc = cxl_get_dc_config(mds, start_region, dc_resp, dc_resp_size); > + if (rc < 0) { > + dev_dbg(dev, "Failed to get DC config: %d\n", rc); > + return rc; > + } > + > + mds->nr_dc_region += rc; > + > + if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) { > + dev_err(dev, "Invalid num of dynamic capacity regions %d\n", > + mds->nr_dc_region); > + return -EINVAL; > + } > + > + for (i = start_region, j = 0; i < mds->nr_dc_region; i++, j++) { > + rc = cxl_dc_save_region_info(mds, i, &dc_resp->region[j]); > + if (rc) { > + dev_dbg(dev, "Failed to save region info: %d\n", rc); > + return rc; > + } > + } > + > + start_region = mds->nr_dc_region; > + > + } while (mds->nr_dc_region < dc_resp->avail_region_count); > + > + mds->dynamic_cap = > + mds->dc_region[mds->nr_dc_region - 1].base + > + mds->dc_region[mds->nr_dc_region - 1].decode_len - > + mds->dc_region[0].base; > + dev_dbg(dev, "Total dynamic capacity: %#llx\n", mds->dynamic_cap); > + > + return 0; > +} > +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); > + > static int add_dpa_res(struct device *dev, struct resource *parent, > struct resource *res, resource_size_t start, > resource_size_t size, const char *type) > @@ -1260,8 +1429,12 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds) > { > struct cxl_dev_state *cxlds = &mds->cxlds; > struct device *dev = cxlds->dev; > + size_t untenanted_mem; > int rc; > > + untenanted_mem = mds->dc_region[0].base - mds->static_cap; > + mds->total_bytes = mds->static_cap + untenanted_mem + mds->dynamic_cap; > + > if (!cxlds->media_ready) { > cxlds->dpa_res = DEFINE_RES_MEM(0, 0); > cxlds->ram_res = DEFINE_RES_MEM(0, 0); > @@ -1271,6 +1444,15 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds) > > cxlds->dpa_res = DEFINE_RES_MEM(0, mds->total_bytes); > > + for (int i = 0; i < mds->nr_dc_region; i++) { > + struct cxl_dc_region_info *dcr = &mds->dc_region[i]; > + > + rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i], > + dcr->base, dcr->decode_len, dcr->name); > + if (rc) > + return rc; > + } > + > if (mds->partition_align_bytes == 0) { > rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0, > mds->volatile_only_bytes, "ram"); > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > index 79a67cff9143..4624cf612c1e 100644 > --- a/drivers/cxl/cxlmem.h > +++ b/drivers/cxl/cxlmem.h > @@ -402,6 +402,7 @@ enum cxl_devtype { > CXL_DEVTYPE_CLASSMEM, > }; > > +#define CXL_MAX_DC_REGION 8 > /** > * struct cxl_dpa_perf - DPA performance property entry > * @dpa_range - range for DPA address > @@ -431,6 +432,8 @@ struct cxl_dpa_perf { > * @dpa_res: Overall DPA resource tree for the device > * @pmem_res: Active Persistent memory capacity configuration > * @ram_res: Active Volatile memory capacity configuration > + * @dc_res: Active Dynamic Capacity memory configuration for each possible > + * region > * @serial: PCIe Device Serial Number > * @type: Generic Memory Class device or Vendor Specific Memory device > */ > @@ -445,10 +448,22 @@ struct cxl_dev_state { > struct resource dpa_res; > struct resource pmem_res; > struct resource ram_res; > + struct resource dc_res[CXL_MAX_DC_REGION]; > u64 serial; > enum cxl_devtype type; > }; > > +#define CXL_DC_REGION_STRLEN 8 > +struct cxl_dc_region_info { > + u64 base; > + u64 decode_len; > + u64 len; > + u64 blk_size; > + u32 dsmad_handle; > + u8 flags; > + u8 name[CXL_DC_REGION_STRLEN]; > +}; > + > /** > * struct cxl_memdev_state - Generic Type-3 Memory Device Class driver data > * > @@ -467,6 +482,8 @@ struct cxl_dev_state { > * @enabled_cmds: Hardware commands found enabled in CEL. > * @exclusive_cmds: Commands that are kernel-internal only > * @total_bytes: sum of all possible capacities > + * @static_cap: Sum of static RAM and PMEM capacities > + * @dynamic_cap: Complete DPA range occupied by DC regions > * @volatile_only_bytes: hard volatile capacity > * @persistent_only_bytes: hard persistent capacity > * @partition_align_bytes: alignment size for partition-able capacity > @@ -474,6 +491,8 @@ struct cxl_dev_state { > * @active_persistent_bytes: sum of hard + soft persistent > * @next_volatile_bytes: volatile capacity change pending device reset > * @next_persistent_bytes: persistent capacity change pending device reset > + * @nr_dc_region: number of DC regions implemented in the memory device > + * @dc_region: array containing info about the DC regions > * @event: event log driver state > * @poison: poison driver state info > * @security: security driver state info > @@ -494,7 +513,10 @@ struct cxl_memdev_state { > DECLARE_BITMAP(dcd_cmds, CXL_DCD_ENABLED_MAX); > DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX); > DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX); > + > u64 total_bytes; > + u64 static_cap; > + u64 dynamic_cap; > u64 volatile_only_bytes; > u64 persistent_only_bytes; > u64 partition_align_bytes; > @@ -506,6 +528,9 @@ struct cxl_memdev_state { > struct cxl_dpa_perf ram_perf; > struct cxl_dpa_perf pmem_perf; > > + u8 nr_dc_region; > + struct cxl_dc_region_info dc_region[CXL_MAX_DC_REGION]; > + > struct cxl_event_state event; > struct cxl_poison_state poison; > struct cxl_security_state security; > @@ -705,6 +730,29 @@ struct cxl_mbox_set_partition_info { > > #define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0) > > +struct cxl_mbox_get_dc_config_in { > + u8 region_count; > + u8 start_region_index; > +} __packed; > + > +/* See CXL 3.0 Table 125 get dynamic capacity config Output Payload */ > +struct cxl_mbox_get_dc_config_out { > + u8 avail_region_count; > + u8 rsvd[7]; > + struct cxl_dc_region_config { > + __le64 region_base; > + __le64 region_decode_length; > + __le64 region_length; > + __le64 region_block_size; > + __le32 region_dsmad_handle; > + u8 flags; > + u8 rsvd[3]; > + } __packed region[]; > +} __packed; > +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0) > +#define CXL_REGIONS_RETURNED(size_out) \ > + ((size_out - 8) / sizeof(struct cxl_dc_region_config)) > + > /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */ > struct cxl_mbox_set_timestamp_in { > __le64 timestamp; > @@ -828,6 +876,7 @@ enum { > int cxl_internal_send_cmd(struct cxl_memdev_state *mds, > struct cxl_mbox_cmd *cmd); > int cxl_dev_state_identify(struct cxl_memdev_state *mds); > +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds); > int cxl_await_media_ready(struct cxl_dev_state *cxlds); > int cxl_enumerate_cmds(struct cxl_memdev_state *mds); > int cxl_mem_create_range_info(struct cxl_memdev_state *mds); > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c > index 2ff361e756d6..216881455364 100644 > --- a/drivers/cxl/pci.c > +++ b/drivers/cxl/pci.c > @@ -874,6 +874,10 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > if (rc) > return rc; > > + rc = cxl_dev_dynamic_capacity_identify(mds); > + if (rc) > + return rc; > + > rc = cxl_mem_create_range_info(mds); > if (rc) > return rc; > > -- > 2.44.0 >
On 4/5/24 20:09, Ira Weiny wrote: > Jørgen Hansen wrote: >> On 3/25/24 00:18, ira.weiny@intel.com wrote: >> >>> From: Navneet Singh <navneet.singh@intel.com> >>> > > [snip] > >>> /** >>> * struct cxl_memdev_state - Generic Type-3 Memory Device Class driver data >>> * >>> @@ -467,6 +482,8 @@ struct cxl_dev_state { >>> * @enabled_cmds: Hardware commands found enabled in CEL. >>> * @exclusive_cmds: Commands that are kernel-internal only >>> * @total_bytes: sum of all possible capacities >>> + * @static_cap: Sum of static RAM and PMEM capacities >>> + * @dynamic_cap: Complete DPA range occupied by DC regions >> >> How about naming these total_range, static_cap and dynamic_range to make >> it clear that the DPA range occupied by DC regions isn't necessarily >> usable capacity (as opposed to the static_cap where the spec defines it >> as usable capacity). > > I thought this was a good idea but on second thought these are not range > variables at all. They really represent the various lengths of the > resources. > > For total_bytes the documentation already says 'sum of all __possible__ > capacities > > I think you have a point for the new fields though. They should all be > named in some consistent manner and documented as such. > > So I propose: > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > index 94531af018f8..9c18b229f69a 100644 > --- a/drivers/cxl/cxlmem.h > +++ b/drivers/cxl/cxlmem.h > @@ -481,9 +481,9 @@ struct cxl_dc_region_info { > * @dcd_cmds: List of DCD commands implemented by memory device > * @enabled_cmds: Hardware commands found enabled in CEL. > * @exclusive_cmds: Commands that are kernel-internal only > - * @total_bytes: sum of all possible capacities > - * @static_cap: Sum of static RAM and PMEM capacities > - * @dynamic_cap: Complete DPA range occupied by DC regions > + * @total_bytes: length of all possible capacities > + * @static_bytes: length of possible static RAM and PMEM partitions > + * @dynamic_bytes: length of possible DC partitions (DC Regions) > * @volatile_only_bytes: hard volatile capacity > * @persistent_only_bytes: hard persistent capacity > * @partition_align_bytes: alignment size for partition-able capacity > @@ -515,8 +515,8 @@ struct cxl_memdev_state { > DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX); > > u64 total_bytes; > - u64 static_cap; > - u64 dynamic_cap; > + u64 static_bytes; > + u64 dynamic_bytes; > u64 volatile_only_bytes; > u64 persistent_only_bytes; > u64 partition_align_bytes; That looks good. My main concern was that the DC regions may be separated by gaps that take up part of the DPA range but isn't part of the usable capacity. Pre-DCD, total_bytes was in fact all the usable capacity of the device as reported by the device itself, but now it includes the potential gaps between DC regions as well as the potential gap between static and dynamic regions. Thanks, Jørgen
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index ed4131c6f50b..14e8a7528a8b 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1123,7 +1123,7 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds) if (rc < 0) return rc; - mds->total_bytes = + mds->static_cap = le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER; mds->volatile_only_bytes = le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER; @@ -1230,6 +1230,175 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd) return rc; } +static int cxl_dc_save_region_info(struct cxl_memdev_state *mds, u8 index, + struct cxl_dc_region_config *region_config) +{ + struct cxl_dc_region_info *dcr = &mds->dc_region[index]; + struct device *dev = mds->cxlds.dev; + + dcr->base = le64_to_cpu(region_config->region_base); + dcr->decode_len = le64_to_cpu(region_config->region_decode_length); + dcr->decode_len *= CXL_CAPACITY_MULTIPLIER; + dcr->len = le64_to_cpu(region_config->region_length); + dcr->blk_size = le64_to_cpu(region_config->region_block_size); + dcr->dsmad_handle = le32_to_cpu(region_config->region_dsmad_handle); + dcr->flags = region_config->flags; + snprintf(dcr->name, CXL_DC_REGION_STRLEN, "dc%d", index); + + /* Check regions are in increasing DPA order */ + if (index > 0) { + struct cxl_dc_region_info *prev_dcr = &mds->dc_region[index - 1]; + + if ((prev_dcr->base + prev_dcr->decode_len) > dcr->base) { + dev_err(dev, + "DPA ordering violation for DC region %d and %d\n", + index - 1, index); + return -EINVAL; + } + } + + if (!IS_ALIGNED(dcr->base, SZ_256M) || + !IS_ALIGNED(dcr->base, dcr->blk_size)) { + dev_err(dev, "DC region %d invalid base %#llx blk size %#llx\n", index, + dcr->base, dcr->blk_size); + return -EINVAL; + } + + if (dcr->decode_len == 0 || dcr->len == 0 || dcr->decode_len < dcr->len || + !IS_ALIGNED(dcr->len, dcr->blk_size)) { + dev_err(dev, "DC region %d invalid length; decode %#llx len %#llx blk size %#llx\n", + index, dcr->decode_len, dcr->len, dcr->blk_size); + return -EINVAL; + } + + if (dcr->blk_size == 0 || dcr->blk_size % 0x40 || + !is_power_of_2(dcr->blk_size)) { + dev_err(dev, "DC region %d invalid block size; %#llx\n", + index, dcr->blk_size); + return -EINVAL; + } + + dev_dbg(dev, + "DC region %s DPA: %#llx LEN: %#llx BLKSZ: %#llx\n", + dcr->name, dcr->base, dcr->decode_len, dcr->blk_size); + + return 0; +} + +/* Returns the number of regions in dc_resp or -ERRNO */ +static int cxl_get_dc_config(struct cxl_memdev_state *mds, u8 start_region, + struct cxl_mbox_get_dc_config_out *dc_resp, + size_t dc_resp_size) +{ + struct cxl_mbox_get_dc_config_in get_dc = (struct cxl_mbox_get_dc_config_in) { + .region_count = CXL_MAX_DC_REGION, + .start_region_index = start_region, + }; + struct cxl_mbox_cmd mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_GET_DC_CONFIG, + .payload_in = &get_dc, + .size_in = sizeof(get_dc), + .size_out = dc_resp_size, + .payload_out = dc_resp, + .min_out = 1, + }; + struct device *dev = mds->cxlds.dev; + int rc; + + rc = cxl_internal_send_cmd(mds, &mbox_cmd); + if (rc < 0) + return rc; + + rc = dc_resp->avail_region_count - start_region; + + /* + * The number of regions in the payload may have been truncated due to + * payload_size limits; if so adjust the returned count to match. + */ + if (mbox_cmd.size_out < sizeof(*dc_resp)) + rc = CXL_REGIONS_RETURNED(mbox_cmd.size_out); + + dev_dbg(dev, "Read %d/%d DC regions\n", rc, dc_resp->avail_region_count); + + return rc; +} + +static bool cxl_dcd_supported(struct cxl_memdev_state *mds) +{ + return test_bit(CXL_DCD_ENABLED_GET_CONFIG, mds->dcd_cmds); +} + +/** + * cxl_dev_dynamic_capacity_identify() - Reads the dynamic capacity + * information from the device. + * @mds: The memory device state + * + * Read Dynamic Capacity information from the device and populate the state + * structures for later use. + * + * Return: 0 if identify was executed successfully, -ERRNO on error. + */ +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds) +{ + size_t dc_resp_size = mds->payload_size; + struct device *dev = mds->cxlds.dev; + u8 start_region, i; + int rc = 0; + + for (i = 0; i < CXL_MAX_DC_REGION; i++) + snprintf(mds->dc_region[i].name, CXL_DC_REGION_STRLEN, "<nil>"); + + /* Check GET_DC_CONFIG is supported by device */ + if (!cxl_dcd_supported(mds)) { + dev_dbg(dev, "DCD not supported\n"); + return 0; + } + + struct cxl_mbox_get_dc_config_out *dc_resp __free(kfree) = + kvmalloc(dc_resp_size, GFP_KERNEL); + if (!dc_resp) + return -ENOMEM; + + start_region = 0; + do { + int j; + + rc = cxl_get_dc_config(mds, start_region, dc_resp, dc_resp_size); + if (rc < 0) { + dev_dbg(dev, "Failed to get DC config: %d\n", rc); + return rc; + } + + mds->nr_dc_region += rc; + + if (mds->nr_dc_region < 1 || mds->nr_dc_region > CXL_MAX_DC_REGION) { + dev_err(dev, "Invalid num of dynamic capacity regions %d\n", + mds->nr_dc_region); + return -EINVAL; + } + + for (i = start_region, j = 0; i < mds->nr_dc_region; i++, j++) { + rc = cxl_dc_save_region_info(mds, i, &dc_resp->region[j]); + if (rc) { + dev_dbg(dev, "Failed to save region info: %d\n", rc); + return rc; + } + } + + start_region = mds->nr_dc_region; + + } while (mds->nr_dc_region < dc_resp->avail_region_count); + + mds->dynamic_cap = + mds->dc_region[mds->nr_dc_region - 1].base + + mds->dc_region[mds->nr_dc_region - 1].decode_len - + mds->dc_region[0].base; + dev_dbg(dev, "Total dynamic capacity: %#llx\n", mds->dynamic_cap); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL); + static int add_dpa_res(struct device *dev, struct resource *parent, struct resource *res, resource_size_t start, resource_size_t size, const char *type) @@ -1260,8 +1429,12 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds) { struct cxl_dev_state *cxlds = &mds->cxlds; struct device *dev = cxlds->dev; + size_t untenanted_mem; int rc; + untenanted_mem = mds->dc_region[0].base - mds->static_cap; + mds->total_bytes = mds->static_cap + untenanted_mem + mds->dynamic_cap; + if (!cxlds->media_ready) { cxlds->dpa_res = DEFINE_RES_MEM(0, 0); cxlds->ram_res = DEFINE_RES_MEM(0, 0); @@ -1271,6 +1444,15 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds) cxlds->dpa_res = DEFINE_RES_MEM(0, mds->total_bytes); + for (int i = 0; i < mds->nr_dc_region; i++) { + struct cxl_dc_region_info *dcr = &mds->dc_region[i]; + + rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->dc_res[i], + dcr->base, dcr->decode_len, dcr->name); + if (rc) + return rc; + } + if (mds->partition_align_bytes == 0) { rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0, mds->volatile_only_bytes, "ram"); diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 79a67cff9143..4624cf612c1e 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -402,6 +402,7 @@ enum cxl_devtype { CXL_DEVTYPE_CLASSMEM, }; +#define CXL_MAX_DC_REGION 8 /** * struct cxl_dpa_perf - DPA performance property entry * @dpa_range - range for DPA address @@ -431,6 +432,8 @@ struct cxl_dpa_perf { * @dpa_res: Overall DPA resource tree for the device * @pmem_res: Active Persistent memory capacity configuration * @ram_res: Active Volatile memory capacity configuration + * @dc_res: Active Dynamic Capacity memory configuration for each possible + * region * @serial: PCIe Device Serial Number * @type: Generic Memory Class device or Vendor Specific Memory device */ @@ -445,10 +448,22 @@ struct cxl_dev_state { struct resource dpa_res; struct resource pmem_res; struct resource ram_res; + struct resource dc_res[CXL_MAX_DC_REGION]; u64 serial; enum cxl_devtype type; }; +#define CXL_DC_REGION_STRLEN 8 +struct cxl_dc_region_info { + u64 base; + u64 decode_len; + u64 len; + u64 blk_size; + u32 dsmad_handle; + u8 flags; + u8 name[CXL_DC_REGION_STRLEN]; +}; + /** * struct cxl_memdev_state - Generic Type-3 Memory Device Class driver data * @@ -467,6 +482,8 @@ struct cxl_dev_state { * @enabled_cmds: Hardware commands found enabled in CEL. * @exclusive_cmds: Commands that are kernel-internal only * @total_bytes: sum of all possible capacities + * @static_cap: Sum of static RAM and PMEM capacities + * @dynamic_cap: Complete DPA range occupied by DC regions * @volatile_only_bytes: hard volatile capacity * @persistent_only_bytes: hard persistent capacity * @partition_align_bytes: alignment size for partition-able capacity @@ -474,6 +491,8 @@ struct cxl_dev_state { * @active_persistent_bytes: sum of hard + soft persistent * @next_volatile_bytes: volatile capacity change pending device reset * @next_persistent_bytes: persistent capacity change pending device reset + * @nr_dc_region: number of DC regions implemented in the memory device + * @dc_region: array containing info about the DC regions * @event: event log driver state * @poison: poison driver state info * @security: security driver state info @@ -494,7 +513,10 @@ struct cxl_memdev_state { DECLARE_BITMAP(dcd_cmds, CXL_DCD_ENABLED_MAX); DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX); DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX); + u64 total_bytes; + u64 static_cap; + u64 dynamic_cap; u64 volatile_only_bytes; u64 persistent_only_bytes; u64 partition_align_bytes; @@ -506,6 +528,9 @@ struct cxl_memdev_state { struct cxl_dpa_perf ram_perf; struct cxl_dpa_perf pmem_perf; + u8 nr_dc_region; + struct cxl_dc_region_info dc_region[CXL_MAX_DC_REGION]; + struct cxl_event_state event; struct cxl_poison_state poison; struct cxl_security_state security; @@ -705,6 +730,29 @@ struct cxl_mbox_set_partition_info { #define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0) +struct cxl_mbox_get_dc_config_in { + u8 region_count; + u8 start_region_index; +} __packed; + +/* See CXL 3.0 Table 125 get dynamic capacity config Output Payload */ +struct cxl_mbox_get_dc_config_out { + u8 avail_region_count; + u8 rsvd[7]; + struct cxl_dc_region_config { + __le64 region_base; + __le64 region_decode_length; + __le64 region_length; + __le64 region_block_size; + __le32 region_dsmad_handle; + u8 flags; + u8 rsvd[3]; + } __packed region[]; +} __packed; +#define CXL_DYNAMIC_CAPACITY_SANITIZE_ON_RELEASE_FLAG BIT(0) +#define CXL_REGIONS_RETURNED(size_out) \ + ((size_out - 8) / sizeof(struct cxl_dc_region_config)) + /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */ struct cxl_mbox_set_timestamp_in { __le64 timestamp; @@ -828,6 +876,7 @@ enum { int cxl_internal_send_cmd(struct cxl_memdev_state *mds, struct cxl_mbox_cmd *cmd); int cxl_dev_state_identify(struct cxl_memdev_state *mds); +int cxl_dev_dynamic_capacity_identify(struct cxl_memdev_state *mds); int cxl_await_media_ready(struct cxl_dev_state *cxlds); int cxl_enumerate_cmds(struct cxl_memdev_state *mds); int cxl_mem_create_range_info(struct cxl_memdev_state *mds); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 2ff361e756d6..216881455364 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -874,6 +874,10 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) return rc; + rc = cxl_dev_dynamic_capacity_identify(mds); + if (rc) + return rc; + rc = cxl_mem_create_range_info(mds); if (rc) return rc;