Message ID | 20240324-dcd-type2-upstream-v1-18-b7b00d623625@intel.com |
---|---|
State | Changes Requested |
Headers | show |
Series | DCD: Add support for Dynamic Capacity Devices (DCD) | expand |
On Sun, 24 Mar 2024 16:18:21 -0700 ira.weiny@intel.com wrote: > From: Navneet Singh <navneet.singh@intel.com> > > A dynamic capacity devices (DCD) send events to signal the host about > changes in the availability of Dynamic Capacity (DC) memory. These > events contain extents, the addition or removal of which may occur at > any time. > > Adding memory is straight forward. If no region exists the extent is > rejected. If a region does exist, a region extent is formed and > surfaced. > > Removing memory requires checking if the memory is currently in use. > Memory use tracking is added in a subsequent patch so here the memory is > never in use and the removal occurs immediately. > > Most often extents will be offered to and accepted by the host in well > defined chunks. However, part of an extent may be requested for > release. Simplify extent tracking by signaling removal of any extent > which overlaps the requested release range. > > Force removal is intended as a mechanism between the FM and the device > and intended only when the host is unresponsive or otherwise broken. > Purposely ignore force removal events. > > Process DCD extents. > > Recall that all devices of an interleave set must offer a corresponding > extent for the region extent to be realized. This patch limits > interleave to 1. Thus the 1:1 mapping between device extent and DAX > region extent allows immediate surfacing. > > Signed-off-by: Navneet Singh <navneet.singh@intel.com> > Co-developed-by: Ira Weiny <ira.weiny@intel.com> > Signed-off-by: Ira Weiny <ira.weiny@intel.com> ... > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > index 6b00e717e42b..7babac2d1c95 100644 > --- a/drivers/cxl/core/mbox.c > +++ b/drivers/cxl/core/mbox.c > @@ -870,6 +870,37 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds) > } > EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL); > > +static int cxl_notify_dc_extent(struct cxl_memdev_state *mds, > + enum dc_event event, > + struct cxl_dc_extent *dc_extent) > +{ > + struct cxl_drv_nd nd = (struct cxl_drv_nd) { > + .event = event, > + .dc_extent = dc_extent > + }; > + struct device *dev; > + int rc = -ENXIO; > + > + dev = &mds->cxlds.cxlmd->dev; > + dev_dbg(dev, "Notify: type %d DPA:%#llx LEN:%#llx\n", > + event, le64_to_cpu(dc_extent->start_dpa), > + le64_to_cpu(dc_extent->length)); > + > + device_lock(dev); guard(device)(dev); if (!dev->driver) return -ENXIO; ... > + if (dev->driver) { > + struct cxl_driver *mem_drv = to_cxl_drv(dev->driver); > + > + if (mem_drv->notify) { > + dev_dbg(dev, "Notify driver type %d DPA:%#llx LEN:%#llx\n", > + event, le64_to_cpu(dc_extent->start_dpa), > + le64_to_cpu(dc_extent->length)); > + rc = mem_drv->notify(dev, &nd); > + } > + } > + device_unlock(dev); > + return rc; > +} ... > +static int cxl_handle_dcd_add_event(struct cxl_memdev_state *mds, > + struct cxl_dc_extent *dc_extent) > +{ > + struct range alloc_range, *resp_range; > + struct device *dev = mds->cxlds.dev; > + int rc; > + > + alloc_range = (struct range){ > + .start = le64_to_cpu(dc_extent->start_dpa), > + .end = le64_to_cpu(dc_extent->start_dpa) + > + le64_to_cpu(dc_extent->length) - 1, > + }; > + resp_range = &alloc_range; Code structure is a little odd to follow as sets up a bunch of stuff that may or may not be used, perhaps duplicate final call. I'm not 100% convinced it is worth it though. rc = cxl_notify_dc_extents(mds, DCD_ADD_CAPACITY, dc_extent); if (rc) { dev_dbg(dev, "unconsumed DC extent DPA:%#llx LEN:%#llx\n", le64_to_cpu(dc_extent->start_dpa), le64_to_cpu(dc_extent->length)); return cxl_send_dc_cap_response(mds, NULL, CXL_MBOX_OP_ADD_DC_RESPONSE); } alloc_range = (struct range){ .start = le64_to_cpu(dc_extent->start_dpa), .end = le64_to_cpu(dc_extent->start_dpa) + le64_to_cpu(dc_extent->length) - 1, }; return cxl_send_dc_cap_response(mds, &alloc_range, CXL_MBOX_OP_ADD_DC_RESPONSE); > + > + rc = cxl_notify_dc_extent(mds, DCD_ADD_CAPACITY, dc_extent); > + if (rc) { > + dev_dbg(dev, "unconsumed DC extent DPA:%#llx LEN:%#llx\n", > + le64_to_cpu(dc_extent->start_dpa), > + le64_to_cpu(dc_extent->length)); > + resp_range = NULL; > + } > + > + return cxl_send_dc_cap_response(mds, resp_range, > + CXL_MBOX_OP_ADD_DC_RESPONSE); > +} > +static int cxl_handle_dcd_event_records(struct cxl_memdev_state *mds, > + struct cxl_event_record_raw *raw_rec) > +{ > + struct cxl_event_dcd *event = &raw_rec->event.dcd; > + struct cxl_dc_extent *dc_extent = &event->extent; > + struct device *dev = mds->cxlds.dev; > + uuid_t *id = &raw_rec->id; > + > + if (!uuid_equal(id, &CXL_EVENT_DC_EVENT_UUID)) > + return -EINVAL; > + > + dev_dbg(dev, "DCD event %s : DPA:%#llx LEN:%#llx\n", > + cxl_dcd_evt_type_str(event->event_type), > + le64_to_cpu(dc_extent->start_dpa), > + le64_to_cpu(dc_extent->length)); > + > + switch (event->event_type) { > + case DCD_ADD_CAPACITY: > + return cxl_handle_dcd_add_event(mds, dc_extent); > + case DCD_RELEASE_CAPACITY: > + return cxl_handle_dcd_release_event(mds, dc_extent); > + case DCD_FORCED_CAPACITY_RELEASE: > + dev_err_ratelimited(dev, "Forced release event ignored.\n"); > + return 0; > + default: > + return -EINVAL; > + } > + > + return 0; dead code. > +} > + > static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, > enum cxl_event_log_type type) > { > @@ -1109,9 +1225,17 @@ static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, > if (!nr_rec) > break; > > - for (i = 0; i < nr_rec; i++) > + for (i = 0; i < nr_rec; i++) { > __cxl_event_trace_record(cxlmd, type, > &payload->records[i]); > + if (type == CXL_EVENT_TYPE_DCD) { Perhaps flip condition so we can reduce indent. if (type != CXL_EVENT_TYPE_DCD) continue; rc = > + rc = cxl_handle_dcd_event_records(mds, > + &payload->records[i]); > + if (rc) > + dev_err_ratelimited(dev, "dcd event failed: %d\n", > + rc); > + } > + } > > if (payload->flags & CXL_GET_EVENT_FLAG_OVERFLOW) > trace_cxl_overflow(cxlmd, type, payload); > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c > index 7635ff109578..a07d95136f0d 100644 > --- a/drivers/cxl/core/region.c > +++ b/drivers/cxl/core/region.c > @@ -1450,6 +1450,57 @@ static int cxl_region_validate_position(struct cxl_region *cxlr, > return 0; > } > > +int cxl_region_notify_extent(struct cxl_region *cxlr, enum dc_event event, > + struct region_extent *reg_ext) > +{ > + struct cxl_dax_region *cxlr_dax; > + struct device *dev; > + int rc = -ENXIO; > + > + cxlr_dax = cxlr->cxlr_dax; > + dev = &cxlr_dax->dev; > + dev_dbg(dev, "Trying notify: type %d HPA %#llx - %#llx\n", > + event, reg_ext->hpa_range.start, reg_ext->hpa_range.end); > + > + device_lock(dev); guard(device)(dev); or scoped_guard() if you are adding things in later patches (I haven't checked yet) > + if (dev->driver) { > + struct cxl_driver *reg_drv = to_cxl_drv(dev->driver); > + struct cxl_drv_nd nd = (struct cxl_drv_nd) { > + .event = event, > + .reg_ext = reg_ext, > + }; > + > + if (reg_drv->notify) { > + dev_dbg(dev, "Notify: type %d HPA %#llx - %#llx\n", > + event, reg_ext->hpa_range.start, > + reg_ext->hpa_range.end); > + rc = reg_drv->notify(dev, &nd); > + } > + } > + device_unlock(dev); > + return rc; > +} > + > +static void calc_hpa_range(struct cxl_endpoint_decoder *cxled, I'd be tempted to drag this to earlier patch. Whilst it may look over the top there to have a separate function I think that is cleaner than introducing the code and then factoring it out in a patch doing lots of stuff like this one. > + struct cxl_dax_region *cxlr_dax, > + struct cxl_dc_extent *dc_extent, > + struct range *dpa_range, > + struct range *hpa_range) > +{ > + resource_size_t dpa_offset, hpa; > + > + /* > + * Without interleave... > + * HPA offset == DPA offset > + * ... but do the math anyway > + */ > + dpa_offset = dpa_range->start - cxled->dpa_res->start; > + hpa = cxled->cxld.hpa_range.start + dpa_offset; > + > + hpa_range->start = hpa - cxlr_dax->hpa_range.start; > + hpa_range->end = hpa_range->start + range_len(dpa_range) - 1; > +} > + > static int extent_check_overlap(struct device *dev, void *arg) > { > struct range *new_range = arg; > @@ -1480,7 +1531,6 @@ int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > struct cxl_region *cxlr = cxled->cxld.region; > struct range ext_dpa_range, ext_hpa_range; > struct device *dev = &cxlr->dev; > - resource_size_t dpa_offset, hpa; > > /* > * Interleave ways == 1 means this coresponds to a 1:1 mapping between > @@ -1502,18 +1552,7 @@ int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > dev_dbg(dev, "Adding DC extent DPA %#llx - %#llx\n", > ext_dpa_range.start, ext_dpa_range.end); > > - /* > - * Without interleave... > - * HPA offset == DPA offset > - * ... but do the math anyway > - */ > - dpa_offset = ext_dpa_range.start - cxled->dpa_res->start; > - hpa = cxled->cxld.hpa_range.start + dpa_offset; > - > - ext_hpa_range = (struct range) { > - .start = hpa - cxlr->cxlr_dax->hpa_range.start, > - .end = ext_hpa_range.start + range_len(&ext_dpa_range) - 1, > - }; > + calc_hpa_range(cxled, cxlr->cxlr_dax, dc_extent, &ext_dpa_range, &ext_hpa_range); > > if (extent_overlaps(cxlr->cxlr_dax, &ext_hpa_range)) > return -EINVAL; > @@ -1527,6 +1566,80 @@ int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > cxled); > } > +static int cxl_rm_reg_ext_by_range(struct device *dev, void *data) > +{ > + struct rm_data *rm_data = data; > + struct region_extent *reg_ext; > + > + if (!is_region_extent(dev)) > + return 0; > + reg_ext = to_region_extent(dev); > + > + /* > + * Any extent which 'touches' the released range is notified > + * for removal. No partials of the extent are released. > + */ > + if (range_overlaps(rm_data->range, ®_ext->hpa_range)) { > + struct cxl_region *cxlr = rm_data->cxlr; > + > + dev_dbg(dev, "Remove DAX region ext HPA %#llx - %#llx\n", > + reg_ext->hpa_range.start, reg_ext->hpa_range.end); > + cxl_ed_rm_region_extent(cxlr, reg_ext); Is it worth improving efficiency if we have a precise match and returning 1 to stop iterating? Perhaps premature optimization. > + } > + return 0; > +} > + > +static int cxl_ed_rm_extent(struct cxl_endpoint_decoder *cxled, > + struct cxl_dc_extent *dc_extent) > +{ > + struct cxl_region *cxlr = cxled->cxld.region; > + struct range hpa_range; > + > + struct range rel_dpa_range = { > + .start = le64_to_cpu(dc_extent->start_dpa), > + .end = le64_to_cpu(dc_extent->start_dpa) + > + le64_to_cpu(dc_extent->length) - 1, > + }; > + > + calc_hpa_range(cxled, cxlr->cxlr_dax, dc_extent, &rel_dpa_range, &hpa_range); > + > + struct rm_data rm_data = { > + .cxlr = cxlr, > + .range = &hpa_range, > + }; > + > + return device_for_each_child(&cxlr->cxlr_dax->dev, &rm_data, > + cxl_rm_reg_ext_by_range); > +} > + > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h > index 5379ad7f5852..156d7c9a8de5 100644 > --- a/drivers/cxl/cxl.h > +++ b/drivers/cxl/cxl.h > @@ -10,6 +10,7 @@ > #include <linux/log2.h> > #include <linux/node.h> > #include <linux/io.h> > +#include <linux/cxl-event.h> > > /** > * DOC: cxl objects > @@ -613,6 +614,14 @@ struct cxl_pmem_region { > struct cxl_pmem_region_mapping mapping[]; > }; > > +/* See CXL 3.0 8.2.9.2.1.5 */ Add a name for the section to help searching future spec > +enum dc_event { > + DCD_ADD_CAPACITY, > + DCD_RELEASE_CAPACITY, > + DCD_FORCED_CAPACITY_RELEASE, > + DCD_REGION_CONFIGURATION_UPDATED, > +}; > diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c > index 0c79d9ce877c..20832f09c40c 100644 > --- a/drivers/cxl/mem.c > +++ b/drivers/cxl/mem.c > @@ -103,6 +103,50 @@ static int cxl_debugfs_poison_clear(void *data, u64 dpa) > DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL, > cxl_debugfs_poison_clear, "%llx\n"); > > +static int cxl_mem_notify(struct device *dev, struct cxl_drv_nd *nd) > +{ > + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); > + struct cxl_port *endpoint = cxlmd->endpoint; > + struct cxl_endpoint_decoder *cxled; > + struct cxl_dc_extent *dc_extent; > + struct device *ep_dev; > + int rc; > + > + dc_extent = nd->dc_extent; > + dev_dbg(dev, "notify DC action %d DPA:%#llx LEN:%#llx\n", > + nd->event, le64_to_cpu(dc_extent->start_dpa), > + le64_to_cpu(dc_extent->length)); > + > + ep_dev = device_find_child(&endpoint->dev, dc_extent, Can use __free(put_device) magic here to deal with the trailing put device. Minor tidy up, but nice to avoid the rc = / put / return rc dance > + match_ep_decoder_by_range); > + if (!ep_dev) { > + dev_dbg(dev, "Extent DPA:%#llx LEN:%#llx not mapped; evt %d\n", > + le64_to_cpu(dc_extent->start_dpa), > + le64_to_cpu(dc_extent->length), nd->event); > + return -ENXIO; > + } > + > + cxled = to_cxl_endpoint_decoder(ep_dev); > + rc = cxl_ed_notify_extent(cxled, nd); > + put_device(ep_dev); > + return rc; > +} > diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h > index 03fa6d50d46f..6b745c913f96 100644 > --- a/include/linux/cxl-event.h > +++ b/include/linux/cxl-event.h > @@ -91,11 +91,42 @@ struct cxl_event_mem_module { > u8 reserved[0x3d]; > } __packed; > > +/* > + * CXL rev 3.1 section 8.2.9.2.1.6; Table 8-51 Carries forward from earlier. Throw a table heading in there for easy of searching future specs. > + */ > +#define CXL_DC_EXTENT_TAG_LEN 0x10 > +struct cxl_dc_extent { > + __le64 start_dpa; > + __le64 length; > + u8 tag[CXL_DC_EXTENT_TAG_LEN]; > + __le16 shared_extn_seq; > + u8 reserved[0x6]; > +} __packed; > + > +/* > + * Dynamic Capacity Event Record > + * CXL rev 3.1 section 8.2.9.2.1.6; Table 8-50 > + */ > +struct cxl_event_dcd { > + struct cxl_event_record_hdr hdr; > + u8 event_type; > + u8 validity_flags; > + __le16 host_id; Could perhaps add a comment that this field isn't ever set for host. It's there for FM event records when the host has sent the device an Add capacity response or Capacity is released. > + u8 region_index; > + u8 flags; > + u8 reserved1[0x2]; > + struct cxl_dc_extent extent; > + u8 reserved2[0x18]; > + __le32 num_avail_extents; > + __le32 num_avail_tags; > +} __packed; > + > union cxl_event { > struct cxl_event_generic generic; > struct cxl_event_gen_media gen_media; > struct cxl_event_dram dram; > struct cxl_event_mem_module mem_module; > + struct cxl_event_dcd dcd; > } __packed; > > /* >
ira.weiny@ wrote: > From: Navneet Singh <navneet.singh@intel.com> > > A dynamic capacity devices (DCD) send events to signal the host about > changes in the availability of Dynamic Capacity (DC) memory. These > events contain extents, the addition or removal of which may occur at > any time. > > Adding memory is straight forward. If no region exists the extent is > rejected. If a region does exist, a region extent is formed and > surfaced. > > Removing memory requires checking if the memory is currently in use. > Memory use tracking is added in a subsequent patch so here the memory is > never in use and the removal occurs immediately. > > Most often extents will be offered to and accepted by the host in well > defined chunks. However, part of an extent may be requested for > release. Simplify extent tracking by signaling removal of any extent > which overlaps the requested release range. > > Force removal is intended as a mechanism between the FM and the device > and intended only when the host is unresponsive or otherwise broken. > Purposely ignore force removal events. > > Process DCD extents. > > Recall that all devices of an interleave set must offer a corresponding > extent for the region extent to be realized. This patch limits > interleave to 1. Thus the 1:1 mapping between device extent and DAX > region extent allows immediate surfacing. > > Signed-off-by: Navneet Singh <navneet.singh@intel.com> > Co-developed-by: Ira Weiny <ira.weiny@intel.com> > Signed-off-by: Ira Weiny <ira.weiny@intel.com> > > --- > Changes for v1 > [iweiny: remove all xarrays] > [iweiny: entirely new architecture] > --- > drivers/cxl/core/extent.c | 4 ++ > drivers/cxl/core/mbox.c | 142 +++++++++++++++++++++++++++++++++++++++++++--- > drivers/cxl/core/region.c | 139 ++++++++++++++++++++++++++++++++++++++++----- > drivers/cxl/cxl.h | 34 +++++++++++ > drivers/cxl/cxlmem.h | 21 +++---- > drivers/cxl/mem.c | 45 +++++++++++++++ > drivers/dax/cxl.c | 22 +++++++ > include/linux/cxl-event.h | 31 ++++++++++ > 8 files changed, 405 insertions(+), 33 deletions(-) > [..] > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c > index 7635ff109578..a07d95136f0d 100644 > --- a/drivers/cxl/core/region.c > +++ b/drivers/cxl/core/region.c [..] > @@ -1502,18 +1552,7 @@ int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, > dev_dbg(dev, "Adding DC extent DPA %#llx - %#llx\n", > ext_dpa_range.start, ext_dpa_range.end); > > - /* > - * Without interleave... > - * HPA offset == DPA offset > - * ... but do the math anyway > - */ > - dpa_offset = ext_dpa_range.start - cxled->dpa_res->start; > - hpa = cxled->cxld.hpa_range.start + dpa_offset; > - > - ext_hpa_range = (struct range) { > - .start = hpa - cxlr->cxlr_dax->hpa_range.start, > - .end = ext_hpa_range.start + range_len(&ext_dpa_range) - 1, > - }; Please don't refactor code that just got added in the same series. Upon seeing that this wants a common helper in this patch, go back to the original patch and put it in a helper from the beginning. [..] > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h > index 5379ad7f5852..156d7c9a8de5 100644 > --- a/drivers/cxl/cxl.h > +++ b/drivers/cxl/cxl.h [..] > @@ -891,10 +900,18 @@ bool is_cxl_region(struct device *dev); > > extern struct bus_type cxl_bus_type; I skipped ahead here in the review since notification organization feels wrong. > +/* Driver Notifier Data */ > +struct cxl_drv_nd { I never would have guessed that cxl_drv_nd meant cxl driver notifier data, it might be able to be jettisoned. > + enum dc_event event; > + struct cxl_dc_extent *dc_extent; > + struct region_extent *reg_ext; > +}; > + > struct cxl_driver { > const char *name; > int (*probe)(struct device *dev); > void (*remove)(struct device *dev); > + int (*notify)(struct device *dev, struct cxl_drv_nd *nd); First, this feels an overly DCD specific mechanism to inflict on the core generic 'struct cxl_driver'. Most 'struct cxl_driver' instances do not need any 'notify' callback and 'struct cxl_drv_nd' makes this even less relevant to the core 'struct cxl_driver' definition. Second, it leads to 2 anonymous ->notify() callbacks wht too deep of a stack. It feels as if the resulting code is being actively evasive. Given that the event handling code already knows how to lookup a 'struct cxl_region', as Alison demonstrated in her DPA->HPA series, it should be straightforward to lookup a 'struct cxl_dax_region' without a notifying the cxl_mem driver. So my expectation is just enough DCD event parsing to determine when the payload applies to given cxl_dax_region. Then define a: struct cxl_dax_region_driver { struct cxl_driver driver; void (*notify)(struct cxl_dax_region *cxlr_dax, ...); }; ...to send the payload over for further processing. If a cxl_dax_region device instance cannot be found, just drop the event record.
diff --git a/drivers/cxl/core/extent.c b/drivers/cxl/core/extent.c index 487c220f1c3c..e98acd98ebe2 100644 --- a/drivers/cxl/core/extent.c +++ b/drivers/cxl/core/extent.c @@ -118,6 +118,10 @@ int dax_region_create_ext(struct cxl_dax_region *cxlr_dax, if (rc) goto err; + rc = cxl_region_notify_extent(cxled->cxld.region, DCD_ADD_CAPACITY, reg_ext); + if (rc) + goto err; + dev_dbg(dev, "DAX region extent HPA %#llx - %#llx\n", reg_ext->hpa_range.start, reg_ext->hpa_range.end); diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 6b00e717e42b..7babac2d1c95 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -870,6 +870,37 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds) } EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL); +static int cxl_notify_dc_extent(struct cxl_memdev_state *mds, + enum dc_event event, + struct cxl_dc_extent *dc_extent) +{ + struct cxl_drv_nd nd = (struct cxl_drv_nd) { + .event = event, + .dc_extent = dc_extent + }; + struct device *dev; + int rc = -ENXIO; + + dev = &mds->cxlds.cxlmd->dev; + dev_dbg(dev, "Notify: type %d DPA:%#llx LEN:%#llx\n", + event, le64_to_cpu(dc_extent->start_dpa), + le64_to_cpu(dc_extent->length)); + + device_lock(dev); + if (dev->driver) { + struct cxl_driver *mem_drv = to_cxl_drv(dev->driver); + + if (mem_drv->notify) { + dev_dbg(dev, "Notify driver type %d DPA:%#llx LEN:%#llx\n", + event, le64_to_cpu(dc_extent->start_dpa), + le64_to_cpu(dc_extent->length)); + rc = mem_drv->notify(dev, &nd); + } + } + device_unlock(dev); + return rc; +} + static int cxl_validate_extent(struct cxl_memdev_state *mds, struct cxl_dc_extent *dc_extent) { @@ -897,8 +928,8 @@ static int cxl_validate_extent(struct cxl_memdev_state *mds, return -EINVAL; } -static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled, - struct cxl_dc_extent *extent) +bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled, + struct cxl_dc_extent *extent) { uint64_t start = le64_to_cpu(extent->start_dpa); uint64_t length = le64_to_cpu(extent->length); @@ -916,6 +947,7 @@ static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled, return range_contains(&ed_range, &ext_range); } +EXPORT_SYMBOL_NS_GPL(cxl_dc_extent_in_ed, CXL); void cxl_event_trace_record(const struct cxl_memdev *cxlmd, enum cxl_event_log_type type, @@ -1027,15 +1059,20 @@ static int cxl_send_dc_cap_response(struct cxl_memdev_state *mds, size_t size; struct cxl_mbox_dc_response *dc_res __free(kfree); - size = struct_size(dc_res, extent_list, 1); + if (!extent) + size = struct_size(dc_res, extent_list, 0); + else + size = struct_size(dc_res, extent_list, 1); dc_res = kzalloc(size, GFP_KERNEL); if (!dc_res) return -ENOMEM; - dc_res->extent_list[0].dpa_start = cpu_to_le64(extent->start); - memset(dc_res->extent_list[0].reserved, 0, 8); - dc_res->extent_list[0].length = cpu_to_le64(range_len(extent)); - dc_res->extent_list_size = cpu_to_le32(1); + if (extent) { + dc_res->extent_list[0].dpa_start = cpu_to_le64(extent->start); + memset(dc_res->extent_list[0].reserved, 0, 8); + dc_res->extent_list[0].length = cpu_to_le64(range_len(extent)); + dc_res->extent_list_size = cpu_to_le32(1); + } mbox_cmd = (struct cxl_mbox_cmd) { .opcode = opcode, @@ -1072,6 +1109,85 @@ void cxl_release_ed_extent(struct cxl_ed_extent *extent) } EXPORT_SYMBOL_NS_GPL(cxl_release_ed_extent, CXL); +static int cxl_handle_dcd_release_event(struct cxl_memdev_state *mds, + struct cxl_dc_extent *dc_extent) +{ + return cxl_notify_dc_extent(mds, DCD_RELEASE_CAPACITY, dc_extent); +} + +static int cxl_handle_dcd_add_event(struct cxl_memdev_state *mds, + struct cxl_dc_extent *dc_extent) +{ + struct range alloc_range, *resp_range; + struct device *dev = mds->cxlds.dev; + int rc; + + alloc_range = (struct range){ + .start = le64_to_cpu(dc_extent->start_dpa), + .end = le64_to_cpu(dc_extent->start_dpa) + + le64_to_cpu(dc_extent->length) - 1, + }; + resp_range = &alloc_range; + + rc = cxl_notify_dc_extent(mds, DCD_ADD_CAPACITY, dc_extent); + if (rc) { + dev_dbg(dev, "unconsumed DC extent DPA:%#llx LEN:%#llx\n", + le64_to_cpu(dc_extent->start_dpa), + le64_to_cpu(dc_extent->length)); + resp_range = NULL; + } + + return cxl_send_dc_cap_response(mds, resp_range, + CXL_MBOX_OP_ADD_DC_RESPONSE); +} + +static char *cxl_dcd_evt_type_str(u8 type) +{ + switch (type) { + case DCD_ADD_CAPACITY: + return "add"; + case DCD_RELEASE_CAPACITY: + return "release"; + case DCD_FORCED_CAPACITY_RELEASE: + return "force release"; + default: + break; + } + + return "<unknown>"; +} + +static int cxl_handle_dcd_event_records(struct cxl_memdev_state *mds, + struct cxl_event_record_raw *raw_rec) +{ + struct cxl_event_dcd *event = &raw_rec->event.dcd; + struct cxl_dc_extent *dc_extent = &event->extent; + struct device *dev = mds->cxlds.dev; + uuid_t *id = &raw_rec->id; + + if (!uuid_equal(id, &CXL_EVENT_DC_EVENT_UUID)) + return -EINVAL; + + dev_dbg(dev, "DCD event %s : DPA:%#llx LEN:%#llx\n", + cxl_dcd_evt_type_str(event->event_type), + le64_to_cpu(dc_extent->start_dpa), + le64_to_cpu(dc_extent->length)); + + switch (event->event_type) { + case DCD_ADD_CAPACITY: + return cxl_handle_dcd_add_event(mds, dc_extent); + case DCD_RELEASE_CAPACITY: + return cxl_handle_dcd_release_event(mds, dc_extent); + case DCD_FORCED_CAPACITY_RELEASE: + dev_err_ratelimited(dev, "Forced release event ignored.\n"); + return 0; + default: + return -EINVAL; + } + + return 0; +} + static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, enum cxl_event_log_type type) { @@ -1109,9 +1225,17 @@ static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, if (!nr_rec) break; - for (i = 0; i < nr_rec; i++) + for (i = 0; i < nr_rec; i++) { __cxl_event_trace_record(cxlmd, type, &payload->records[i]); + if (type == CXL_EVENT_TYPE_DCD) { + rc = cxl_handle_dcd_event_records(mds, + &payload->records[i]); + if (rc) + dev_err_ratelimited(dev, "dcd event failed: %d\n", + rc); + } + } if (payload->flags & CXL_GET_EVENT_FLAG_OVERFLOW) trace_cxl_overflow(cxlmd, type, payload); @@ -1143,6 +1267,8 @@ void cxl_mem_get_event_records(struct cxl_memdev_state *mds, u32 status) { dev_dbg(mds->cxlds.dev, "Reading event logs: %x\n", status); + if (cxl_dcd_supported(mds) && (status & CXLDEV_EVENT_STATUS_DCD)) + cxl_mem_get_records_log(mds, CXL_EVENT_TYPE_DCD); if (status & CXLDEV_EVENT_STATUS_FATAL) cxl_mem_get_records_log(mds, CXL_EVENT_TYPE_FATAL); if (status & CXLDEV_EVENT_STATUS_FAIL) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 7635ff109578..a07d95136f0d 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1450,6 +1450,57 @@ static int cxl_region_validate_position(struct cxl_region *cxlr, return 0; } +int cxl_region_notify_extent(struct cxl_region *cxlr, enum dc_event event, + struct region_extent *reg_ext) +{ + struct cxl_dax_region *cxlr_dax; + struct device *dev; + int rc = -ENXIO; + + cxlr_dax = cxlr->cxlr_dax; + dev = &cxlr_dax->dev; + dev_dbg(dev, "Trying notify: type %d HPA %#llx - %#llx\n", + event, reg_ext->hpa_range.start, reg_ext->hpa_range.end); + + device_lock(dev); + if (dev->driver) { + struct cxl_driver *reg_drv = to_cxl_drv(dev->driver); + struct cxl_drv_nd nd = (struct cxl_drv_nd) { + .event = event, + .reg_ext = reg_ext, + }; + + if (reg_drv->notify) { + dev_dbg(dev, "Notify: type %d HPA %#llx - %#llx\n", + event, reg_ext->hpa_range.start, + reg_ext->hpa_range.end); + rc = reg_drv->notify(dev, &nd); + } + } + device_unlock(dev); + return rc; +} + +static void calc_hpa_range(struct cxl_endpoint_decoder *cxled, + struct cxl_dax_region *cxlr_dax, + struct cxl_dc_extent *dc_extent, + struct range *dpa_range, + struct range *hpa_range) +{ + resource_size_t dpa_offset, hpa; + + /* + * Without interleave... + * HPA offset == DPA offset + * ... but do the math anyway + */ + dpa_offset = dpa_range->start - cxled->dpa_res->start; + hpa = cxled->cxld.hpa_range.start + dpa_offset; + + hpa_range->start = hpa - cxlr_dax->hpa_range.start; + hpa_range->end = hpa_range->start + range_len(dpa_range) - 1; +} + static int extent_check_overlap(struct device *dev, void *arg) { struct range *new_range = arg; @@ -1480,7 +1531,6 @@ int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, struct cxl_region *cxlr = cxled->cxld.region; struct range ext_dpa_range, ext_hpa_range; struct device *dev = &cxlr->dev; - resource_size_t dpa_offset, hpa; /* * Interleave ways == 1 means this coresponds to a 1:1 mapping between @@ -1502,18 +1552,7 @@ int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, dev_dbg(dev, "Adding DC extent DPA %#llx - %#llx\n", ext_dpa_range.start, ext_dpa_range.end); - /* - * Without interleave... - * HPA offset == DPA offset - * ... but do the math anyway - */ - dpa_offset = ext_dpa_range.start - cxled->dpa_res->start; - hpa = cxled->cxld.hpa_range.start + dpa_offset; - - ext_hpa_range = (struct range) { - .start = hpa - cxlr->cxlr_dax->hpa_range.start, - .end = ext_hpa_range.start + range_len(&ext_dpa_range) - 1, - }; + calc_hpa_range(cxled, cxlr->cxlr_dax, dc_extent, &ext_dpa_range, &ext_hpa_range); if (extent_overlaps(cxlr->cxlr_dax, &ext_hpa_range)) return -EINVAL; @@ -1527,6 +1566,80 @@ int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled, cxled); } +static void cxl_ed_rm_region_extent(struct cxl_region *cxlr, + struct region_extent *reg_ext) +{ + cxl_region_notify_extent(cxlr, DCD_RELEASE_CAPACITY, reg_ext); +} + +struct rm_data { + struct cxl_region *cxlr; + struct range *range; +}; + +static int cxl_rm_reg_ext_by_range(struct device *dev, void *data) +{ + struct rm_data *rm_data = data; + struct region_extent *reg_ext; + + if (!is_region_extent(dev)) + return 0; + reg_ext = to_region_extent(dev); + + /* + * Any extent which 'touches' the released range is notified + * for removal. No partials of the extent are released. + */ + if (range_overlaps(rm_data->range, ®_ext->hpa_range)) { + struct cxl_region *cxlr = rm_data->cxlr; + + dev_dbg(dev, "Remove DAX region ext HPA %#llx - %#llx\n", + reg_ext->hpa_range.start, reg_ext->hpa_range.end); + cxl_ed_rm_region_extent(cxlr, reg_ext); + } + return 0; +} + +static int cxl_ed_rm_extent(struct cxl_endpoint_decoder *cxled, + struct cxl_dc_extent *dc_extent) +{ + struct cxl_region *cxlr = cxled->cxld.region; + struct range hpa_range; + + struct range rel_dpa_range = { + .start = le64_to_cpu(dc_extent->start_dpa), + .end = le64_to_cpu(dc_extent->start_dpa) + + le64_to_cpu(dc_extent->length) - 1, + }; + + calc_hpa_range(cxled, cxlr->cxlr_dax, dc_extent, &rel_dpa_range, &hpa_range); + + struct rm_data rm_data = { + .cxlr = cxlr, + .range = &hpa_range, + }; + + return device_for_each_child(&cxlr->cxlr_dax->dev, &rm_data, + cxl_rm_reg_ext_by_range); +} + +int cxl_ed_notify_extent(struct cxl_endpoint_decoder *cxled, + struct cxl_drv_nd *nd) +{ + switch (nd->event) { + case DCD_ADD_CAPACITY: + return cxl_ed_add_one_extent(cxled, nd->dc_extent); + case DCD_RELEASE_CAPACITY: + return cxl_ed_rm_extent(cxled, nd->dc_extent); + case DCD_FORCED_CAPACITY_RELEASE: + default: + dev_err(&cxled->cxld.dev, "Unknown DC event %d\n", nd->event); + break; + } + return -ENXIO; +} +EXPORT_SYMBOL_NS_GPL(cxl_ed_notify_extent, CXL); + static int cxl_region_attach_position(struct cxl_region *cxlr, struct cxl_root_decoder *cxlrd, struct cxl_endpoint_decoder *cxled, diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 5379ad7f5852..156d7c9a8de5 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -10,6 +10,7 @@ #include <linux/log2.h> #include <linux/node.h> #include <linux/io.h> +#include <linux/cxl-event.h> /** * DOC: cxl objects @@ -613,6 +614,14 @@ struct cxl_pmem_region { struct cxl_pmem_region_mapping mapping[]; }; +/* See CXL 3.0 8.2.9.2.1.5 */ +enum dc_event { + DCD_ADD_CAPACITY, + DCD_RELEASE_CAPACITY, + DCD_FORCED_CAPACITY_RELEASE, + DCD_REGION_CONFIGURATION_UPDATED, +}; + struct cxl_dax_region { struct device dev; struct cxl_region *cxlr; @@ -891,10 +900,18 @@ bool is_cxl_region(struct device *dev); extern struct bus_type cxl_bus_type; +/* Driver Notifier Data */ +struct cxl_drv_nd { + enum dc_event event; + struct cxl_dc_extent *dc_extent; + struct region_extent *reg_ext; +}; + struct cxl_driver { const char *name; int (*probe)(struct device *dev); void (*remove)(struct device *dev); + int (*notify)(struct device *dev, struct cxl_drv_nd *nd); struct device_driver drv; int id; }; @@ -933,6 +950,8 @@ bool is_cxl_nvdimm(struct device *dev); bool is_cxl_nvdimm_bridge(struct device *dev); int devm_cxl_add_nvdimm(struct cxl_memdev *cxlmd); struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_memdev *cxlmd); +bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled, + struct cxl_dc_extent *extent); #ifdef CONFIG_CXL_REGION bool is_cxl_pmem_region(struct device *dev); @@ -940,6 +959,10 @@ struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev); int cxl_add_to_region(struct cxl_port *root, struct cxl_endpoint_decoder *cxled); struct cxl_dax_region *to_cxl_dax_region(struct device *dev); +int cxl_ed_notify_extent(struct cxl_endpoint_decoder *cxled, + struct cxl_drv_nd *nd); +int cxl_region_notify_extent(struct cxl_region *cxlr, enum dc_event event, + struct region_extent *reg_ext); #else static inline bool is_cxl_pmem_region(struct device *dev) { @@ -958,6 +981,17 @@ static inline struct cxl_dax_region *to_cxl_dax_region(struct device *dev) { return NULL; } +static inline int cxl_ed_notify_extent(struct cxl_endpoint_decoder *cxled, + struct cxl_drv_nd *nd) +{ + return 0; +} +static inline int cxl_region_notify_extent(struct cxl_region *cxlr, + enum dc_event event, + struct region_extent *reg_ext) +{ + return 0; +} #endif void cxl_endpoint_parse_cdat(struct cxl_port *port); diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 8f2d8944d334..eb10cae99ff0 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -619,18 +619,6 @@ struct cxl_mbox_dc_response { } __packed extent_list[]; } __packed; -/* - * CXL rev 3.1 section 8.2.9.2.1.6; Table 8-51 - */ -#define CXL_DC_EXTENT_TAG_LEN 0x10 -struct cxl_dc_extent { - __le64 start_dpa; - __le64 length; - u8 tag[CXL_DC_EXTENT_TAG_LEN]; - __le16 shared_extn_seq; - u8 reserved[6]; -} __packed; - /* * Get Dynamic Capacity Extent List; Input Payload * CXL rev 3.1 section 8.2.9.9.9.2; Table 8-166 @@ -714,6 +702,14 @@ struct cxl_mbox_identify { UUID_INIT(0xfe927475, 0xdd59, 0x4339, 0xa5, 0x86, 0x79, 0xba, 0xb1, \ 0x13, 0xb7, 0x74) +/* + * Dynamic Capacity Event Record + * CXL rev 3.1 section 8.2.9.2.1; Table 8-43 + */ +#define CXL_EVENT_DC_EVENT_UUID \ + UUID_INIT(0xca95afa7, 0xf183, 0x4018, 0x8c, 0x2f, 0x95, 0x26, 0x8e, \ + 0x10, 0x1a, 0x2a) + /* * Get Event Records output payload * CXL rev 3.0 section 8.2.9.2.2; Table 8-50 @@ -739,6 +735,7 @@ enum cxl_event_log_type { CXL_EVENT_TYPE_WARN, CXL_EVENT_TYPE_FAIL, CXL_EVENT_TYPE_FATAL, + CXL_EVENT_TYPE_DCD, CXL_EVENT_TYPE_MAX }; diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 0c79d9ce877c..20832f09c40c 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -103,6 +103,50 @@ static int cxl_debugfs_poison_clear(void *data, u64 dpa) DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL, cxl_debugfs_poison_clear, "%llx\n"); +static int match_ep_decoder_by_range(struct device *dev, void *data) +{ + struct cxl_dc_extent *dc_extent = data; + struct cxl_endpoint_decoder *cxled; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + if (!cxled->cxld.region) + return 0; + + return cxl_dc_extent_in_ed(cxled, dc_extent); +} + +static int cxl_mem_notify(struct device *dev, struct cxl_drv_nd *nd) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_port *endpoint = cxlmd->endpoint; + struct cxl_endpoint_decoder *cxled; + struct cxl_dc_extent *dc_extent; + struct device *ep_dev; + int rc; + + dc_extent = nd->dc_extent; + dev_dbg(dev, "notify DC action %d DPA:%#llx LEN:%#llx\n", + nd->event, le64_to_cpu(dc_extent->start_dpa), + le64_to_cpu(dc_extent->length)); + + ep_dev = device_find_child(&endpoint->dev, dc_extent, + match_ep_decoder_by_range); + if (!ep_dev) { + dev_dbg(dev, "Extent DPA:%#llx LEN:%#llx not mapped; evt %d\n", + le64_to_cpu(dc_extent->start_dpa), + le64_to_cpu(dc_extent->length), nd->event); + return -ENXIO; + } + + cxled = to_cxl_endpoint_decoder(ep_dev); + rc = cxl_ed_notify_extent(cxled, nd); + put_device(ep_dev); + return rc; +} + static int cxl_mem_probe(struct device *dev) { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); @@ -244,6 +288,7 @@ __ATTRIBUTE_GROUPS(cxl_mem); static struct cxl_driver cxl_mem_driver = { .name = "cxl_mem", .probe = cxl_mem_probe, + .notify = cxl_mem_notify, .id = CXL_DEVICE_MEMORY_EXPANDER, .drv = { .dev_groups = cxl_mem_groups, diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c index 70bdc7a878ab..83ee45aff69a 100644 --- a/drivers/dax/cxl.c +++ b/drivers/dax/cxl.c @@ -42,6 +42,27 @@ static void cxl_dax_region_add_extents(struct cxl_dax_region *cxlr_dax, device_for_each_child(&cxlr_dax->dev, dax_region, cxl_dax_region_add_extent); } +static int cxl_dax_region_notify(struct device *dev, + struct cxl_drv_nd *nd) +{ + struct cxl_dax_region *cxlr_dax = to_cxl_dax_region(dev); + struct dax_region *dax_region = dev_get_drvdata(dev); + struct region_extent *reg_ext = nd->reg_ext; + + switch (nd->event) { + case DCD_ADD_CAPACITY: + return __cxl_dax_region_add_extent(dax_region, reg_ext); + case DCD_RELEASE_CAPACITY: + return 0; + case DCD_FORCED_CAPACITY_RELEASE: + default: + dev_err(&cxlr_dax->dev, "Unknown DC event %d\n", nd->event); + break; + } + + return -ENXIO; +} + static int cxl_dax_region_probe(struct device *dev) { struct cxl_dax_region *cxlr_dax = to_cxl_dax_region(dev); @@ -85,6 +106,7 @@ static int cxl_dax_region_probe(struct device *dev) static struct cxl_driver cxl_dax_region_driver = { .name = "cxl_dax_region", .probe = cxl_dax_region_probe, + .notify = cxl_dax_region_notify, .id = CXL_DEVICE_DAX_REGION, .drv = { .suppress_bind_attrs = true, diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 03fa6d50d46f..6b745c913f96 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -91,11 +91,42 @@ struct cxl_event_mem_module { u8 reserved[0x3d]; } __packed; +/* + * CXL rev 3.1 section 8.2.9.2.1.6; Table 8-51 + */ +#define CXL_DC_EXTENT_TAG_LEN 0x10 +struct cxl_dc_extent { + __le64 start_dpa; + __le64 length; + u8 tag[CXL_DC_EXTENT_TAG_LEN]; + __le16 shared_extn_seq; + u8 reserved[0x6]; +} __packed; + +/* + * Dynamic Capacity Event Record + * CXL rev 3.1 section 8.2.9.2.1.6; Table 8-50 + */ +struct cxl_event_dcd { + struct cxl_event_record_hdr hdr; + u8 event_type; + u8 validity_flags; + __le16 host_id; + u8 region_index; + u8 flags; + u8 reserved1[0x2]; + struct cxl_dc_extent extent; + u8 reserved2[0x18]; + __le32 num_avail_extents; + __le32 num_avail_tags; +} __packed; + union cxl_event { struct cxl_event_generic generic; struct cxl_event_gen_media gen_media; struct cxl_event_dram dram; struct cxl_event_mem_module mem_module; + struct cxl_event_dcd dcd; } __packed; /*