Message ID | 20241029151922.459139-9-kbusch@meta.com
---|---
State | New
Series | write hints with nvme fdp, scsi streams
On 10/29/24 08:19, Keith Busch wrote:
> From: Kanchan Joshi <joshi.k@samsung.com>
>
> Flexible Data Placement (FDP), as ratified in TP 4146a, allows the host
> to control the placement of logical blocks so as to reduce the SSD WAF.
> Userspace can send the write hint information using io_uring or fcntl.
>
> Fetch the placement-identifiers if the device supports FDP. The incoming
> write-hint is mapped to a placement-identifier, which in turn is set in
> the DSPEC field of the write command.
>
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Hui Qi <hui81.qi@samsung.com>
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Reviewed-by: Hannes Reinecke <hare@suse.de>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>  drivers/nvme/host/core.c | 84 ++++++++++++++++++++++++++++++++++++++++
>  drivers/nvme/host/nvme.h |  5 +++
>  include/linux/nvme.h     | 19 +++++++++
>  3 files changed, 108 insertions(+)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 3de7555a7de74..bd7b89912ddb9 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -44,6 +44,20 @@ struct nvme_ns_info {
>          bool is_removed;
>  };
>
> +struct nvme_fdp_ruh_status_desc {
> +        u16 pid;
> +        u16 ruhid;
> +        u32 earutr;
> +        u64 ruamw;
> +        u8 rsvd16[16];
> +};
> +
> +struct nvme_fdp_ruh_status {
> +        u8 rsvd0[14];
> +        __le16 nruhsd;
> +        struct nvme_fdp_ruh_status_desc ruhsd[];
> +};
> +
>  unsigned int admin_timeout = 60;
>  module_param(admin_timeout, uint, 0644);
>  MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
> @@ -657,6 +671,7 @@ static void nvme_free_ns_head(struct kref *ref)
>          ida_free(&head->subsys->ns_ida, head->instance);
>          cleanup_srcu_struct(&head->srcu);
>          nvme_put_subsystem(head->subsys);
> +        kfree(head->plids);
>          kfree(head);
>  }
>
> @@ -974,6 +989,13 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>          if (req->cmd_flags & REQ_RAHEAD)
>                  dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>
> +        if (req->write_hint && ns->head->nr_plids) {
> +                u16 hint = max(req->write_hint, ns->head->nr_plids);
> +
> +                dsmgmt |= ns->head->plids[hint - 1] << 16;
> +                control |= NVME_RW_DTYPE_DPLCMT;
> +        }
> +
>          if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
>                  return BLK_STS_INVAL;
>
> @@ -2105,6 +2127,52 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
>          return ret;
>  }
>
> +static int nvme_fetch_fdp_plids(struct nvme_ns *ns, u32 nsid)
> +{
> +        struct nvme_fdp_ruh_status_desc *ruhsd;
> +        struct nvme_ns_head *head = ns->head;
> +        struct nvme_fdp_ruh_status *ruhs;
> +        struct nvme_command c = {};
> +        int size, ret, i;
> +
> +        if (head->plids)
> +                return 0;
> +
> +        size = struct_size(ruhs, ruhsd, NVME_MAX_PLIDS);
> +        ruhs = kzalloc(size, GFP_KERNEL);
> +        if (!ruhs)
> +                return -ENOMEM;
> +
> +        c.imr.opcode = nvme_cmd_io_mgmt_recv;
> +        c.imr.nsid = cpu_to_le32(nsid);
> +        c.imr.mo = 0x1;

Can we please add a comment where values are hardcoded?

> +        c.imr.numd = cpu_to_le32((size >> 2) - 1);
> +
> +        ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
> +        if (ret)
> +                goto out;
> +
> +        i = le16_to_cpu(ruhs->nruhsd);

Instead of i, why can't we use a local variable nr_plids?

> +        if (!i)
> +                goto out;
> +
> +        ns->head->nr_plids = min_t(u16, i, NVME_MAX_PLIDS);
> +        head->plids = kcalloc(ns->head->nr_plids, sizeof(head->plids),
> +                        GFP_KERNEL);
> +        if (!head->plids) {
> +                ret = -ENOMEM;
> +                goto out;
> +        }
> +
> +        for (i = 0; i < ns->head->nr_plids; i++) {
> +                ruhsd = &ruhs->ruhsd[i];
> +                head->plids[i] = le16_to_cpu(ruhsd->pid);
> +        }
> +out:
> +        kfree(ruhs);
> +        return ret;
> +}
> +
>  static int nvme_update_ns_info_block(struct nvme_ns *ns,
>                  struct nvme_ns_info *info)
>  {
> @@ -2141,6 +2209,19 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
>                  goto out;
>          }
>
> +        if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) {
> +                ret = nvme_fetch_fdp_plids(ns, info->nsid);
> +                if (ret)
> +                        dev_warn(ns->ctrl->device,
> +                                "FDP failure status:0x%x\n", ret);
> +                if (ret < 0)
> +                        goto out;
> +        } else {
> +                ns->head->nr_plids = 0;
> +                kfree(ns->head->plids);
> +                ns->head->plids = NULL;
> +        }
> +
>          blk_mq_freeze_queue(ns->disk->queue);
>          ns->head->lba_shift = id->lbaf[lbaf].ds;
>          ns->head->nuse = le64_to_cpu(id->nuse);
> @@ -2171,6 +2252,9 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
>          if (!nvme_init_integrity(ns->head, &lim, info))
>                  capacity = 0;
>
> +        lim.max_write_hints = ns->head->nr_plids;
> +        if (lim.max_write_hints)
> +                lim.features |= BLK_FEAT_PLACEMENT_HINTS;
>          ret = queue_limits_commit_update(ns->disk->queue, &lim);
>          if (ret) {
>                  blk_mq_unfreeze_queue(ns->disk->queue);
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 093cb423f536b..cec8e5d96377b 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -454,6 +454,8 @@ struct nvme_ns_ids {
>          u8 csi;
>  };
>
> +#define NVME_MAX_PLIDS (NVME_CTRL_PAGE_SIZE / sizeof(16))

This calculates how many plids can fit into the ctrl page size?
Sorry, but I didn't understand sizeof(16) here; since plids are u16
(nvme_ns_head -> u16 *plids), should this be sizeof(u16)?

-ck
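For the "io_uring or fcntl" sentence in the commit message, a minimal userspace sketch of the fcntl path, assuming the existing F_SET_RW_HINT / RWH_WRITE_LIFE_* interface (exposed by glibc's <fcntl.h> with _GNU_SOURCE on recent glibc) is the fcntl being referred to; the file path and hint value are illustrative only:

#define _GNU_SOURCE
#include <fcntl.h>      /* F_SET_RW_HINT, RWH_WRITE_LIFE_* */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* per-file write hint; applies to subsequent writes on this fd */
        uint64_t hint = RWH_WRITE_LIFE_SHORT;
        int fd = open("testfile", O_WRONLY | O_CREAT, 0644);

        if (fd < 0 || fcntl(fd, F_SET_RW_HINT, &hint) < 0) {
                perror("F_SET_RW_HINT");
                return 1;
        }
        /*
         * Writes issued now carry the hint; on an FDP-enabled namespace the
         * driver maps it to a placement identifier in the command's DSPEC.
         */
        close(fd);
        return 0;
}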
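A sketch of what the three review points above might look like if applied. The comment wording, the nr_plids local, and the sizeof(u16) change are illustrative suggestions, not code from the posted patch; MO 01h is the Reclaim Unit Handle Status operation of I/O Management Receive per TP4146:

/* plids are u16, so divide the controller page size by sizeof(u16) */
#define NVME_MAX_PLIDS  (NVME_CTRL_PAGE_SIZE / sizeof(u16))

static int nvme_fetch_fdp_plids(struct nvme_ns *ns, u32 nsid)
{
        ...
        u16 nr_plids;

        c.imr.opcode = nvme_cmd_io_mgmt_recv;
        c.imr.nsid = cpu_to_le32(nsid);
        /* MO 01h: Reclaim Unit Handle Status */
        c.imr.mo = 0x1;
        /* NUMD is a zero-based dword count of the receive buffer */
        c.imr.numd = cpu_to_le32((size >> 2) - 1);

        ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
        if (ret)
                goto out;

        /* descriptive local instead of reusing 'i' */
        nr_plids = le16_to_cpu(ruhs->nruhsd);
        if (!nr_plids)
                goto out;

        head->nr_plids = min_t(u16, nr_plids, NVME_MAX_PLIDS);
        ...
}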
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 3de7555a7de74..bd7b89912ddb9 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -44,6 +44,20 @@ struct nvme_ns_info {
         bool is_removed;
 };
 
+struct nvme_fdp_ruh_status_desc {
+        u16 pid;
+        u16 ruhid;
+        u32 earutr;
+        u64 ruamw;
+        u8 rsvd16[16];
+};
+
+struct nvme_fdp_ruh_status {
+        u8 rsvd0[14];
+        __le16 nruhsd;
+        struct nvme_fdp_ruh_status_desc ruhsd[];
+};
+
 unsigned int admin_timeout = 60;
 module_param(admin_timeout, uint, 0644);
 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
@@ -657,6 +671,7 @@ static void nvme_free_ns_head(struct kref *ref)
         ida_free(&head->subsys->ns_ida, head->instance);
         cleanup_srcu_struct(&head->srcu);
         nvme_put_subsystem(head->subsys);
+        kfree(head->plids);
         kfree(head);
 }
 
@@ -974,6 +989,13 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
         if (req->cmd_flags & REQ_RAHEAD)
                 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 
+        if (req->write_hint && ns->head->nr_plids) {
+                u16 hint = max(req->write_hint, ns->head->nr_plids);
+
+                dsmgmt |= ns->head->plids[hint - 1] << 16;
+                control |= NVME_RW_DTYPE_DPLCMT;
+        }
+
         if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
                 return BLK_STS_INVAL;
 
@@ -2105,6 +2127,52 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
         return ret;
 }
 
+static int nvme_fetch_fdp_plids(struct nvme_ns *ns, u32 nsid)
+{
+        struct nvme_fdp_ruh_status_desc *ruhsd;
+        struct nvme_ns_head *head = ns->head;
+        struct nvme_fdp_ruh_status *ruhs;
+        struct nvme_command c = {};
+        int size, ret, i;
+
+        if (head->plids)
+                return 0;
+
+        size = struct_size(ruhs, ruhsd, NVME_MAX_PLIDS);
+        ruhs = kzalloc(size, GFP_KERNEL);
+        if (!ruhs)
+                return -ENOMEM;
+
+        c.imr.opcode = nvme_cmd_io_mgmt_recv;
+        c.imr.nsid = cpu_to_le32(nsid);
+        c.imr.mo = 0x1;
+        c.imr.numd = cpu_to_le32((size >> 2) - 1);
+
+        ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
+        if (ret)
+                goto out;
+
+        i = le16_to_cpu(ruhs->nruhsd);
+        if (!i)
+                goto out;
+
+        ns->head->nr_plids = min_t(u16, i, NVME_MAX_PLIDS);
+        head->plids = kcalloc(ns->head->nr_plids, sizeof(head->plids),
+                        GFP_KERNEL);
+        if (!head->plids) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        for (i = 0; i < ns->head->nr_plids; i++) {
+                ruhsd = &ruhs->ruhsd[i];
+                head->plids[i] = le16_to_cpu(ruhsd->pid);
+        }
+out:
+        kfree(ruhs);
+        return ret;
+}
+
 static int nvme_update_ns_info_block(struct nvme_ns *ns,
                 struct nvme_ns_info *info)
 {
@@ -2141,6 +2209,19 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
                 goto out;
         }
 
+        if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) {
+                ret = nvme_fetch_fdp_plids(ns, info->nsid);
+                if (ret)
+                        dev_warn(ns->ctrl->device,
+                                "FDP failure status:0x%x\n", ret);
+                if (ret < 0)
+                        goto out;
+        } else {
+                ns->head->nr_plids = 0;
+                kfree(ns->head->plids);
+                ns->head->plids = NULL;
+        }
+
         blk_mq_freeze_queue(ns->disk->queue);
         ns->head->lba_shift = id->lbaf[lbaf].ds;
         ns->head->nuse = le64_to_cpu(id->nuse);
@@ -2171,6 +2252,9 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
         if (!nvme_init_integrity(ns->head, &lim, info))
                 capacity = 0;
 
+        lim.max_write_hints = ns->head->nr_plids;
+        if (lim.max_write_hints)
+                lim.features |= BLK_FEAT_PLACEMENT_HINTS;
         ret = queue_limits_commit_update(ns->disk->queue, &lim);
         if (ret) {
                 blk_mq_unfreeze_queue(ns->disk->queue);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 093cb423f536b..cec8e5d96377b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -454,6 +454,8 @@ struct nvme_ns_ids {
         u8 csi;
 };
 
+#define NVME_MAX_PLIDS (NVME_CTRL_PAGE_SIZE / sizeof(16))
+
 /*
  * Anchor structure for namespaces. There is one for each namespace in a
  * NVMe subsystem that any of our controllers can see, and the namespace
@@ -490,6 +492,9 @@ struct nvme_ns_head {
         struct device cdev_device;
 
         struct gendisk *disk;
+
+        u16 nr_plids;
+        u16 *plids;
 #ifdef CONFIG_NVME_MULTIPATH
         struct bio_list requeue_list;
         spinlock_t requeue_lock;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index b58d9405d65e0..a954eaee5b0f3 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -275,6 +275,7 @@ enum nvme_ctrl_attr {
         NVME_CTRL_ATTR_HID_128_BIT = (1 << 0),
         NVME_CTRL_ATTR_TBKAS = (1 << 6),
         NVME_CTRL_ATTR_ELBAS = (1 << 15),
+        NVME_CTRL_ATTR_FDPS = (1 << 19),
 };
 
 struct nvme_id_ctrl {
@@ -843,6 +844,7 @@ enum nvme_opcode {
         nvme_cmd_resv_register = 0x0d,
         nvme_cmd_resv_report = 0x0e,
         nvme_cmd_resv_acquire = 0x11,
+        nvme_cmd_io_mgmt_recv = 0x12,
         nvme_cmd_resv_release = 0x15,
         nvme_cmd_zone_mgmt_send = 0x79,
         nvme_cmd_zone_mgmt_recv = 0x7a,
@@ -864,6 +866,7 @@ enum nvme_opcode {
         nvme_opcode_name(nvme_cmd_resv_register), \
         nvme_opcode_name(nvme_cmd_resv_report), \
         nvme_opcode_name(nvme_cmd_resv_acquire), \
+        nvme_opcode_name(nvme_cmd_io_mgmt_recv), \
         nvme_opcode_name(nvme_cmd_resv_release), \
         nvme_opcode_name(nvme_cmd_zone_mgmt_send), \
         nvme_opcode_name(nvme_cmd_zone_mgmt_recv), \
@@ -1015,6 +1018,7 @@ enum {
         NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
         NVME_RW_PRINFO_PRACT = 1 << 13,
         NVME_RW_DTYPE_STREAMS = 1 << 4,
+        NVME_RW_DTYPE_DPLCMT = 2 << 4,
         NVME_WZ_DEAC = 1 << 9,
 };
 
@@ -1102,6 +1106,20 @@ struct nvme_zone_mgmt_recv_cmd {
         __le32 cdw14[2];
 };
 
+struct nvme_io_mgmt_recv_cmd {
+        __u8 opcode;
+        __u8 flags;
+        __u16 command_id;
+        __le32 nsid;
+        __le64 rsvd2[2];
+        union nvme_data_ptr dptr;
+        __u8 mo;
+        __u8 rsvd11;
+        __u16 mos;
+        __le32 numd;
+        __le32 cdw12[4];
+};
+
 enum {
         NVME_ZRA_ZONE_REPORT = 0,
         NVME_ZRASF_ZONE_REPORT_ALL = 0,
@@ -1822,6 +1840,7 @@ struct nvme_command {
         struct nvmf_auth_receive_command auth_receive;
         struct nvme_dbbuf dbbuf;
         struct nvme_directive_cmd directive;
+        struct nvme_io_mgmt_recv_cmd imr;
         };
 };
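For readers following the nvme_setup_rw() hunk above, a short sketch of how the placement identifier ends up in the write command. The field layout (DSM in Dword 13 bits 7:0, DSPEC in bits 31:16, Directive Type 2h selecting data placement via the control word) comes from the NVMe specification; the variable names simply mirror the patch:

        /*
         * nvme_setup_rw() accumulates Dword 13 in 'dsmgmt': DSM flags live in
         * bits 7:0, DSPEC (here the FDP placement identifier) in bits 31:16.
         */
        dsmgmt |= ns->head->plids[hint - 1] << 16;

        /*
         * 'control' is the upper half of Dword 12; DTYPE sits in its bits 7:4,
         * and value 2h selects the data placement directive.
         */
        control |= NVME_RW_DTYPE_DPLCMT;        /* 2 << 4 */

        cmnd->rw.control = cpu_to_le16(control);
        cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);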