Message ID | 20190502114801.23116-9-mlevitsk@redhat.com (mailing list archive)
---|---
State | New, archived
Series | RFC: NVME MDEV
On Thu, 2019-05-02 at 14:47 +0300, Maxim Levitsky wrote:
> Note that currently the number of hw queues reserved for mdev,
> has to be pre determined on module load.
>
> (I used to allocate the queues dynamicaly on demand, but
> recent changes to allocate polled/read queues made
> this somewhat difficult, so I dropped this for now)
>
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>  drivers/nvme/host/pci.c  | 375 ++++++++++++++++++++++++++++++++++++++-
>  drivers/nvme/mdev/host.c |  46 ++---
>  drivers/nvme/mdev/io.c   |  46 +++--
>  drivers/nvme/mdev/mmio.c |   3 -
>  4 files changed, 421 insertions(+), 49 deletions(-)

The changes in drivers/nvme/mdev aren't supposed to be here; this was some
code moved around to reduce the diff in the generic block layer support code,
and it was supposed to go into the main mdev commit.

Best regards,
	Maxim Levitsky
On 5/2/2019 5:47 AM, Maxim Levitsky wrote: > Note that currently the number of hw queues reserved for mdev, > has to be pre determined on module load. > > (I used to allocate the queues dynamicaly on demand, but > recent changes to allocate polled/read queues made > this somewhat difficult, so I dropped this for now) > > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com> > --- > drivers/nvme/host/pci.c | 375 ++++++++++++++++++++++++++++++++++++++- > drivers/nvme/mdev/host.c | 46 ++--- > drivers/nvme/mdev/io.c | 46 +++-- > drivers/nvme/mdev/mmio.c | 3 - > 4 files changed, 421 insertions(+), 49 deletions(-) > > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c > index 282f28c851c1..87507e710374 100644 > --- a/drivers/nvme/host/pci.c > +++ b/drivers/nvme/host/pci.c > @@ -23,6 +23,7 @@ > #include <linux/io-64-nonatomic-lo-hi.h> > #include <linux/sed-opal.h> > #include <linux/pci-p2pdma.h> > +#include "../mdev/mdev.h" > > #include "trace.h" > #include "nvme.h" > @@ -32,6 +33,7 @@ > > #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) > > +#define USE_SMALL_PRP_POOL(nprps) ((nprps) < (256 / 8)) > /* > * These can be higher, but we need to ensure that any command doesn't > * require an sg allocation that needs more than a page of data. > @@ -83,12 +85,24 @@ static int poll_queues = 0; > module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644); > MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); > > +static int mdev_queues; > +#ifdef CONFIG_NVME_MDEV > +module_param_cb(mdev_queues, &queue_count_ops, &mdev_queues, 0644); > +MODULE_PARM_DESC(mdev_queues, "Number of queues to use for mediated VFIO"); > +#endif > + > struct nvme_dev; > struct nvme_queue; > > static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); > static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode); > > +#ifdef CONFIG_NVME_MDEV > +static void nvme_ext_queue_reset(struct nvme_dev *dev, u16 qid); > +#else > +static void nvme_ext_queue_reset(struct nvme_dev *dev, u16 qid) {} > +#endif > + > /* > * Represents an NVM Express device. Each nvme_dev is a PCI function. > */ > @@ -103,6 +117,7 @@ struct nvme_dev { > unsigned online_queues; > unsigned max_qid; > unsigned io_queues[HCTX_MAX_TYPES]; > + unsigned int mdev_queues; > unsigned int num_vecs; > int q_depth; > u32 db_stride; > @@ -110,6 +125,7 @@ struct nvme_dev { > unsigned long bar_mapped_size; > struct work_struct remove_work; > struct mutex shutdown_lock; > + struct mutex ext_dev_lock; > bool subsystem; > u64 cmb_size; > bool cmb_use_sqes; > @@ -172,6 +188,16 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) > return container_of(ctrl, struct nvme_dev, ctrl); > } > > +/* Simplified IO descriptor for MDEV use */ > +struct nvme_ext_iod { > + struct list_head link; > + u32 user_tag; > + int nprps; > + struct nvme_ext_data_iter *saved_iter; > + dma_addr_t first_prplist_dma; > + __le64 *prpslists[NVME_MAX_SEGS]; > +}; > + > /* > * An NVM Express queue. Each device has at least two (one for admin > * commands and one for I/O commands). 
> @@ -196,15 +222,26 @@ struct nvme_queue { > u16 qid; > u8 cq_phase; > unsigned long flags; > + > #define NVMEQ_ENABLED 0 > #define NVMEQ_SQ_CMB 1 > #define NVMEQ_DELETE_ERROR 2 > #define NVMEQ_POLLED 3 > +#define NVMEQ_EXTERNAL 4 > + > u32 *dbbuf_sq_db; > u32 *dbbuf_cq_db; > u32 *dbbuf_sq_ei; > u32 *dbbuf_cq_ei; > struct completion delete_done; > + > + /* queue passthrough for external use */ > + struct { > + int inflight; > + struct nvme_ext_iod *iods; > + struct list_head free_iods; > + struct list_head used_iods; > + } ext; > }; > > /* > @@ -255,7 +292,7 @@ static inline void _nvme_check_size(void) > > static unsigned int max_io_queues(void) > { > - return num_possible_cpus() + write_queues + poll_queues; > + return num_possible_cpus() + write_queues + poll_queues + mdev_queues; > } > > static unsigned int max_queue_count(void) > @@ -1066,6 +1103,7 @@ static irqreturn_t nvme_irq(int irq, void *data) > * the irq handler, even if that was on another CPU. > */ > rmb(); > + > if (nvmeq->cq_head != nvmeq->last_cq_head) > ret = IRQ_HANDLED; > nvme_process_cq(nvmeq, &start, &end, -1); > @@ -1553,7 +1591,11 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) > memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); > nvme_dbbuf_init(dev, nvmeq, qid); > dev->online_queues++; > + > wmb(); /* ensure the first interrupt sees the initialization */ > + > + if (test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)) > + nvme_ext_queue_reset(nvmeq->dev, qid); > } > > static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) > @@ -1759,7 +1801,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev) > } > > max = min(dev->max_qid, dev->ctrl.queue_count - 1); > - if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) { > + if (max != 1) { > rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] + > dev->io_queues[HCTX_TYPE_READ]; > } else { > @@ -2095,14 +2137,23 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) > * Poll queues don't need interrupts, but we need at least one IO > * queue left over for non-polled IO. 
> */ > - this_p_queues = poll_queues; > + this_p_queues = poll_queues + mdev_queues; > if (this_p_queues >= nr_io_queues) { > this_p_queues = nr_io_queues - 1; > irq_queues = 1; > } else { > irq_queues = nr_io_queues - this_p_queues + 1; > } > + > + if (mdev_queues > this_p_queues) { > + mdev_queues = this_p_queues; > + this_p_queues = 0; > + } else { > + this_p_queues -= mdev_queues; > + } > + > dev->io_queues[HCTX_TYPE_POLL] = this_p_queues; > + dev->mdev_queues = mdev_queues; > > /* Initialize for the single interrupt case */ > dev->io_queues[HCTX_TYPE_DEFAULT] = 1; > @@ -2170,7 +2221,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) > > dev->num_vecs = result; > result = max(result - 1, 1); > - dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL]; > + dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL] + > + dev->mdev_queues; > > /* > * Should investigate if there's a performance win from allocating > @@ -2193,10 +2245,11 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) > nvme_suspend_io_queues(dev); > goto retry; > } > - dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n", > + dev_info(dev->ctrl.device, "%d/%d/%d/%d default/read/poll/mdev queues\n", > dev->io_queues[HCTX_TYPE_DEFAULT], > dev->io_queues[HCTX_TYPE_READ], > - dev->io_queues[HCTX_TYPE_POLL]); > + dev->io_queues[HCTX_TYPE_POLL], > + dev->mdev_queues); > return 0; > } > > @@ -2623,6 +2676,301 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) > nvme_put_ctrl(&dev->ctrl); > } > > +#ifdef CONFIG_NVME_MDEV > +static void nvme_ext_free_iod(struct nvme_dev *dev, struct nvme_ext_iod *iod) > +{ > + int i = 0, max_prp, nprps = iod->nprps; > + dma_addr_t dma = iod->first_prplist_dma; > + > + if (iod->saved_iter) { > + iod->saved_iter->release(iod->saved_iter); > + iod->saved_iter = NULL; > + } > + > + if (--nprps < 2) { > + goto out; > + } else if (USE_SMALL_PRP_POOL(nprps)) { > + dma_pool_free(dev->prp_small_pool, iod->prpslists[0], dma); > + goto out; > + } > + > + max_prp = (dev->ctrl.page_size >> 3) - 1; > + while (nprps > 0) { > + if (i > 0) { > + dma = iod->prpslists[i - 1][max_prp]; > + if (nprps == 1) > + break; > + } > + dma_pool_free(dev->prp_page_pool, iod->prpslists[i++], dma); > + nprps -= max_prp; > + } > +out: > + iod->nprps = -1; > + iod->first_prplist_dma = 0; > + iod->user_tag = 0xDEADDEAD; > +} > + > +static int nvme_ext_setup_iod(struct nvme_dev *dev, struct nvme_ext_iod *iod, > + struct nvme_common_command *cmd, > + struct nvme_ext_data_iter *iter) > +{ > + int ret, i, j; > + __le64 *prp_list; > + dma_addr_t prp_dma; > + struct dma_pool *pool; > + int max_prp = (dev->ctrl.page_size >> 3) - 1; > + > + iod->saved_iter = iter && iter->release ? iter : NULL; > + iod->nprps = iter ? iter->count : 0; > + cmd->dptr.prp1 = 0; > + cmd->dptr.prp2 = 0; > + cmd->metadata = 0; > + > + if (!iter) > + return 0; > + > + /* put first pointer*/ > + cmd->dptr.prp1 = cpu_to_le64(iter->host_iova); > + if (iter->count == 1) > + return 0; > + > + ret = iter->next(iter); > + if (ret) > + goto error; > + > + /* if only have one more pointer, put it to second data pointer*/ > + if (iter->count == 1) { > + cmd->dptr.prp2 = cpu_to_le64(iter->host_iova); > + return 0; > + } > + > + pool = USE_SMALL_PRP_POOL(iter->count) ? 
dev->prp_small_pool : > + dev->prp_page_pool; > + > + /* Allocate prp lists as needed and fill them */ > + for (i = 0 ; i < NVME_MAX_SEGS && iter->count ; i++) { > + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); > + if (!prp_list) { > + ret = -ENOMEM; > + goto error; > + } > + > + iod->prpslists[i++] = prp_list; > + > + if (i == 1) { > + iod->first_prplist_dma = prp_dma; > + cmd->dptr.prp2 = cpu_to_le64(prp_dma); > + j = 0; > + } else { > + prp_list[0] = iod->prpslists[i - 1][max_prp]; > + iod->prpslists[i - 1][max_prp] = prp_dma; > + j = 1; > + } > + > + while (j <= max_prp && iter->count) { > + prp_list[j++] = iter->host_iova; > + ret = iter->next(iter); > + if (ret) > + goto error; > + } > + } > + > + if (iter->count) { > + ret = -ENOSPC; > + goto error; > + } > + return 0; > +error: > + iod->nprps -= iter->count; > + nvme_ext_free_iod(dev, iod); > + return ret; > +} > + > +static int nvme_ext_queues_available(struct nvme_ctrl *ctrl) > +{ > + struct nvme_dev *dev = to_nvme_dev(ctrl); > + unsigned int ret = 0, qid; > + unsigned int first_mdev_q = dev->online_queues - dev->mdev_queues; > + > + for (qid = first_mdev_q; qid < dev->online_queues; qid++) { > + struct nvme_queue *nvmeq = &dev->queues[qid]; > + > + if (!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)) > + ret++; > + } > + return ret; > +} > + > +static void nvme_ext_queue_reset(struct nvme_dev *dev, u16 qid) > +{ > + struct nvme_queue *nvmeq = &dev->queues[qid]; > + struct nvme_ext_iod *iod, *tmp; > + > + list_for_each_entry_safe(iod, tmp, &nvmeq->ext.used_iods, link) { > + if (iod->saved_iter && iod->saved_iter->release) { > + iod->saved_iter->release(iod->saved_iter); > + iod->saved_iter = NULL; > + list_move(&iod->link, &nvmeq->ext.free_iods); > + } > + } > + > + nvmeq->ext.inflight = 0; > +} > + > +static int nvme_ext_queue_alloc(struct nvme_ctrl *ctrl, u16 *ret_qid) > +{ > + struct nvme_dev *dev = to_nvme_dev(ctrl); > + struct nvme_queue *nvmeq; > + int ret = 0, qid, i; > + unsigned int first_mdev_q = dev->online_queues - dev->mdev_queues; > + > + mutex_lock(&dev->ext_dev_lock); > + > + /* find a polled queue to allocate */ > + for (qid = dev->online_queues - 1 ; qid >= first_mdev_q ; qid--) { > + nvmeq = &dev->queues[qid]; > + if (!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)) > + break; > + } > + > + if (qid < first_mdev_q) { > + ret = -ENOSPC; > + goto out; > + } > + > + INIT_LIST_HEAD(&nvmeq->ext.free_iods); > + INIT_LIST_HEAD(&nvmeq->ext.used_iods); > + > + nvmeq->ext.iods = > + vzalloc_node(sizeof(struct nvme_ext_iod) * nvmeq->q_depth, > + dev_to_node(dev->dev)); > + > + if (!nvmeq->ext.iods) { > + ret = -ENOMEM; > + goto out; > + } > + > + for (i = 0 ; i < nvmeq->q_depth ; i++) > + list_add_tail(&nvmeq->ext.iods[i].link, &nvmeq->ext.free_iods); > + > + set_bit(NVMEQ_EXTERNAL, &nvmeq->flags); > + *ret_qid = qid; > +out: > + mutex_unlock(&dev->ext_dev_lock); > + return ret; > +} > + > +static void nvme_ext_queue_free(struct nvme_ctrl *ctrl, u16 qid) > +{ > + struct nvme_dev *dev = to_nvme_dev(ctrl); > + struct nvme_queue *nvmeq; > + > + mutex_lock(&dev->ext_dev_lock); > + nvmeq = &dev->queues[qid]; > + > + if (WARN_ON(!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags))) > + return; This condition is probably not expected to happen (since its a warning) but do you need to unlock the ext_dev_lock before returning? 
> + > + nvme_ext_queue_reset(dev, qid); > + > + vfree(nvmeq->ext.iods); > + nvmeq->ext.iods = NULL; > + INIT_LIST_HEAD(&nvmeq->ext.free_iods); > + INIT_LIST_HEAD(&nvmeq->ext.used_iods); > + > + clear_bit(NVMEQ_EXTERNAL, &nvmeq->flags); > + mutex_unlock(&dev->ext_dev_lock); > +} > + > +static int nvme_ext_queue_submit(struct nvme_ctrl *ctrl, u16 qid, u32 user_tag, > + struct nvme_command *command, > + struct nvme_ext_data_iter *iter) > +{ > + struct nvme_dev *dev = to_nvme_dev(ctrl); > + struct nvme_queue *nvmeq = &dev->queues[qid]; > + struct nvme_ext_iod *iod; > + int ret; > + > + if (WARN_ON(!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags))) > + return -EINVAL; > + > + if (list_empty(&nvmeq->ext.free_iods)) > + return -1; > + > + iod = list_first_entry(&nvmeq->ext.free_iods, > + struct nvme_ext_iod, link); > + > + list_move(&iod->link, &nvmeq->ext.used_iods); > + > + command->common.command_id = cpu_to_le16(iod - nvmeq->ext.iods); > + iod->user_tag = user_tag; > + > + ret = nvme_ext_setup_iod(dev, iod, &command->common, iter); > + if (ret) { > + list_move(&iod->link, &nvmeq->ext.free_iods); > + return ret; > + } > + > + nvmeq->ext.inflight++; > + nvme_submit_cmd(nvmeq, command, true); > + return 0; > +} > + > +static int nvme_ext_queue_poll(struct nvme_ctrl *ctrl, u16 qid, > + struct nvme_ext_cmd_result *results, > + unsigned int max_len) > +{ > + struct nvme_dev *dev = to_nvme_dev(ctrl); > + struct nvme_queue *nvmeq = &dev->queues[qid]; > + u16 old_head; > + int i, j; > + > + if (WARN_ON(!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags))) > + return -EINVAL; > + > + if (nvmeq->ext.inflight == 0) > + return -1; > + > + old_head = nvmeq->cq_head; > + > + for (i = 0 ; nvme_cqe_pending(nvmeq) && i < max_len ; i++) { > + u16 status = le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status); > + u16 tag = le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].command_id); > + > + results[i].status = status >> 1; > + results[i].tag = (u32)tag; > + nvme_update_cq_head(nvmeq); > + } > + > + if (old_head != nvmeq->cq_head) > + nvme_ring_cq_doorbell(nvmeq); > + > + for (j = 0 ; j < i ; j++) { > + u16 tag = results[j].tag & 0xFFFF; > + struct nvme_ext_iod *iod = &nvmeq->ext.iods[tag]; > + > + if (WARN_ON(tag >= nvmeq->q_depth || iod->nprps == -1)) > + continue; > + > + results[j].tag = iod->user_tag; > + nvme_ext_free_iod(dev, iod); > + list_move(&iod->link, &nvmeq->ext.free_iods); > + nvmeq->ext.inflight--; > + } > + > + WARN_ON(nvmeq->ext.inflight < 0); > + return i; > +} > + > +static bool nvme_ext_queue_full(struct nvme_ctrl *ctrl, u16 qid) > +{ > + struct nvme_dev *dev = to_nvme_dev(ctrl); > + struct nvme_queue *nvmeq = &dev->queues[qid]; > + > + return nvmeq->ext.inflight < nvmeq->q_depth - 1; > +} > +#endif > + > static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) > { > *val = readl(to_nvme_dev(ctrl)->bar + off); > @@ -2652,13 +3000,25 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { > .name = "pcie", > .module = THIS_MODULE, > .flags = NVME_F_METADATA_SUPPORTED | > - NVME_F_PCI_P2PDMA, > + NVME_F_PCI_P2PDMA | > + NVME_F_MDEV_SUPPORTED | > + NVME_F_MDEV_DMA_SUPPORTED, > + > .reg_read32 = nvme_pci_reg_read32, > .reg_write32 = nvme_pci_reg_write32, > .reg_read64 = nvme_pci_reg_read64, > .free_ctrl = nvme_pci_free_ctrl, > .submit_async_event = nvme_pci_submit_async_event, > .get_address = nvme_pci_get_address, > + > +#ifdef CONFIG_NVME_MDEV > + .ext_queues_available = nvme_ext_queues_available, > + .ext_queue_alloc = nvme_ext_queue_alloc, > + .ext_queue_free = nvme_ext_queue_free, > + .ext_queue_submit 
= nvme_ext_queue_submit, > + .ext_queue_poll = nvme_ext_queue_poll, > + .ext_queue_full = nvme_ext_queue_full, > +#endif > }; > > static int nvme_dev_map(struct nvme_dev *dev) > @@ -2747,6 +3107,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) > INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); > INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); > mutex_init(&dev->shutdown_lock); > + mutex_init(&dev->ext_dev_lock); > > result = nvme_setup_prp_pools(dev); > if (result) > diff --git a/drivers/nvme/mdev/host.c b/drivers/nvme/mdev/host.c > index 5766bad7e909..6590946b86c2 100644 > --- a/drivers/nvme/mdev/host.c > +++ b/drivers/nvme/mdev/host.c > @@ -48,19 +48,21 @@ static struct nvme_mdev_hctrl *nvme_mdev_hctrl_create(struct nvme_ctrl *ctrl) > return NULL; > } > > + hctrl = kzalloc_node(sizeof(*hctrl), GFP_KERNEL, > + dev_to_node(ctrl->dev)); > + if (!hctrl) > + return NULL; > + > nr_host_queues = ctrl->ops->ext_queues_available(ctrl); > max_lba_transfer = ctrl->max_hw_sectors >> (PAGE_SHIFT - 9); > > if (nr_host_queues == 0) { > dev_info(ctrl->dev, > "no support for mdev - no mdev reserved queues available"); > + kfree(hctrl); > return NULL; > } > > - hctrl = kzalloc_node(sizeof(*hctrl), GFP_KERNEL, > - dev_to_node(ctrl->dev)); > - if (!hctrl) > - return NULL; > > kref_init(&hctrl->ref); > mutex_init(&hctrl->lock); > @@ -180,6 +182,24 @@ void nvme_mdev_hctrl_hqs_unreserve(struct nvme_mdev_hctrl *hctrl, > mutex_unlock(&hctrl->lock); > } > > +/* Check if IO passthrough is supported for given IO optcode */ > +bool nvme_mdev_hctrl_hq_check_op(struct nvme_mdev_hctrl *hctrl, u8 optcode) > +{ > + switch (optcode) { > + case nvme_cmd_flush: > + case nvme_cmd_read: > + case nvme_cmd_write: > + /* these are mandatory*/ > + return true; > + case nvme_cmd_write_zeroes: > + return (hctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES); > + case nvme_cmd_dsm: > + return (hctrl->oncs & NVME_CTRL_ONCS_DSM); > + default: > + return false; > + } > +} > + > /* Allocate a host IO queue */ > int nvme_mdev_hctrl_hq_alloc(struct nvme_mdev_hctrl *hctrl) > { > @@ -204,23 +224,7 @@ bool nvme_mdev_hctrl_hq_can_submit(struct nvme_mdev_hctrl *hctrl, u16 qid) > return hctrl->nvme_ctrl->ops->ext_queue_full(hctrl->nvme_ctrl, qid); > } > > -/* Check if IO passthrough is supported for given IO optcode */ > -bool nvme_mdev_hctrl_hq_check_op(struct nvme_mdev_hctrl *hctrl, u8 optcode) > -{ > - switch (optcode) { > - case nvme_cmd_flush: > - case nvme_cmd_read: > - case nvme_cmd_write: > - /* these are mandatory*/ > - return true; > - case nvme_cmd_write_zeroes: > - return (hctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES); > - case nvme_cmd_dsm: > - return (hctrl->oncs & NVME_CTRL_ONCS_DSM); > - default: > - return false; > - } > -} > + > > /* Submit a IO passthrough command */ > int nvme_mdev_hctrl_hq_submit(struct nvme_mdev_hctrl *hctrl, > diff --git a/drivers/nvme/mdev/io.c b/drivers/nvme/mdev/io.c > index a731196d0365..59837540fec2 100644 > --- a/drivers/nvme/mdev/io.c > +++ b/drivers/nvme/mdev/io.c > @@ -11,14 +11,16 @@ > #include <linux/ktime.h> > #include "priv.h" > > + > struct io_ctx { > struct nvme_mdev_hctrl *hctrl; > struct nvme_mdev_vctrl *vctrl; > > const struct nvme_command *in; > - struct nvme_command out; > struct nvme_mdev_vns *ns; > struct nvme_ext_data_iter udatait; > + > + struct nvme_command out; > struct nvme_ext_data_iter *kdatait; > > ktime_t last_io_t; > @@ -28,6 +30,20 @@ struct io_ctx { > unsigned int arb_burst; > }; > > +/* Check if we need to read a command from the admin 
queue */ > +static bool nvme_mdev_adm_needs_processing(struct io_ctx *ctx) > +{ > + if (!timeout(ctx->last_admin_poll_time, > + ctx->vctrl->now, ctx->admin_poll_rate_ms)) > + return false; > + > + if (nvme_mdev_vsq_has_data(ctx->vctrl, &ctx->vctrl->vsqs[0])) > + return true; > + > + ctx->last_admin_poll_time = ctx->vctrl->now; > + return false; > +} > + > /* Handle read/write command.*/ > static int nvme_mdev_io_translate_rw(struct io_ctx *ctx) > { > @@ -229,6 +245,7 @@ static int nvme_mdev_io_translate_cmd(struct io_ctx *ctx) > } > } > > +/* process a user submission queue */ > static bool nvme_mdev_io_process_sq(struct io_ctx *ctx, u16 sqid) > { > struct nvme_vsq *vsq = &ctx->vctrl->vsqs[sqid]; > @@ -275,7 +292,13 @@ static bool nvme_mdev_io_process_sq(struct io_ctx *ctx, u16 sqid) > return true; > } > > -/* process host replies to the passed through commands */ > +/* process a user completion queue */ > +static void nvme_mdev_io_process_cq(struct io_ctx *ctx, u16 cqid) > +{ > + nvme_mdev_vcq_process(ctx->vctrl, cqid, true); > +} > + > +/* process hardware completion queue */ > static int nvme_mdev_io_process_hwq(struct io_ctx *ctx, u16 hwq) > { > int n, i; > @@ -301,22 +324,9 @@ static int nvme_mdev_io_process_hwq(struct io_ctx *ctx, u16 hwq) > return n; > } > > -/* Check if we need to read a command from the admin queue */ > -static bool nvme_mdev_adm_needs_processing(struct io_ctx *ctx) > -{ > - if (!timeout(ctx->last_admin_poll_time, > - ctx->vctrl->now, ctx->admin_poll_rate_ms)) > - return false; > - > - if (nvme_mdev_vsq_has_data(ctx->vctrl, &ctx->vctrl->vsqs[0])) > - return true; > - > - ctx->last_admin_poll_time = ctx->vctrl->now; > - return false; > -} > > /* do polling till one of events stops it */ > -static void nvme_mdev_io_maintask(struct io_ctx *ctx) > +static void nvme_mdev_io_polling_loop(struct io_ctx *ctx) > { > struct nvme_mdev_vctrl *vctrl = ctx->vctrl; > u16 i, cqid, sqid, hsqcnt; > @@ -353,7 +363,7 @@ static void nvme_mdev_io_maintask(struct io_ctx *ctx) > /* process the completions from the guest*/ > cqid = 1; > for_each_set_bit_from(cqid, vctrl->vcq_en, MAX_VIRTUAL_QUEUES) > - nvme_mdev_vcq_process(vctrl, cqid, true); > + nvme_mdev_io_process_cq(ctx, cqid); > > /* process the completions from the hardware*/ > for (i = 0 ; i < hsqcnt ; i++) > @@ -470,7 +480,7 @@ static int nvme_mdev_io_polling_thread(void *data) > if (kthread_should_stop()) > break; > > - nvme_mdev_io_maintask(&ctx); > + nvme_mdev_io_polling_loop(&ctx); > } > > _DBG(ctx.vctrl, "IO: iothread stopped\n"); > diff --git a/drivers/nvme/mdev/mmio.c b/drivers/nvme/mdev/mmio.c > index cf03c1f22f4c..a80962bf4a3d 100644 > --- a/drivers/nvme/mdev/mmio.c > +++ b/drivers/nvme/mdev/mmio.c > @@ -54,9 +54,6 @@ static const struct vm_operations_struct nvme_mdev_mmio_dbs_vm_ops = { > bool nvme_mdev_mmio_db_check(struct nvme_mdev_vctrl *vctrl, > u16 qid, u16 size, u16 db) > { > - if (get_current() != vctrl->iothread) > - lockdep_assert_held(&vctrl->lock); > - > if (db < size) > return true; > if (qid == 0) { >
On Thu, 2019-05-02 at 15:12 -0600, Heitke, Kenneth wrote: > > On 5/2/2019 5:47 AM, Maxim Levitsky wrote: > > Note that currently the number of hw queues reserved for mdev, > > has to be pre determined on module load. > > > > (I used to allocate the queues dynamicaly on demand, but > > recent changes to allocate polled/read queues made > > this somewhat difficult, so I dropped this for now) > > > > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com> > > --- > > drivers/nvme/host/pci.c | 375 ++++++++++++++++++++++++++++++++++++++- > > drivers/nvme/mdev/host.c | 46 ++--- > > drivers/nvme/mdev/io.c | 46 +++-- > > drivers/nvme/mdev/mmio.c | 3 - > > 4 files changed, 421 insertions(+), 49 deletions(-) > > > > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c > > index 282f28c851c1..87507e710374 100644 > > --- a/drivers/nvme/host/pci.c > > +++ b/drivers/nvme/host/pci.c > > @@ -23,6 +23,7 @@ > > #include <linux/io-64-nonatomic-lo-hi.h> > > #include <linux/sed-opal.h> > > #include <linux/pci-p2pdma.h> > > +#include "../mdev/mdev.h" > > > > #include "trace.h" > > #include "nvme.h" > > @@ -32,6 +33,7 @@ > > > > #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) > > > > +#define USE_SMALL_PRP_POOL(nprps) ((nprps) < (256 / 8)) > > /* > > * These can be higher, but we need to ensure that any command doesn't > > * require an sg allocation that needs more than a page of data. > > @@ -83,12 +85,24 @@ static int poll_queues = 0; > > module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644); > > MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); > > > > +static int mdev_queues; > > +#ifdef CONFIG_NVME_MDEV > > +module_param_cb(mdev_queues, &queue_count_ops, &mdev_queues, 0644); > > +MODULE_PARM_DESC(mdev_queues, "Number of queues to use for mediated VFIO"); > > +#endif > > + > > struct nvme_dev; > > struct nvme_queue; > > > > static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); > > static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode); > > > > +#ifdef CONFIG_NVME_MDEV > > +static void nvme_ext_queue_reset(struct nvme_dev *dev, u16 qid); > > +#else > > +static void nvme_ext_queue_reset(struct nvme_dev *dev, u16 qid) {} > > +#endif > > + > > /* > > * Represents an NVM Express device. Each nvme_dev is a PCI function. > > */ > > @@ -103,6 +117,7 @@ struct nvme_dev { > > unsigned online_queues; > > unsigned max_qid; > > unsigned io_queues[HCTX_MAX_TYPES]; > > + unsigned int mdev_queues; > > unsigned int num_vecs; > > int q_depth; > > u32 db_stride; > > @@ -110,6 +125,7 @@ struct nvme_dev { > > unsigned long bar_mapped_size; > > struct work_struct remove_work; > > struct mutex shutdown_lock; > > + struct mutex ext_dev_lock; > > bool subsystem; > > u64 cmb_size; > > bool cmb_use_sqes; > > @@ -172,6 +188,16 @@ static inline struct nvme_dev *to_nvme_dev(struct > > nvme_ctrl *ctrl) > > return container_of(ctrl, struct nvme_dev, ctrl); > > } > > > > +/* Simplified IO descriptor for MDEV use */ > > +struct nvme_ext_iod { > > + struct list_head link; > > + u32 user_tag; > > + int nprps; > > + struct nvme_ext_data_iter *saved_iter; > > + dma_addr_t first_prplist_dma; > > + __le64 *prpslists[NVME_MAX_SEGS]; > > +}; > > + > > /* > > * An NVM Express queue. Each device has at least two (one for admin > > * commands and one for I/O commands). 
> > @@ -196,15 +222,26 @@ struct nvme_queue { > > u16 qid; > > u8 cq_phase; > > unsigned long flags; > > + > > #define NVMEQ_ENABLED 0 > > #define NVMEQ_SQ_CMB 1 > > #define NVMEQ_DELETE_ERROR 2 > > #define NVMEQ_POLLED 3 > > +#define NVMEQ_EXTERNAL 4 > > + > > u32 *dbbuf_sq_db; > > u32 *dbbuf_cq_db; > > u32 *dbbuf_sq_ei; > > u32 *dbbuf_cq_ei; > > struct completion delete_done; > > + > > + /* queue passthrough for external use */ > > + struct { > > + int inflight; > > + struct nvme_ext_iod *iods; > > + struct list_head free_iods; > > + struct list_head used_iods; > > + } ext; > > }; > > > > /* > > @@ -255,7 +292,7 @@ static inline void _nvme_check_size(void) > > > > static unsigned int max_io_queues(void) > > { > > - return num_possible_cpus() + write_queues + poll_queues; > > + return num_possible_cpus() + write_queues + poll_queues + mdev_queues; > > } > > > > static unsigned int max_queue_count(void) > > @@ -1066,6 +1103,7 @@ static irqreturn_t nvme_irq(int irq, void *data) > > * the irq handler, even if that was on another CPU. > > */ > > rmb(); > > + > > if (nvmeq->cq_head != nvmeq->last_cq_head) > > ret = IRQ_HANDLED; > > nvme_process_cq(nvmeq, &start, &end, -1); > > @@ -1553,7 +1591,11 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, > > u16 qid) > > memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); > > nvme_dbbuf_init(dev, nvmeq, qid); > > dev->online_queues++; > > + > > wmb(); /* ensure the first interrupt sees the initialization */ > > + > > + if (test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)) > > + nvme_ext_queue_reset(nvmeq->dev, qid); > > } > > > > static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool > > polled) > > @@ -1759,7 +1801,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev) > > } > > > > max = min(dev->max_qid, dev->ctrl.queue_count - 1); > > - if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) { > > + if (max != 1) { > > rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] + > > dev->io_queues[HCTX_TYPE_READ]; > > } else { > > @@ -2095,14 +2137,23 @@ static int nvme_setup_irqs(struct nvme_dev *dev, > > unsigned int nr_io_queues) > > * Poll queues don't need interrupts, but we need at least one IO > > * queue left over for non-polled IO. 
> > */ > > - this_p_queues = poll_queues; > > + this_p_queues = poll_queues + mdev_queues; > > if (this_p_queues >= nr_io_queues) { > > this_p_queues = nr_io_queues - 1; > > irq_queues = 1; > > } else { > > irq_queues = nr_io_queues - this_p_queues + 1; > > } > > + > > + if (mdev_queues > this_p_queues) { > > + mdev_queues = this_p_queues; > > + this_p_queues = 0; > > + } else { > > + this_p_queues -= mdev_queues; > > + } > > + > > dev->io_queues[HCTX_TYPE_POLL] = this_p_queues; > > + dev->mdev_queues = mdev_queues; > > > > /* Initialize for the single interrupt case */ > > dev->io_queues[HCTX_TYPE_DEFAULT] = 1; > > @@ -2170,7 +2221,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) > > > > dev->num_vecs = result; > > result = max(result - 1, 1); > > - dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL]; > > + dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL] + > > + dev->mdev_queues; > > > > /* > > * Should investigate if there's a performance win from allocating > > @@ -2193,10 +2245,11 @@ static int nvme_setup_io_queues(struct nvme_dev > > *dev) > > nvme_suspend_io_queues(dev); > > goto retry; > > } > > - dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n", > > + dev_info(dev->ctrl.device, "%d/%d/%d/%d default/read/poll/mdev > > queues\n", > > dev->io_queues[HCTX_TYPE_DEFAULT], > > dev->io_queues[HCTX_TYPE_READ], > > - dev->io_queues[HCTX_TYPE_POLL]); > > + dev->io_queues[HCTX_TYPE_POLL], > > + dev->mdev_queues); > > return 0; > > } > > > > @@ -2623,6 +2676,301 @@ static void nvme_remove_dead_ctrl_work(struct > > work_struct *work) > > nvme_put_ctrl(&dev->ctrl); > > } > > > > +#ifdef CONFIG_NVME_MDEV > > +static void nvme_ext_free_iod(struct nvme_dev *dev, struct nvme_ext_iod > > *iod) > > +{ > > + int i = 0, max_prp, nprps = iod->nprps; > > + dma_addr_t dma = iod->first_prplist_dma; > > + > > + if (iod->saved_iter) { > > + iod->saved_iter->release(iod->saved_iter); > > + iod->saved_iter = NULL; > > + } > > + > > + if (--nprps < 2) { > > + goto out; > > + } else if (USE_SMALL_PRP_POOL(nprps)) { > > + dma_pool_free(dev->prp_small_pool, iod->prpslists[0], dma); > > + goto out; > > + } > > + > > + max_prp = (dev->ctrl.page_size >> 3) - 1; > > + while (nprps > 0) { > > + if (i > 0) { > > + dma = iod->prpslists[i - 1][max_prp]; > > + if (nprps == 1) > > + break; > > + } > > + dma_pool_free(dev->prp_page_pool, iod->prpslists[i++], dma); > > + nprps -= max_prp; > > + } > > +out: > > + iod->nprps = -1; > > + iod->first_prplist_dma = 0; > > + iod->user_tag = 0xDEADDEAD; > > +} > > + > > +static int nvme_ext_setup_iod(struct nvme_dev *dev, struct nvme_ext_iod > > *iod, > > + struct nvme_common_command *cmd, > > + struct nvme_ext_data_iter *iter) > > +{ > > + int ret, i, j; > > + __le64 *prp_list; > > + dma_addr_t prp_dma; > > + struct dma_pool *pool; > > + int max_prp = (dev->ctrl.page_size >> 3) - 1; > > + > > + iod->saved_iter = iter && iter->release ? iter : NULL; > > + iod->nprps = iter ? iter->count : 0; > > + cmd->dptr.prp1 = 0; > > + cmd->dptr.prp2 = 0; > > + cmd->metadata = 0; > > + > > + if (!iter) > > + return 0; > > + > > + /* put first pointer*/ > > + cmd->dptr.prp1 = cpu_to_le64(iter->host_iova); > > + if (iter->count == 1) > > + return 0; > > + > > + ret = iter->next(iter); > > + if (ret) > > + goto error; > > + > > + /* if only have one more pointer, put it to second data pointer*/ > > + if (iter->count == 1) { > > + cmd->dptr.prp2 = cpu_to_le64(iter->host_iova); > > + return 0; > > + } > > + > > + pool = USE_SMALL_PRP_POOL(iter->count) ? 
dev->prp_small_pool : > > + dev->prp_page_pool; > > + > > + /* Allocate prp lists as needed and fill them */ > > + for (i = 0 ; i < NVME_MAX_SEGS && iter->count ; i++) { > > + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); > > + if (!prp_list) { > > + ret = -ENOMEM; > > + goto error; > > + } > > + > > + iod->prpslists[i++] = prp_list; > > + > > + if (i == 1) { > > + iod->first_prplist_dma = prp_dma; > > + cmd->dptr.prp2 = cpu_to_le64(prp_dma); > > + j = 0; > > + } else { > > + prp_list[0] = iod->prpslists[i - 1][max_prp]; > > + iod->prpslists[i - 1][max_prp] = prp_dma; > > + j = 1; > > + } > > + > > + while (j <= max_prp && iter->count) { > > + prp_list[j++] = iter->host_iova; > > + ret = iter->next(iter); > > + if (ret) > > + goto error; > > + } > > + } > > + > > + if (iter->count) { > > + ret = -ENOSPC; > > + goto error; > > + } > > + return 0; > > +error: > > + iod->nprps -= iter->count; > > + nvme_ext_free_iod(dev, iod); > > + return ret; > > +} > > + > > +static int nvme_ext_queues_available(struct nvme_ctrl *ctrl) > > +{ > > + struct nvme_dev *dev = to_nvme_dev(ctrl); > > + unsigned int ret = 0, qid; > > + unsigned int first_mdev_q = dev->online_queues - dev->mdev_queues; > > + > > + for (qid = first_mdev_q; qid < dev->online_queues; qid++) { > > + struct nvme_queue *nvmeq = &dev->queues[qid]; > > + > > + if (!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)) > > + ret++; > > + } > > + return ret; > > +} > > + > > +static void nvme_ext_queue_reset(struct nvme_dev *dev, u16 qid) > > +{ > > + struct nvme_queue *nvmeq = &dev->queues[qid]; > > + struct nvme_ext_iod *iod, *tmp; > > + > > + list_for_each_entry_safe(iod, tmp, &nvmeq->ext.used_iods, link) { > > + if (iod->saved_iter && iod->saved_iter->release) { > > + iod->saved_iter->release(iod->saved_iter); > > + iod->saved_iter = NULL; > > + list_move(&iod->link, &nvmeq->ext.free_iods); > > + } > > + } > > + > > + nvmeq->ext.inflight = 0; > > +} > > + > > +static int nvme_ext_queue_alloc(struct nvme_ctrl *ctrl, u16 *ret_qid) > > +{ > > + struct nvme_dev *dev = to_nvme_dev(ctrl); > > + struct nvme_queue *nvmeq; > > + int ret = 0, qid, i; > > + unsigned int first_mdev_q = dev->online_queues - dev->mdev_queues; > > + > > + mutex_lock(&dev->ext_dev_lock); > > + > > + /* find a polled queue to allocate */ > > + for (qid = dev->online_queues - 1 ; qid >= first_mdev_q ; qid--) { > > + nvmeq = &dev->queues[qid]; > > + if (!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)) > > + break; > > + } > > + > > + if (qid < first_mdev_q) { > > + ret = -ENOSPC; > > + goto out; > > + } > > + > > + INIT_LIST_HEAD(&nvmeq->ext.free_iods); > > + INIT_LIST_HEAD(&nvmeq->ext.used_iods); > > + > > + nvmeq->ext.iods = > > + vzalloc_node(sizeof(struct nvme_ext_iod) * nvmeq->q_depth, > > + dev_to_node(dev->dev)); > > + > > + if (!nvmeq->ext.iods) { > > + ret = -ENOMEM; > > + goto out; > > + } > > + > > + for (i = 0 ; i < nvmeq->q_depth ; i++) > > + list_add_tail(&nvmeq->ext.iods[i].link, &nvmeq->ext.free_iods); > > + > > + set_bit(NVMEQ_EXTERNAL, &nvmeq->flags); > > + *ret_qid = qid; > > +out: > > + mutex_unlock(&dev->ext_dev_lock); > > + return ret; > > +} > > + > > +static void nvme_ext_queue_free(struct nvme_ctrl *ctrl, u16 qid) > > +{ > > + struct nvme_dev *dev = to_nvme_dev(ctrl); > > + struct nvme_queue *nvmeq; > > + > > + mutex_lock(&dev->ext_dev_lock); > > + nvmeq = &dev->queues[qid]; > > + > > + if (WARN_ON(!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags))) > > + return; > > This condition is probably not expected to happen (since its a warning) > but do you need to unlock 
the ext_dev_lock before returning? This is true, I will fix this. This used to be BUG_ON, but due to checkpatch.pl complains I turned them all to WARN_ON, and missed this. Thanks, Best regards, Maxim Levitsky > > > + > > + nvme_ext_queue_reset(dev, qid); > > + > > + vfree(nvmeq->ext.iods); > > + nvmeq->ext.iods = NULL; > > + INIT_LIST_HEAD(&nvmeq->ext.free_iods); > > + INIT_LIST_HEAD(&nvmeq->ext.used_iods); > > + > > + clear_bit(NVMEQ_EXTERNAL, &nvmeq->flags); > > + mutex_unlock(&dev->ext_dev_lock); > > +} > > + > > +static int nvme_ext_queue_submit(struct nvme_ctrl *ctrl, u16 qid, u32 > > user_tag, > > + struct nvme_command *command, > > + struct nvme_ext_data_iter *iter) > > +{ > > + struct nvme_dev *dev = to_nvme_dev(ctrl); > > + struct nvme_queue *nvmeq = &dev->queues[qid]; > > + struct nvme_ext_iod *iod; > > + int ret; > > + > > + if (WARN_ON(!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags))) > > + return -EINVAL; > > + > > + if (list_empty(&nvmeq->ext.free_iods)) > > + return -1; > > + > > + iod = list_first_entry(&nvmeq->ext.free_iods, > > + struct nvme_ext_iod, link); > > + > > + list_move(&iod->link, &nvmeq->ext.used_iods); > > + > > + command->common.command_id = cpu_to_le16(iod - nvmeq->ext.iods); > > + iod->user_tag = user_tag; > > + > > + ret = nvme_ext_setup_iod(dev, iod, &command->common, iter); > > + if (ret) { > > + list_move(&iod->link, &nvmeq->ext.free_iods); > > + return ret; > > + } > > + > > + nvmeq->ext.inflight++; > > + nvme_submit_cmd(nvmeq, command, true); > > + return 0; > > +} > > + > > +static int nvme_ext_queue_poll(struct nvme_ctrl *ctrl, u16 qid, > > + struct nvme_ext_cmd_result *results, > > + unsigned int max_len) > > +{ > > + struct nvme_dev *dev = to_nvme_dev(ctrl); > > + struct nvme_queue *nvmeq = &dev->queues[qid]; > > + u16 old_head; > > + int i, j; > > + > > + if (WARN_ON(!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags))) > > + return -EINVAL; > > + > > + if (nvmeq->ext.inflight == 0) > > + return -1; > > + > > + old_head = nvmeq->cq_head; > > + > > + for (i = 0 ; nvme_cqe_pending(nvmeq) && i < max_len ; i++) { > > + u16 status = le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status); > > + u16 tag = le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].command_id); > > + > > + results[i].status = status >> 1; > > + results[i].tag = (u32)tag; > > + nvme_update_cq_head(nvmeq); > > + } > > + > > + if (old_head != nvmeq->cq_head) > > + nvme_ring_cq_doorbell(nvmeq); > > + > > + for (j = 0 ; j < i ; j++) { > > + u16 tag = results[j].tag & 0xFFFF; > > + struct nvme_ext_iod *iod = &nvmeq->ext.iods[tag]; > > + > > + if (WARN_ON(tag >= nvmeq->q_depth || iod->nprps == -1)) > > + continue; > > + > > + results[j].tag = iod->user_tag; > > + nvme_ext_free_iod(dev, iod); > > + list_move(&iod->link, &nvmeq->ext.free_iods); > > + nvmeq->ext.inflight--; > > + } > > + > > + WARN_ON(nvmeq->ext.inflight < 0); > > + return i; > > +} > > + > > +static bool nvme_ext_queue_full(struct nvme_ctrl *ctrl, u16 qid) > > +{ > > + struct nvme_dev *dev = to_nvme_dev(ctrl); > > + struct nvme_queue *nvmeq = &dev->queues[qid]; > > + > > + return nvmeq->ext.inflight < nvmeq->q_depth - 1; > > +} > > +#endif > > + > > static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) > > { > > *val = readl(to_nvme_dev(ctrl)->bar + off); > > @@ -2652,13 +3000,25 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops > > = { > > .name = "pcie", > > .module = THIS_MODULE, > > .flags = NVME_F_METADATA_SUPPORTED | > > - NVME_F_PCI_P2PDMA, > > + NVME_F_PCI_P2PDMA | > > + NVME_F_MDEV_SUPPORTED | > > + 
NVME_F_MDEV_DMA_SUPPORTED, > > + > > .reg_read32 = nvme_pci_reg_read32, > > .reg_write32 = nvme_pci_reg_write32, > > .reg_read64 = nvme_pci_reg_read64, > > .free_ctrl = nvme_pci_free_ctrl, > > .submit_async_event = nvme_pci_submit_async_event, > > .get_address = nvme_pci_get_address, > > + > > +#ifdef CONFIG_NVME_MDEV > > + .ext_queues_available = nvme_ext_queues_available, > > + .ext_queue_alloc = nvme_ext_queue_alloc, > > + .ext_queue_free = nvme_ext_queue_free, > > + .ext_queue_submit = nvme_ext_queue_submit, > > + .ext_queue_poll = nvme_ext_queue_poll, > > + .ext_queue_full = nvme_ext_queue_full, > > +#endif > > }; > > > > static int nvme_dev_map(struct nvme_dev *dev) > > @@ -2747,6 +3107,7 @@ static int nvme_probe(struct pci_dev *pdev, const > > struct pci_device_id *id) > > INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); > > INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); > > mutex_init(&dev->shutdown_lock); > > + mutex_init(&dev->ext_dev_lock); > > > > result = nvme_setup_prp_pools(dev); > > if (result) > > diff --git a/drivers/nvme/mdev/host.c b/drivers/nvme/mdev/host.c > > index 5766bad7e909..6590946b86c2 100644 > > --- a/drivers/nvme/mdev/host.c > > +++ b/drivers/nvme/mdev/host.c > > @@ -48,19 +48,21 @@ static struct nvme_mdev_hctrl > > *nvme_mdev_hctrl_create(struct nvme_ctrl *ctrl) > > return NULL; > > } > > > > + hctrl = kzalloc_node(sizeof(*hctrl), GFP_KERNEL, > > + dev_to_node(ctrl->dev)); > > + if (!hctrl) > > + return NULL; > > + > > nr_host_queues = ctrl->ops->ext_queues_available(ctrl); > > max_lba_transfer = ctrl->max_hw_sectors >> (PAGE_SHIFT - 9); > > > > if (nr_host_queues == 0) { > > dev_info(ctrl->dev, > > "no support for mdev - no mdev reserved queues > > available"); > > + kfree(hctrl); > > return NULL; > > } > > > > - hctrl = kzalloc_node(sizeof(*hctrl), GFP_KERNEL, > > - dev_to_node(ctrl->dev)); > > - if (!hctrl) > > - return NULL; > > > > kref_init(&hctrl->ref); > > mutex_init(&hctrl->lock); > > @@ -180,6 +182,24 @@ void nvme_mdev_hctrl_hqs_unreserve(struct > > nvme_mdev_hctrl *hctrl, > > mutex_unlock(&hctrl->lock); > > } > > > > +/* Check if IO passthrough is supported for given IO optcode */ > > +bool nvme_mdev_hctrl_hq_check_op(struct nvme_mdev_hctrl *hctrl, u8 optcode) > > +{ > > + switch (optcode) { > > + case nvme_cmd_flush: > > + case nvme_cmd_read: > > + case nvme_cmd_write: > > + /* these are mandatory*/ > > + return true; > > + case nvme_cmd_write_zeroes: > > + return (hctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES); > > + case nvme_cmd_dsm: > > + return (hctrl->oncs & NVME_CTRL_ONCS_DSM); > > + default: > > + return false; > > + } > > +} > > + > > /* Allocate a host IO queue */ > > int nvme_mdev_hctrl_hq_alloc(struct nvme_mdev_hctrl *hctrl) > > { > > @@ -204,23 +224,7 @@ bool nvme_mdev_hctrl_hq_can_submit(struct > > nvme_mdev_hctrl *hctrl, u16 qid) > > return hctrl->nvme_ctrl->ops->ext_queue_full(hctrl->nvme_ctrl, qid); > > } > > > > -/* Check if IO passthrough is supported for given IO optcode */ > > -bool nvme_mdev_hctrl_hq_check_op(struct nvme_mdev_hctrl *hctrl, u8 optcode) > > -{ > > - switch (optcode) { > > - case nvme_cmd_flush: > > - case nvme_cmd_read: > > - case nvme_cmd_write: > > - /* these are mandatory*/ > > - return true; > > - case nvme_cmd_write_zeroes: > > - return (hctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES); > > - case nvme_cmd_dsm: > > - return (hctrl->oncs & NVME_CTRL_ONCS_DSM); > > - default: > > - return false; > > - } > > -} > > + > > > > /* Submit a IO passthrough command */ > > int nvme_mdev_hctrl_hq_submit(struct 
nvme_mdev_hctrl *hctrl, > > diff --git a/drivers/nvme/mdev/io.c b/drivers/nvme/mdev/io.c > > index a731196d0365..59837540fec2 100644 > > --- a/drivers/nvme/mdev/io.c > > +++ b/drivers/nvme/mdev/io.c > > @@ -11,14 +11,16 @@ > > #include <linux/ktime.h> > > #include "priv.h" > > > > + > > struct io_ctx { > > struct nvme_mdev_hctrl *hctrl; > > struct nvme_mdev_vctrl *vctrl; > > > > const struct nvme_command *in; > > - struct nvme_command out; > > struct nvme_mdev_vns *ns; > > struct nvme_ext_data_iter udatait; > > + > > + struct nvme_command out; > > struct nvme_ext_data_iter *kdatait; > > > > ktime_t last_io_t; > > @@ -28,6 +30,20 @@ struct io_ctx { > > unsigned int arb_burst; > > }; > > > > +/* Check if we need to read a command from the admin queue */ > > +static bool nvme_mdev_adm_needs_processing(struct io_ctx *ctx) > > +{ > > + if (!timeout(ctx->last_admin_poll_time, > > + ctx->vctrl->now, ctx->admin_poll_rate_ms)) > > + return false; > > + > > + if (nvme_mdev_vsq_has_data(ctx->vctrl, &ctx->vctrl->vsqs[0])) > > + return true; > > + > > + ctx->last_admin_poll_time = ctx->vctrl->now; > > + return false; > > +} > > + > > /* Handle read/write command.*/ > > static int nvme_mdev_io_translate_rw(struct io_ctx *ctx) > > { > > @@ -229,6 +245,7 @@ static int nvme_mdev_io_translate_cmd(struct io_ctx > > *ctx) > > } > > } > > > > +/* process a user submission queue */ > > static bool nvme_mdev_io_process_sq(struct io_ctx *ctx, u16 sqid) > > { > > struct nvme_vsq *vsq = &ctx->vctrl->vsqs[sqid]; > > @@ -275,7 +292,13 @@ static bool nvme_mdev_io_process_sq(struct io_ctx *ctx, > > u16 sqid) > > return true; > > } > > > > -/* process host replies to the passed through commands */ > > +/* process a user completion queue */ > > +static void nvme_mdev_io_process_cq(struct io_ctx *ctx, u16 cqid) > > +{ > > + nvme_mdev_vcq_process(ctx->vctrl, cqid, true); > > +} > > + > > +/* process hardware completion queue */ > > static int nvme_mdev_io_process_hwq(struct io_ctx *ctx, u16 hwq) > > { > > int n, i; > > @@ -301,22 +324,9 @@ static int nvme_mdev_io_process_hwq(struct io_ctx *ctx, > > u16 hwq) > > return n; > > } > > > > -/* Check if we need to read a command from the admin queue */ > > -static bool nvme_mdev_adm_needs_processing(struct io_ctx *ctx) > > -{ > > - if (!timeout(ctx->last_admin_poll_time, > > - ctx->vctrl->now, ctx->admin_poll_rate_ms)) > > - return false; > > - > > - if (nvme_mdev_vsq_has_data(ctx->vctrl, &ctx->vctrl->vsqs[0])) > > - return true; > > - > > - ctx->last_admin_poll_time = ctx->vctrl->now; > > - return false; > > -} > > > > /* do polling till one of events stops it */ > > -static void nvme_mdev_io_maintask(struct io_ctx *ctx) > > +static void nvme_mdev_io_polling_loop(struct io_ctx *ctx) > > { > > struct nvme_mdev_vctrl *vctrl = ctx->vctrl; > > u16 i, cqid, sqid, hsqcnt; > > @@ -353,7 +363,7 @@ static void nvme_mdev_io_maintask(struct io_ctx *ctx) > > /* process the completions from the guest*/ > > cqid = 1; > > for_each_set_bit_from(cqid, vctrl->vcq_en, MAX_VIRTUAL_QUEUES) > > - nvme_mdev_vcq_process(vctrl, cqid, true); > > + nvme_mdev_io_process_cq(ctx, cqid); > > > > /* process the completions from the hardware*/ > > for (i = 0 ; i < hsqcnt ; i++) > > @@ -470,7 +480,7 @@ static int nvme_mdev_io_polling_thread(void *data) > > if (kthread_should_stop()) > > break; > > > > - nvme_mdev_io_maintask(&ctx); > > + nvme_mdev_io_polling_loop(&ctx); > > } > > > > _DBG(ctx.vctrl, "IO: iothread stopped\n"); > > diff --git a/drivers/nvme/mdev/mmio.c b/drivers/nvme/mdev/mmio.c > > index 
cf03c1f22f4c..a80962bf4a3d 100644 > > --- a/drivers/nvme/mdev/mmio.c > > +++ b/drivers/nvme/mdev/mmio.c > > @@ -54,9 +54,6 @@ static const struct vm_operations_struct > > nvme_mdev_mmio_dbs_vm_ops = { > > bool nvme_mdev_mmio_db_check(struct nvme_mdev_vctrl *vctrl, > > u16 qid, u16 size, u16 db) > > { > > - if (get_current() != vctrl->iothread) > > - lockdep_assert_held(&vctrl->lock); > > - > > if (db < size) > > return true; > > if (qid == 0) { > >
On Fri, May 03, 2019 at 12:20:17AM +0300, Maxim Levitsky wrote:
> On Thu, 2019-05-02 at 15:12 -0600, Heitke, Kenneth wrote:
> > On 5/2/2019 5:47 AM, Maxim Levitsky wrote:
> > > +static void nvme_ext_queue_free(struct nvme_ctrl *ctrl, u16 qid)
> > > +{
> > > +	struct nvme_dev *dev = to_nvme_dev(ctrl);
> > > +	struct nvme_queue *nvmeq;
> > > +
> > > +	mutex_lock(&dev->ext_dev_lock);
> > > +	nvmeq = &dev->queues[qid];
> > > +
> > > +	if (WARN_ON(!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)))
> > > +		return;
> >
> > This condition is probably not expected to happen (since its a warning)
> > but do you need to unlock the ext_dev_lock before returning?
>
> This is true, I will fix this. This used to be BUG_ON, but due to checkpatch.pl
> complains I turned them all to WARN_ON, and missed this.

Gentle reminder to trim your replies to the relevant context. It's
much easier to read when we don't need to scroll through hundreds of
unnecessary lines.
On Fri, 2019-05-03 at 06:09 -0600, Keith Busch wrote:
> On Fri, May 03, 2019 at 12:20:17AM +0300, Maxim Levitsky wrote:
> > On Thu, 2019-05-02 at 15:12 -0600, Heitke, Kenneth wrote:
> > > On 5/2/2019 5:47 AM, Maxim Levitsky wrote:
> > > > +static void nvme_ext_queue_free(struct nvme_ctrl *ctrl, u16 qid)
> > > > +{
> > > > +	struct nvme_dev *dev = to_nvme_dev(ctrl);
> > > > +	struct nvme_queue *nvmeq;
> > > > +
> > > > +	mutex_lock(&dev->ext_dev_lock);
> > > > +	nvmeq = &dev->queues[qid];
> > > > +
> > > > +	if (WARN_ON(!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)))
> > > > +		return;
> > >
> > > This condition is probably not expected to happen (since its a warning)
> > > but do you need to unlock the ext_dev_lock before returning?
> >
> > This is true, I will fix this. This used to be BUG_ON, but due to
> > checkpatch.pl
> > complains I turned them all to WARN_ON, and missed this.
>
> Gentle reminder to trim your replies to the relevant context. It's
> much easier to read when we don't need to scroll through hundreds of
> unnecessary lines.

I fully agree, sorry! Next time I will do this.

Best regards,
	Maxim Levitsky
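[Editorial note] The fix Maxim refers to above is the early return in nvme_ext_queue_free() that leaves ext_dev_lock held when the WARN_ON() fires. The snippet below is only a sketch of the usual kernel pattern for this (route the bail-out through a common unlock label); it is based solely on the function as quoted in this thread and is not the actual follow-up patch.

/* Sketch only: same function as in the RFC, but the WARN_ON() path
 * now drops ext_dev_lock instead of returning with it held.
 */
static void nvme_ext_queue_free(struct nvme_ctrl *ctrl, u16 qid)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);
	struct nvme_queue *nvmeq;

	mutex_lock(&dev->ext_dev_lock);
	nvmeq = &dev->queues[qid];

	if (WARN_ON(!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)))
		goto out_unlock;	/* bail out through the unlock path */

	nvme_ext_queue_reset(dev, qid);

	vfree(nvmeq->ext.iods);
	nvmeq->ext.iods = NULL;
	INIT_LIST_HEAD(&nvmeq->ext.free_iods);
	INIT_LIST_HEAD(&nvmeq->ext.used_iods);

	clear_bit(NVMEQ_EXTERNAL, &nvmeq->flags);
out_unlock:
	mutex_unlock(&dev->ext_dev_lock);
}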
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 282f28c851c1..87507e710374 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -23,6 +23,7 @@ #include <linux/io-64-nonatomic-lo-hi.h> #include <linux/sed-opal.h> #include <linux/pci-p2pdma.h> +#include "../mdev/mdev.h" #include "trace.h" #include "nvme.h" @@ -32,6 +33,7 @@ #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) +#define USE_SMALL_PRP_POOL(nprps) ((nprps) < (256 / 8)) /* * These can be higher, but we need to ensure that any command doesn't * require an sg allocation that needs more than a page of data. @@ -83,12 +85,24 @@ static int poll_queues = 0; module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644); MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); +static int mdev_queues; +#ifdef CONFIG_NVME_MDEV +module_param_cb(mdev_queues, &queue_count_ops, &mdev_queues, 0644); +MODULE_PARM_DESC(mdev_queues, "Number of queues to use for mediated VFIO"); +#endif + struct nvme_dev; struct nvme_queue; static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode); +#ifdef CONFIG_NVME_MDEV +static void nvme_ext_queue_reset(struct nvme_dev *dev, u16 qid); +#else +static void nvme_ext_queue_reset(struct nvme_dev *dev, u16 qid) {} +#endif + /* * Represents an NVM Express device. Each nvme_dev is a PCI function. */ @@ -103,6 +117,7 @@ struct nvme_dev { unsigned online_queues; unsigned max_qid; unsigned io_queues[HCTX_MAX_TYPES]; + unsigned int mdev_queues; unsigned int num_vecs; int q_depth; u32 db_stride; @@ -110,6 +125,7 @@ struct nvme_dev { unsigned long bar_mapped_size; struct work_struct remove_work; struct mutex shutdown_lock; + struct mutex ext_dev_lock; bool subsystem; u64 cmb_size; bool cmb_use_sqes; @@ -172,6 +188,16 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) return container_of(ctrl, struct nvme_dev, ctrl); } +/* Simplified IO descriptor for MDEV use */ +struct nvme_ext_iod { + struct list_head link; + u32 user_tag; + int nprps; + struct nvme_ext_data_iter *saved_iter; + dma_addr_t first_prplist_dma; + __le64 *prpslists[NVME_MAX_SEGS]; +}; + /* * An NVM Express queue. Each device has at least two (one for admin * commands and one for I/O commands). @@ -196,15 +222,26 @@ struct nvme_queue { u16 qid; u8 cq_phase; unsigned long flags; + #define NVMEQ_ENABLED 0 #define NVMEQ_SQ_CMB 1 #define NVMEQ_DELETE_ERROR 2 #define NVMEQ_POLLED 3 +#define NVMEQ_EXTERNAL 4 + u32 *dbbuf_sq_db; u32 *dbbuf_cq_db; u32 *dbbuf_sq_ei; u32 *dbbuf_cq_ei; struct completion delete_done; + + /* queue passthrough for external use */ + struct { + int inflight; + struct nvme_ext_iod *iods; + struct list_head free_iods; + struct list_head used_iods; + } ext; }; /* @@ -255,7 +292,7 @@ static inline void _nvme_check_size(void) static unsigned int max_io_queues(void) { - return num_possible_cpus() + write_queues + poll_queues; + return num_possible_cpus() + write_queues + poll_queues + mdev_queues; } static unsigned int max_queue_count(void) @@ -1066,6 +1103,7 @@ static irqreturn_t nvme_irq(int irq, void *data) * the irq handler, even if that was on another CPU. 
*/ rmb(); + if (nvmeq->cq_head != nvmeq->last_cq_head) ret = IRQ_HANDLED; nvme_process_cq(nvmeq, &start, &end, -1); @@ -1553,7 +1591,11 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); nvme_dbbuf_init(dev, nvmeq, qid); dev->online_queues++; + wmb(); /* ensure the first interrupt sees the initialization */ + + if (test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)) + nvme_ext_queue_reset(nvmeq->dev, qid); } static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) @@ -1759,7 +1801,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev) } max = min(dev->max_qid, dev->ctrl.queue_count - 1); - if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) { + if (max != 1) { rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] + dev->io_queues[HCTX_TYPE_READ]; } else { @@ -2095,14 +2137,23 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) * Poll queues don't need interrupts, but we need at least one IO * queue left over for non-polled IO. */ - this_p_queues = poll_queues; + this_p_queues = poll_queues + mdev_queues; if (this_p_queues >= nr_io_queues) { this_p_queues = nr_io_queues - 1; irq_queues = 1; } else { irq_queues = nr_io_queues - this_p_queues + 1; } + + if (mdev_queues > this_p_queues) { + mdev_queues = this_p_queues; + this_p_queues = 0; + } else { + this_p_queues -= mdev_queues; + } + dev->io_queues[HCTX_TYPE_POLL] = this_p_queues; + dev->mdev_queues = mdev_queues; /* Initialize for the single interrupt case */ dev->io_queues[HCTX_TYPE_DEFAULT] = 1; @@ -2170,7 +2221,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev->num_vecs = result; result = max(result - 1, 1); - dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL]; + dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL] + + dev->mdev_queues; /* * Should investigate if there's a performance win from allocating @@ -2193,10 +2245,11 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) nvme_suspend_io_queues(dev); goto retry; } - dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n", + dev_info(dev->ctrl.device, "%d/%d/%d/%d default/read/poll/mdev queues\n", dev->io_queues[HCTX_TYPE_DEFAULT], dev->io_queues[HCTX_TYPE_READ], - dev->io_queues[HCTX_TYPE_POLL]); + dev->io_queues[HCTX_TYPE_POLL], + dev->mdev_queues); return 0; } @@ -2623,6 +2676,301 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) nvme_put_ctrl(&dev->ctrl); } +#ifdef CONFIG_NVME_MDEV +static void nvme_ext_free_iod(struct nvme_dev *dev, struct nvme_ext_iod *iod) +{ + int i = 0, max_prp, nprps = iod->nprps; + dma_addr_t dma = iod->first_prplist_dma; + + if (iod->saved_iter) { + iod->saved_iter->release(iod->saved_iter); + iod->saved_iter = NULL; + } + + if (--nprps < 2) { + goto out; + } else if (USE_SMALL_PRP_POOL(nprps)) { + dma_pool_free(dev->prp_small_pool, iod->prpslists[0], dma); + goto out; + } + + max_prp = (dev->ctrl.page_size >> 3) - 1; + while (nprps > 0) { + if (i > 0) { + dma = iod->prpslists[i - 1][max_prp]; + if (nprps == 1) + break; + } + dma_pool_free(dev->prp_page_pool, iod->prpslists[i++], dma); + nprps -= max_prp; + } +out: + iod->nprps = -1; + iod->first_prplist_dma = 0; + iod->user_tag = 0xDEADDEAD; +} + +static int nvme_ext_setup_iod(struct nvme_dev *dev, struct nvme_ext_iod *iod, + struct nvme_common_command *cmd, + struct nvme_ext_data_iter *iter) +{ + int ret, i, j; + __le64 *prp_list; + dma_addr_t prp_dma; + struct dma_pool *pool; + int max_prp = (dev->ctrl.page_size >> 3) - 1; + + iod->saved_iter = iter && 
iter->release ? iter : NULL; + iod->nprps = iter ? iter->count : 0; + cmd->dptr.prp1 = 0; + cmd->dptr.prp2 = 0; + cmd->metadata = 0; + + if (!iter) + return 0; + + /* put first pointer*/ + cmd->dptr.prp1 = cpu_to_le64(iter->host_iova); + if (iter->count == 1) + return 0; + + ret = iter->next(iter); + if (ret) + goto error; + + /* if only have one more pointer, put it to second data pointer*/ + if (iter->count == 1) { + cmd->dptr.prp2 = cpu_to_le64(iter->host_iova); + return 0; + } + + pool = USE_SMALL_PRP_POOL(iter->count) ? dev->prp_small_pool : + dev->prp_page_pool; + + /* Allocate prp lists as needed and fill them */ + for (i = 0 ; i < NVME_MAX_SEGS && iter->count ; i++) { + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); + if (!prp_list) { + ret = -ENOMEM; + goto error; + } + + iod->prpslists[i++] = prp_list; + + if (i == 1) { + iod->first_prplist_dma = prp_dma; + cmd->dptr.prp2 = cpu_to_le64(prp_dma); + j = 0; + } else { + prp_list[0] = iod->prpslists[i - 1][max_prp]; + iod->prpslists[i - 1][max_prp] = prp_dma; + j = 1; + } + + while (j <= max_prp && iter->count) { + prp_list[j++] = iter->host_iova; + ret = iter->next(iter); + if (ret) + goto error; + } + } + + if (iter->count) { + ret = -ENOSPC; + goto error; + } + return 0; +error: + iod->nprps -= iter->count; + nvme_ext_free_iod(dev, iod); + return ret; +} + +static int nvme_ext_queues_available(struct nvme_ctrl *ctrl) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + unsigned int ret = 0, qid; + unsigned int first_mdev_q = dev->online_queues - dev->mdev_queues; + + for (qid = first_mdev_q; qid < dev->online_queues; qid++) { + struct nvme_queue *nvmeq = &dev->queues[qid]; + + if (!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)) + ret++; + } + return ret; +} + +static void nvme_ext_queue_reset(struct nvme_dev *dev, u16 qid) +{ + struct nvme_queue *nvmeq = &dev->queues[qid]; + struct nvme_ext_iod *iod, *tmp; + + list_for_each_entry_safe(iod, tmp, &nvmeq->ext.used_iods, link) { + if (iod->saved_iter && iod->saved_iter->release) { + iod->saved_iter->release(iod->saved_iter); + iod->saved_iter = NULL; + list_move(&iod->link, &nvmeq->ext.free_iods); + } + } + + nvmeq->ext.inflight = 0; +} + +static int nvme_ext_queue_alloc(struct nvme_ctrl *ctrl, u16 *ret_qid) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + struct nvme_queue *nvmeq; + int ret = 0, qid, i; + unsigned int first_mdev_q = dev->online_queues - dev->mdev_queues; + + mutex_lock(&dev->ext_dev_lock); + + /* find a polled queue to allocate */ + for (qid = dev->online_queues - 1 ; qid >= first_mdev_q ; qid--) { + nvmeq = &dev->queues[qid]; + if (!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)) + break; + } + + if (qid < first_mdev_q) { + ret = -ENOSPC; + goto out; + } + + INIT_LIST_HEAD(&nvmeq->ext.free_iods); + INIT_LIST_HEAD(&nvmeq->ext.used_iods); + + nvmeq->ext.iods = + vzalloc_node(sizeof(struct nvme_ext_iod) * nvmeq->q_depth, + dev_to_node(dev->dev)); + + if (!nvmeq->ext.iods) { + ret = -ENOMEM; + goto out; + } + + for (i = 0 ; i < nvmeq->q_depth ; i++) + list_add_tail(&nvmeq->ext.iods[i].link, &nvmeq->ext.free_iods); + + set_bit(NVMEQ_EXTERNAL, &nvmeq->flags); + *ret_qid = qid; +out: + mutex_unlock(&dev->ext_dev_lock); + return ret; +} + +static void nvme_ext_queue_free(struct nvme_ctrl *ctrl, u16 qid) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + struct nvme_queue *nvmeq; + + mutex_lock(&dev->ext_dev_lock); + nvmeq = &dev->queues[qid]; + + if (WARN_ON(!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags))) + return; + + nvme_ext_queue_reset(dev, qid); + + vfree(nvmeq->ext.iods); + 
+	nvmeq->ext.iods = NULL;
+	INIT_LIST_HEAD(&nvmeq->ext.free_iods);
+	INIT_LIST_HEAD(&nvmeq->ext.used_iods);
+
+	clear_bit(NVMEQ_EXTERNAL, &nvmeq->flags);
+	mutex_unlock(&dev->ext_dev_lock);
+}
+
+static int nvme_ext_queue_submit(struct nvme_ctrl *ctrl, u16 qid, u32 user_tag,
+				 struct nvme_command *command,
+				 struct nvme_ext_data_iter *iter)
+{
+	struct nvme_dev *dev = to_nvme_dev(ctrl);
+	struct nvme_queue *nvmeq = &dev->queues[qid];
+	struct nvme_ext_iod *iod;
+	int ret;
+
+	if (WARN_ON(!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)))
+		return -EINVAL;
+
+	if (list_empty(&nvmeq->ext.free_iods))
+		return -1;
+
+	iod = list_first_entry(&nvmeq->ext.free_iods,
+			       struct nvme_ext_iod, link);
+
+	list_move(&iod->link, &nvmeq->ext.used_iods);
+
+	command->common.command_id = cpu_to_le16(iod - nvmeq->ext.iods);
+	iod->user_tag = user_tag;
+
+	ret = nvme_ext_setup_iod(dev, iod, &command->common, iter);
+	if (ret) {
+		list_move(&iod->link, &nvmeq->ext.free_iods);
+		return ret;
+	}
+
+	nvmeq->ext.inflight++;
+	nvme_submit_cmd(nvmeq, command, true);
+	return 0;
+}
+
+static int nvme_ext_queue_poll(struct nvme_ctrl *ctrl, u16 qid,
+			       struct nvme_ext_cmd_result *results,
+			       unsigned int max_len)
+{
+	struct nvme_dev *dev = to_nvme_dev(ctrl);
+	struct nvme_queue *nvmeq = &dev->queues[qid];
+	u16 old_head;
+	int i, j;
+
+	if (WARN_ON(!test_bit(NVMEQ_EXTERNAL, &nvmeq->flags)))
+		return -EINVAL;
+
+	if (nvmeq->ext.inflight == 0)
+		return -1;
+
+	old_head = nvmeq->cq_head;
+
+	for (i = 0 ; nvme_cqe_pending(nvmeq) && i < max_len ; i++) {
+		u16 status = le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status);
+		u16 tag = le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].command_id);
+
+		results[i].status = status >> 1;
+		results[i].tag = (u32)tag;
+		nvme_update_cq_head(nvmeq);
+	}
+
+	if (old_head != nvmeq->cq_head)
+		nvme_ring_cq_doorbell(nvmeq);
+
+	for (j = 0 ; j < i ; j++) {
+		u16 tag = results[j].tag & 0xFFFF;
+		struct nvme_ext_iod *iod = &nvmeq->ext.iods[tag];
+
+		if (WARN_ON(tag >= nvmeq->q_depth || iod->nprps == -1))
+			continue;
+
+		results[j].tag = iod->user_tag;
+		nvme_ext_free_iod(dev, iod);
+		list_move(&iod->link, &nvmeq->ext.free_iods);
+		nvmeq->ext.inflight--;
+	}
+
+	WARN_ON(nvmeq->ext.inflight < 0);
+	return i;
+}
+
+static bool nvme_ext_queue_full(struct nvme_ctrl *ctrl, u16 qid)
+{
+	struct nvme_dev *dev = to_nvme_dev(ctrl);
+	struct nvme_queue *nvmeq = &dev->queues[qid];
+
+	return nvmeq->ext.inflight < nvmeq->q_depth - 1;
+}
+#endif
+
 static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
 {
 	*val = readl(to_nvme_dev(ctrl)->bar + off);
@@ -2652,13 +3000,25 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.name			= "pcie",
 	.module			= THIS_MODULE,
 	.flags			= NVME_F_METADATA_SUPPORTED |
-				  NVME_F_PCI_P2PDMA,
+				  NVME_F_PCI_P2PDMA |
+				  NVME_F_MDEV_SUPPORTED |
+				  NVME_F_MDEV_DMA_SUPPORTED,
+
 	.reg_read32		= nvme_pci_reg_read32,
 	.reg_write32		= nvme_pci_reg_write32,
 	.reg_read64		= nvme_pci_reg_read64,
 	.free_ctrl		= nvme_pci_free_ctrl,
 	.submit_async_event	= nvme_pci_submit_async_event,
 	.get_address		= nvme_pci_get_address,
+
+#ifdef CONFIG_NVME_MDEV
+	.ext_queues_available	= nvme_ext_queues_available,
+	.ext_queue_alloc	= nvme_ext_queue_alloc,
+	.ext_queue_free		= nvme_ext_queue_free,
+	.ext_queue_submit	= nvme_ext_queue_submit,
+	.ext_queue_poll		= nvme_ext_queue_poll,
+	.ext_queue_full		= nvme_ext_queue_full,
+#endif
 };
 
 static int nvme_dev_map(struct nvme_dev *dev)
@@ -2747,6 +3107,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
 	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
 	mutex_init(&dev->shutdown_lock);
+	mutex_init(&dev->ext_dev_lock);
 
 	result = nvme_setup_prp_pools(dev);
 	if (result)
diff --git a/drivers/nvme/mdev/host.c b/drivers/nvme/mdev/host.c
index 5766bad7e909..6590946b86c2 100644
--- a/drivers/nvme/mdev/host.c
+++ b/drivers/nvme/mdev/host.c
@@ -48,19 +48,21 @@ static struct nvme_mdev_hctrl *nvme_mdev_hctrl_create(struct nvme_ctrl *ctrl)
 		return NULL;
 	}
 
+	hctrl = kzalloc_node(sizeof(*hctrl), GFP_KERNEL,
+			     dev_to_node(ctrl->dev));
+	if (!hctrl)
+		return NULL;
+
 	nr_host_queues = ctrl->ops->ext_queues_available(ctrl);
 	max_lba_transfer = ctrl->max_hw_sectors >> (PAGE_SHIFT - 9);
 
 	if (nr_host_queues == 0) {
 		dev_info(ctrl->dev,
 			 "no support for mdev - no mdev reserved queues available");
+		kfree(hctrl);
 		return NULL;
 	}
 
-	hctrl = kzalloc_node(sizeof(*hctrl), GFP_KERNEL,
-			     dev_to_node(ctrl->dev));
-	if (!hctrl)
-		return NULL;
 	kref_init(&hctrl->ref);
 	mutex_init(&hctrl->lock);
@@ -180,6 +182,24 @@ void nvme_mdev_hctrl_hqs_unreserve(struct nvme_mdev_hctrl *hctrl,
 	mutex_unlock(&hctrl->lock);
 }
 
+/* Check if IO passthrough is supported for given IO optcode */
+bool nvme_mdev_hctrl_hq_check_op(struct nvme_mdev_hctrl *hctrl, u8 optcode)
+{
+	switch (optcode) {
+	case nvme_cmd_flush:
+	case nvme_cmd_read:
+	case nvme_cmd_write:
+		/* these are mandatory*/
+		return true;
+	case nvme_cmd_write_zeroes:
+		return (hctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES);
+	case nvme_cmd_dsm:
+		return (hctrl->oncs & NVME_CTRL_ONCS_DSM);
+	default:
+		return false;
+	}
+}
+
 /* Allocate a host IO queue */
 int nvme_mdev_hctrl_hq_alloc(struct nvme_mdev_hctrl *hctrl)
 {
@@ -204,23 +224,7 @@ bool nvme_mdev_hctrl_hq_can_submit(struct nvme_mdev_hctrl *hctrl, u16 qid)
 	return hctrl->nvme_ctrl->ops->ext_queue_full(hctrl->nvme_ctrl, qid);
 }
 
-/* Check if IO passthrough is supported for given IO optcode */
-bool nvme_mdev_hctrl_hq_check_op(struct nvme_mdev_hctrl *hctrl, u8 optcode)
-{
-	switch (optcode) {
-	case nvme_cmd_flush:
-	case nvme_cmd_read:
-	case nvme_cmd_write:
-		/* these are mandatory*/
-		return true;
-	case nvme_cmd_write_zeroes:
-		return (hctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES);
-	case nvme_cmd_dsm:
-		return (hctrl->oncs & NVME_CTRL_ONCS_DSM);
-	default:
-		return false;
-	}
-}
+
 
 /* Submit a IO passthrough command */
 int nvme_mdev_hctrl_hq_submit(struct nvme_mdev_hctrl *hctrl,
diff --git a/drivers/nvme/mdev/io.c b/drivers/nvme/mdev/io.c
index a731196d0365..59837540fec2 100644
--- a/drivers/nvme/mdev/io.c
+++ b/drivers/nvme/mdev/io.c
@@ -11,14 +11,16 @@
 #include <linux/ktime.h>
 #include "priv.h"
 
+
 struct io_ctx {
 	struct nvme_mdev_hctrl *hctrl;
 	struct nvme_mdev_vctrl *vctrl;
 
 	const struct nvme_command *in;
-	struct nvme_command out;
 	struct nvme_mdev_vns *ns;
 	struct nvme_ext_data_iter udatait;
+
+	struct nvme_command out;
 	struct nvme_ext_data_iter *kdatait;
 
 	ktime_t last_io_t;
@@ -28,6 +30,20 @@ struct io_ctx {
 	unsigned int arb_burst;
 };
 
+/* Check if we need to read a command from the admin queue */
+static bool nvme_mdev_adm_needs_processing(struct io_ctx *ctx)
+{
+	if (!timeout(ctx->last_admin_poll_time,
+		     ctx->vctrl->now, ctx->admin_poll_rate_ms))
+		return false;
+
+	if (nvme_mdev_vsq_has_data(ctx->vctrl, &ctx->vctrl->vsqs[0]))
+		return true;
+
+	ctx->last_admin_poll_time = ctx->vctrl->now;
+	return false;
+}
+
 /* Handle read/write command.*/
 static int nvme_mdev_io_translate_rw(struct io_ctx *ctx)
 {
@@ -229,6 +245,7 @@ static int nvme_mdev_io_translate_cmd(struct io_ctx *ctx)
 	}
 }
 
+/* process a user submission queue */
 static bool nvme_mdev_io_process_sq(struct io_ctx *ctx, u16 sqid)
 {
 	struct nvme_vsq *vsq = &ctx->vctrl->vsqs[sqid];
@@ -275,7 +292,13 @@ static bool nvme_mdev_io_process_sq(struct io_ctx *ctx, u16 sqid)
 	return true;
 }
 
-/* process host replies to the passed through commands */
+/* process a user completion queue */
+static void nvme_mdev_io_process_cq(struct io_ctx *ctx, u16 cqid)
+{
+	nvme_mdev_vcq_process(ctx->vctrl, cqid, true);
+}
+
+/* process hardware completion queue */
 static int nvme_mdev_io_process_hwq(struct io_ctx *ctx, u16 hwq)
 {
 	int n, i;
@@ -301,22 +324,9 @@ static int nvme_mdev_io_process_hwq(struct io_ctx *ctx, u16 hwq)
 	return n;
 }
 
-/* Check if we need to read a command from the admin queue */
-static bool nvme_mdev_adm_needs_processing(struct io_ctx *ctx)
-{
-	if (!timeout(ctx->last_admin_poll_time,
-		     ctx->vctrl->now, ctx->admin_poll_rate_ms))
-		return false;
-
-	if (nvme_mdev_vsq_has_data(ctx->vctrl, &ctx->vctrl->vsqs[0]))
-		return true;
-
-	ctx->last_admin_poll_time = ctx->vctrl->now;
-	return false;
-}
 
 /* do polling till one of events stops it */
-static void nvme_mdev_io_maintask(struct io_ctx *ctx)
+static void nvme_mdev_io_polling_loop(struct io_ctx *ctx)
 {
 	struct nvme_mdev_vctrl *vctrl = ctx->vctrl;
 	u16 i, cqid, sqid, hsqcnt;
@@ -353,7 +363,7 @@ static void nvme_mdev_io_maintask(struct io_ctx *ctx)
 		/* process the completions from the guest*/
 		cqid = 1;
 		for_each_set_bit_from(cqid, vctrl->vcq_en, MAX_VIRTUAL_QUEUES)
-			nvme_mdev_vcq_process(vctrl, cqid, true);
+			nvme_mdev_io_process_cq(ctx, cqid);
 
 		/* process the completions from the hardware*/
 		for (i = 0 ; i < hsqcnt ; i++)
@@ -470,7 +480,7 @@ static int nvme_mdev_io_polling_thread(void *data)
 		if (kthread_should_stop())
 			break;
 
-		nvme_mdev_io_maintask(&ctx);
+		nvme_mdev_io_polling_loop(&ctx);
 	}
 
 	_DBG(ctx.vctrl, "IO: iothread stopped\n");
diff --git a/drivers/nvme/mdev/mmio.c b/drivers/nvme/mdev/mmio.c
index cf03c1f22f4c..a80962bf4a3d 100644
--- a/drivers/nvme/mdev/mmio.c
+++ b/drivers/nvme/mdev/mmio.c
@@ -54,9 +54,6 @@ static const struct vm_operations_struct nvme_mdev_mmio_dbs_vm_ops = {
 bool nvme_mdev_mmio_db_check(struct nvme_mdev_vctrl *vctrl,
 			     u16 qid, u16 size, u16 db)
 {
-	if (get_current() != vctrl->iothread)
-		lockdep_assert_held(&vctrl->lock);
-
 	if (db < size)
 		return true;
 	if (qid == 0) {
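The PRP handling in nvme_ext_setup_iod()/nvme_ext_free_iod() above follows the usual NVMe PRP layout: the first data pointer goes into PRP1, a two-page transfer puts the second pointer into PRP2, and anything larger turns PRP2 into a pointer to a chain of PRP list pages whose last slot links to the next list. The standalone program below is only a sketch of that accounting; the helper name, the 4K-page assumption and the example transfer sizes are illustrative and not taken from the patch.

#include <stdio.h>

/*
 * Sketch: how many PRP list pages a transfer needs under the chained
 * PRP scheme used by nvme_ext_setup_iod(). entries_per_list is
 * page_size / 8 (512 for 4K pages); every list except the last gives
 * up its final slot to point at the next list.
 */
static int prp_lists_needed(int ndata_pages, int entries_per_list)
{
	int remaining, lists = 0;

	if (ndata_pages <= 2)		/* PRP1, plus PRP2 for a 2nd page */
		return 0;

	remaining = ndata_pages - 1;	/* the first page lives in PRP1 */
	while (remaining > entries_per_list) {
		remaining -= entries_per_list - 1;	/* last slot chains */
		lists++;
	}
	return lists + 1;
}

int main(void)
{
	int sizes[] = { 1, 2, 3, 512, 513, 1024 };
	int i;

	for (i = 0; i < 6; i++)
		printf("%4d data pages -> %d PRP list page(s)\n",
		       sizes[i], prp_lists_needed(sizes[i], 512));
	return 0;
}

nvme_ext_free_iod() relies on the same chaining when it tears an IOD down: it recovers each list's DMA handle from the last slot of the previous list before handing the pages back to the DMA pool.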
Note that currently the number of hw queues reserved for mdev
has to be predetermined at module load time.

(I used to allocate the queues dynamically on demand, but
recent changes to the allocation of polled/read queues made
this somewhat difficult, so I dropped this for now.)

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 drivers/nvme/host/pci.c  | 375 ++++++++++++++++++++++++++++++++++++++-
 drivers/nvme/mdev/host.c |  46 ++---
 drivers/nvme/mdev/io.c   |  46 +++--
 drivers/nvme/mdev/mmio.c |   3 -
 4 files changed, 421 insertions(+), 49 deletions(-)
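For illustration, the queue-budget split from the nvme_setup_irqs() hunk earlier in this patch can be lifted into a standalone program: mdev queues are carved out of the same non-interrupt budget as the polled queues, which is why the reservation has to be fixed when the module is loaded (e.g. something along the lines of "modprobe nvme poll_queues=2 mdev_queues=2", using the parameter names this patch introduces). The wrapper function and the example numbers below are illustrative only, not part of the driver.

#include <stdio.h>

/*
 * Sketch of the arithmetic in the nvme_setup_irqs() hunk: poll_queues
 * and mdev_queues share the non-interrupt budget, and mdev_queues is
 * capped if that budget is too small.
 */
static void split_queues(int nr_io_queues, int poll_queues, int mdev_queues,
			 int *out_poll, int *out_mdev, int *out_irq)
{
	int this_p_queues = poll_queues + mdev_queues;
	int irq_queues;

	if (this_p_queues >= nr_io_queues) {
		this_p_queues = nr_io_queues - 1;
		irq_queues = 1;
	} else {
		irq_queues = nr_io_queues - this_p_queues + 1;
	}

	if (mdev_queues > this_p_queues) {
		mdev_queues = this_p_queues;
		this_p_queues = 0;
	} else {
		this_p_queues -= mdev_queues;
	}

	*out_poll = this_p_queues;
	*out_mdev = mdev_queues;
	*out_irq = irq_queues;
}

int main(void)
{
	int poll, mdev, irq;

	/* e.g. 8 I/O queues total, 2 polled, 2 reserved for mdev */
	split_queues(8, 2, 2, &poll, &mdev, &irq);
	printf("poll=%d mdev=%d irq=%d\n", poll, mdev, irq);
	return 0;
}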