Message ID | 20210901131434.31158-1-mgurtovoy@nvidia.com (mailing list archive) |
---|---|
State | New, archived |
Series | [v3,1/1] virtio-blk: avoid preallocating big SGL for data |
On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote: > No need to pre-allocate a big buffer for the IO SGL anymore. If a device > has lots of deep queues, preallocation for the sg list can consume > substantial amounts of memory. For HW virtio-blk device, nr_hw_queues > can be 64 or 128 and each queue's depth might be 128. This means the > resulting preallocation for the data SGLs is big. > > Switch to runtime allocation for SGL for lists longer than 2 entries. > This is the approach used by NVMe drivers so it should be reasonable for > virtio block as well. Runtime SGL allocation has always been the case > for the legacy I/O path so this is nothing new. > > The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't > support SG_CHAIN, use only runtime allocation for the SGL. > > Re-organize the setup of the IO request to fit the new sg chain > mechanism. > > No performance degradation was seen (fio libaio engine with 16 jobs and > 128 iodepth): > > IO size IOPs Rand Read (before/after) IOPs Rand Write (before/after) > -------- --------------------------------- ---------------------------------- > 512B 318K/316K 329K/325K > > 4KB 323K/321K 353K/349K > > 16KB 199K/208K 250K/275K > > 128KB 36K/36.1K 39.2K/41.7K > > Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> > Reviewed-by: Israel Rukshin <israelr@nvidia.com> Could you use something to give confidence intervals maybe? As it is it looks like a 1-2% regression for 512B and 4KB. > --- > > changes from V2: > - initialize vbr->out_hdr.sector during virtblk_setup_cmd > > changes from V1: > - Kconfig update (from Christoph) > - Re-order cmd setup (from Christoph) > - use flexible sg pointer in the cmd (from Christoph) > - added perf numbers to commit msg (from Feng Li) > > --- > drivers/block/Kconfig | 1 + > drivers/block/virtio_blk.c | 155 +++++++++++++++++++++++-------------- > 2 files changed, 100 insertions(+), 56 deletions(-) > > diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig > index 63056cfd4b62..ca25a122b8ee 100644 > --- a/drivers/block/Kconfig > +++ b/drivers/block/Kconfig > @@ -395,6 +395,7 @@ config XEN_BLKDEV_BACKEND > config VIRTIO_BLK > tristate "Virtio block driver" > depends on VIRTIO > + select SG_POOL > help > This is the virtual block driver for virtio. It can be used with > QEMU based VMMs (like KVM or Xen). Say Y or M. 
> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c > index 9332fc4e9b31..bdd6d415bd20 100644 > --- a/drivers/block/virtio_blk.c > +++ b/drivers/block/virtio_blk.c > @@ -24,6 +24,12 @@ > /* The maximum number of sg elements that fit into a virtqueue */ > #define VIRTIO_BLK_MAX_SG_ELEMS 32768 > > +#ifdef CONFIG_ARCH_NO_SG_CHAIN > +#define VIRTIO_BLK_INLINE_SG_CNT 0 > +#else > +#define VIRTIO_BLK_INLINE_SG_CNT 2 > +#endif > + > static int virtblk_queue_count_set(const char *val, > const struct kernel_param *kp) > { > @@ -93,6 +99,7 @@ struct virtio_blk { > struct virtblk_req { > struct virtio_blk_outhdr out_hdr; > u8 status; > + struct sg_table sg_table; > struct scatterlist sg[]; > }; > > @@ -178,15 +185,94 @@ static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap) > return 0; > } > > -static inline void virtblk_request_done(struct request *req) > +static void virtblk_unmap_data(struct request *req, struct virtblk_req *vbr) > { > - struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); > + if (blk_rq_nr_phys_segments(req)) > + sg_free_table_chained(&vbr->sg_table, > + VIRTIO_BLK_INLINE_SG_CNT); > +} > + > +static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req, > + struct virtblk_req *vbr) > +{ > + int err; > + > + if (!blk_rq_nr_phys_segments(req)) > + return 0; > + > + vbr->sg_table.sgl = vbr->sg; > + err = sg_alloc_table_chained(&vbr->sg_table, > + blk_rq_nr_phys_segments(req), > + vbr->sg_table.sgl, > + VIRTIO_BLK_INLINE_SG_CNT); > + if (unlikely(err)) > + return -ENOMEM; > > + return blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl); > +} > + > +static void virtblk_cleanup_cmd(struct request *req) > +{ > if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { > kfree(page_address(req->special_vec.bv_page) + > req->special_vec.bv_offset); > } > +} > + > +static int virtblk_setup_cmd(struct virtio_device *vdev, struct request *req, > + struct virtblk_req *vbr) > +{ > + bool unmap = false; > + u32 type; > + > + vbr->out_hdr.sector = 0; > + > + switch (req_op(req)) { > + case REQ_OP_READ: > + type = VIRTIO_BLK_T_IN; > + vbr->out_hdr.sector = cpu_to_virtio64(vdev, > + blk_rq_pos(req)); > + break; > + case REQ_OP_WRITE: > + type = VIRTIO_BLK_T_OUT; > + vbr->out_hdr.sector = cpu_to_virtio64(vdev, > + blk_rq_pos(req)); > + break; > + case REQ_OP_FLUSH: > + type = VIRTIO_BLK_T_FLUSH; > + break; > + case REQ_OP_DISCARD: > + type = VIRTIO_BLK_T_DISCARD; > + break; > + case REQ_OP_WRITE_ZEROES: > + type = VIRTIO_BLK_T_WRITE_ZEROES; > + unmap = !(req->cmd_flags & REQ_NOUNMAP); > + break; > + case REQ_OP_DRV_IN: > + type = VIRTIO_BLK_T_GET_ID; > + break; > + default: > + WARN_ON_ONCE(1); > + return BLK_STS_IOERR; > + } > > + vbr->out_hdr.type = cpu_to_virtio32(vdev, type); > + vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req)); > + > + if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) { > + if (virtblk_setup_discard_write_zeroes(req, unmap)) > + return BLK_STS_RESOURCE; > + } > + > + return 0; > +} > + > +static inline void virtblk_request_done(struct request *req) > +{ > + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); > + > + virtblk_unmap_data(req, vbr); > + virtblk_cleanup_cmd(req); > blk_mq_end_request(req, virtblk_result(vbr)); > } > > @@ -244,57 +330,23 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, > int qid = hctx->queue_num; > int err; > bool notify = false; > - bool unmap = false; > - u32 type; > > BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); > > - switch (req_op(req)) { > - case 
REQ_OP_READ: > - case REQ_OP_WRITE: > - type = 0; > - break; > - case REQ_OP_FLUSH: > - type = VIRTIO_BLK_T_FLUSH; > - break; > - case REQ_OP_DISCARD: > - type = VIRTIO_BLK_T_DISCARD; > - break; > - case REQ_OP_WRITE_ZEROES: > - type = VIRTIO_BLK_T_WRITE_ZEROES; > - unmap = !(req->cmd_flags & REQ_NOUNMAP); > - break; > - case REQ_OP_DRV_IN: > - type = VIRTIO_BLK_T_GET_ID; > - break; > - default: > - WARN_ON_ONCE(1); > - return BLK_STS_IOERR; > - } > - > - vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type); > - vbr->out_hdr.sector = type ? > - 0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req)); > - vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req)); > + err = virtblk_setup_cmd(vblk->vdev, req, vbr); > + if (unlikely(err)) > + return err; > > blk_mq_start_request(req); > > - if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) { > - err = virtblk_setup_discard_write_zeroes(req, unmap); > - if (err) > - return BLK_STS_RESOURCE; > - } > - > - num = blk_rq_map_sg(hctx->queue, req, vbr->sg); > - if (num) { > - if (rq_data_dir(req) == WRITE) > - vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT); > - else > - vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN); > + num = virtblk_map_data(hctx, req, vbr); > + if (unlikely(num < 0)) { > + virtblk_cleanup_cmd(req); > + return BLK_STS_RESOURCE; > } > > spin_lock_irqsave(&vblk->vqs[qid].lock, flags); > - err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num); > + err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg_table.sgl, num); > if (err) { > virtqueue_kick(vblk->vqs[qid].vq); > /* Don't stop the queue if -ENOMEM: we may have failed to > @@ -303,6 +355,8 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, > if (err == -ENOSPC) > blk_mq_stop_hw_queue(hctx); > spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); > + virtblk_unmap_data(req, vbr); > + virtblk_cleanup_cmd(req); > switch (err) { > case -ENOSPC: > return BLK_STS_DEV_RESOURCE; > @@ -681,16 +735,6 @@ static const struct attribute_group *virtblk_attr_groups[] = { > NULL, > }; > > -static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq, > - unsigned int hctx_idx, unsigned int numa_node) > -{ > - struct virtio_blk *vblk = set->driver_data; > - struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq); > - > - sg_init_table(vbr->sg, vblk->sg_elems); > - return 0; > -} > - > static int virtblk_map_queues(struct blk_mq_tag_set *set) > { > struct virtio_blk *vblk = set->driver_data; > @@ -703,7 +747,6 @@ static const struct blk_mq_ops virtio_mq_ops = { > .queue_rq = virtio_queue_rq, > .commit_rqs = virtio_commit_rqs, > .complete = virtblk_request_done, > - .init_request = virtblk_init_request, > .map_queues = virtblk_map_queues, > }; > > @@ -783,7 +826,7 @@ static int virtblk_probe(struct virtio_device *vdev) > vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; > vblk->tag_set.cmd_size = > sizeof(struct virtblk_req) + > - sizeof(struct scatterlist) * sg_elems; > + sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT; > vblk->tag_set.driver_data = vblk; > vblk->tag_set.nr_hw_queues = vblk->num_vqs; > > -- > 2.18.1
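For scale, a rough estimate of the preallocation the commit message calls big, compared with the 2-entry inline SGL kept per request after the patch. All figures are assumptions for illustration only (128 queues of depth 128, a device seg_max of 254, and roughly 32 bytes per struct scatterlist entry); they are not taken from the thread:

\[ \underbrace{128 \times 128}_{\text{requests}} \times 254 \times 32\,\mathrm{B} \approx 127\,\mathrm{MiB} \qquad \text{vs.} \qquad 128 \times 128 \times 2 \times 32\,\mathrm{B} = 1\,\mathrm{MiB} \]

Anything beyond the two inline entries is allocated at runtime from the SG pool, so the per-request footprint no longer scales with seg_max.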
On 9/1/2021 5:50 PM, Michael S. Tsirkin wrote: > On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote: >> No need to pre-allocate a big buffer for the IO SGL anymore. If a device >> has lots of deep queues, preallocation for the sg list can consume >> substantial amounts of memory. For HW virtio-blk device, nr_hw_queues >> can be 64 or 128 and each queue's depth might be 128. This means the >> resulting preallocation for the data SGLs is big. >> >> Switch to runtime allocation for SGL for lists longer than 2 entries. >> This is the approach used by NVMe drivers so it should be reasonable for >> virtio block as well. Runtime SGL allocation has always been the case >> for the legacy I/O path so this is nothing new. >> >> The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't >> support SG_CHAIN, use only runtime allocation for the SGL. >> >> Re-organize the setup of the IO request to fit the new sg chain >> mechanism. >> >> No performance degradation was seen (fio libaio engine with 16 jobs and >> 128 iodepth): >> >> IO size IOPs Rand Read (before/after) IOPs Rand Write (before/after) >> -------- --------------------------------- ---------------------------------- >> 512B 318K/316K 329K/325K >> >> 4KB 323K/321K 353K/349K >> >> 16KB 199K/208K 250K/275K >> >> 128KB 36K/36.1K 39.2K/41.7K >> >> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> >> Reviewed-by: Israel Rukshin <israelr@nvidia.com> > Could you use something to give confidence intervals maybe? > As it is it looks like a 1-2% regression for 512B and 4KB. 1%-2% is not a regression. It's a device/env/test variance. This is just one test results. I run it many times and got difference by +/- 2%-3% in each run for each sides. Even if I run same driver without changes I get 2%-3% difference between runs. If you have a perf test suite for virtio-blk it will be great if you can run it, or maybe Feng Li has. > > > >> --- >> >> changes from V2: >> - initialize vbr->out_hdr.sector during virtblk_setup_cmd >> >> changes from V1: >> - Kconfig update (from Christoph) >> - Re-order cmd setup (from Christoph) >> - use flexible sg pointer in the cmd (from Christoph) >> - added perf numbers to commit msg (from Feng Li) >> >> --- >> drivers/block/Kconfig | 1 + >> drivers/block/virtio_blk.c | 155 +++++++++++++++++++++++-------------- >> 2 files changed, 100 insertions(+), 56 deletions(-) >> >> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig >> index 63056cfd4b62..ca25a122b8ee 100644 >> --- a/drivers/block/Kconfig >> +++ b/drivers/block/Kconfig >> @@ -395,6 +395,7 @@ config XEN_BLKDEV_BACKEND >> config VIRTIO_BLK >> tristate "Virtio block driver" >> depends on VIRTIO >> + select SG_POOL >> help >> This is the virtual block driver for virtio. It can be used with >> QEMU based VMMs (like KVM or Xen). Say Y or M. 
>> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c >> index 9332fc4e9b31..bdd6d415bd20 100644 >> --- a/drivers/block/virtio_blk.c >> +++ b/drivers/block/virtio_blk.c >> @@ -24,6 +24,12 @@ >> /* The maximum number of sg elements that fit into a virtqueue */ >> #define VIRTIO_BLK_MAX_SG_ELEMS 32768 >> >> +#ifdef CONFIG_ARCH_NO_SG_CHAIN >> +#define VIRTIO_BLK_INLINE_SG_CNT 0 >> +#else >> +#define VIRTIO_BLK_INLINE_SG_CNT 2 >> +#endif >> + >> static int virtblk_queue_count_set(const char *val, >> const struct kernel_param *kp) >> { >> @@ -93,6 +99,7 @@ struct virtio_blk { >> struct virtblk_req { >> struct virtio_blk_outhdr out_hdr; >> u8 status; >> + struct sg_table sg_table; >> struct scatterlist sg[]; >> }; >> >> @@ -178,15 +185,94 @@ static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap) >> return 0; >> } >> >> -static inline void virtblk_request_done(struct request *req) >> +static void virtblk_unmap_data(struct request *req, struct virtblk_req *vbr) >> { >> - struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); >> + if (blk_rq_nr_phys_segments(req)) >> + sg_free_table_chained(&vbr->sg_table, >> + VIRTIO_BLK_INLINE_SG_CNT); >> +} >> + >> +static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req, >> + struct virtblk_req *vbr) >> +{ >> + int err; >> + >> + if (!blk_rq_nr_phys_segments(req)) >> + return 0; >> + >> + vbr->sg_table.sgl = vbr->sg; >> + err = sg_alloc_table_chained(&vbr->sg_table, >> + blk_rq_nr_phys_segments(req), >> + vbr->sg_table.sgl, >> + VIRTIO_BLK_INLINE_SG_CNT); >> + if (unlikely(err)) >> + return -ENOMEM; >> >> + return blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl); >> +} >> + >> +static void virtblk_cleanup_cmd(struct request *req) >> +{ >> if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { >> kfree(page_address(req->special_vec.bv_page) + >> req->special_vec.bv_offset); >> } >> +} >> + >> +static int virtblk_setup_cmd(struct virtio_device *vdev, struct request *req, >> + struct virtblk_req *vbr) >> +{ >> + bool unmap = false; >> + u32 type; >> + >> + vbr->out_hdr.sector = 0; >> + >> + switch (req_op(req)) { >> + case REQ_OP_READ: >> + type = VIRTIO_BLK_T_IN; >> + vbr->out_hdr.sector = cpu_to_virtio64(vdev, >> + blk_rq_pos(req)); >> + break; >> + case REQ_OP_WRITE: >> + type = VIRTIO_BLK_T_OUT; >> + vbr->out_hdr.sector = cpu_to_virtio64(vdev, >> + blk_rq_pos(req)); >> + break; >> + case REQ_OP_FLUSH: >> + type = VIRTIO_BLK_T_FLUSH; >> + break; >> + case REQ_OP_DISCARD: >> + type = VIRTIO_BLK_T_DISCARD; >> + break; >> + case REQ_OP_WRITE_ZEROES: >> + type = VIRTIO_BLK_T_WRITE_ZEROES; >> + unmap = !(req->cmd_flags & REQ_NOUNMAP); >> + break; >> + case REQ_OP_DRV_IN: >> + type = VIRTIO_BLK_T_GET_ID; >> + break; >> + default: >> + WARN_ON_ONCE(1); >> + return BLK_STS_IOERR; >> + } >> >> + vbr->out_hdr.type = cpu_to_virtio32(vdev, type); >> + vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req)); >> + >> + if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) { >> + if (virtblk_setup_discard_write_zeroes(req, unmap)) >> + return BLK_STS_RESOURCE; >> + } >> + >> + return 0; >> +} >> + >> +static inline void virtblk_request_done(struct request *req) >> +{ >> + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); >> + >> + virtblk_unmap_data(req, vbr); >> + virtblk_cleanup_cmd(req); >> blk_mq_end_request(req, virtblk_result(vbr)); >> } >> >> @@ -244,57 +330,23 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, >> int qid = hctx->queue_num; >> int err; >> bool notify = false; >> - 
bool unmap = false; >> - u32 type; >> >> BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); >> >> - switch (req_op(req)) { >> - case REQ_OP_READ: >> - case REQ_OP_WRITE: >> - type = 0; >> - break; >> - case REQ_OP_FLUSH: >> - type = VIRTIO_BLK_T_FLUSH; >> - break; >> - case REQ_OP_DISCARD: >> - type = VIRTIO_BLK_T_DISCARD; >> - break; >> - case REQ_OP_WRITE_ZEROES: >> - type = VIRTIO_BLK_T_WRITE_ZEROES; >> - unmap = !(req->cmd_flags & REQ_NOUNMAP); >> - break; >> - case REQ_OP_DRV_IN: >> - type = VIRTIO_BLK_T_GET_ID; >> - break; >> - default: >> - WARN_ON_ONCE(1); >> - return BLK_STS_IOERR; >> - } >> - >> - vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type); >> - vbr->out_hdr.sector = type ? >> - 0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req)); >> - vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req)); >> + err = virtblk_setup_cmd(vblk->vdev, req, vbr); >> + if (unlikely(err)) >> + return err; >> >> blk_mq_start_request(req); >> >> - if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) { >> - err = virtblk_setup_discard_write_zeroes(req, unmap); >> - if (err) >> - return BLK_STS_RESOURCE; >> - } >> - >> - num = blk_rq_map_sg(hctx->queue, req, vbr->sg); >> - if (num) { >> - if (rq_data_dir(req) == WRITE) >> - vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT); >> - else >> - vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN); >> + num = virtblk_map_data(hctx, req, vbr); >> + if (unlikely(num < 0)) { >> + virtblk_cleanup_cmd(req); >> + return BLK_STS_RESOURCE; >> } >> >> spin_lock_irqsave(&vblk->vqs[qid].lock, flags); >> - err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num); >> + err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg_table.sgl, num); >> if (err) { >> virtqueue_kick(vblk->vqs[qid].vq); >> /* Don't stop the queue if -ENOMEM: we may have failed to >> @@ -303,6 +355,8 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, >> if (err == -ENOSPC) >> blk_mq_stop_hw_queue(hctx); >> spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); >> + virtblk_unmap_data(req, vbr); >> + virtblk_cleanup_cmd(req); >> switch (err) { >> case -ENOSPC: >> return BLK_STS_DEV_RESOURCE; >> @@ -681,16 +735,6 @@ static const struct attribute_group *virtblk_attr_groups[] = { >> NULL, >> }; >> >> -static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq, >> - unsigned int hctx_idx, unsigned int numa_node) >> -{ >> - struct virtio_blk *vblk = set->driver_data; >> - struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq); >> - >> - sg_init_table(vbr->sg, vblk->sg_elems); >> - return 0; >> -} >> - >> static int virtblk_map_queues(struct blk_mq_tag_set *set) >> { >> struct virtio_blk *vblk = set->driver_data; >> @@ -703,7 +747,6 @@ static const struct blk_mq_ops virtio_mq_ops = { >> .queue_rq = virtio_queue_rq, >> .commit_rqs = virtio_commit_rqs, >> .complete = virtblk_request_done, >> - .init_request = virtblk_init_request, >> .map_queues = virtblk_map_queues, >> }; >> >> @@ -783,7 +826,7 @@ static int virtblk_probe(struct virtio_device *vdev) >> vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; >> vblk->tag_set.cmd_size = >> sizeof(struct virtblk_req) + >> - sizeof(struct scatterlist) * sg_elems; >> + sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT; >> vblk->tag_set.driver_data = vblk; >> vblk->tag_set.nr_hw_queues = vblk->num_vqs; >> >> -- >> 2.18.1
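The confidence interval asked for above would, for example, come from repeating each fio configuration n times and reporting the sample mean of the IOPS with a 95% interval; this is the standard formula, not something from the thread:

\[ \bar{x} \pm t_{n-1,\,0.975}\,\frac{s}{\sqrt{n}} \]

where \(\bar{x}\) is the mean over the n runs and s their sample standard deviation. With the +/-2-3% run-to-run variance described here, several runs per configuration are needed before a 1-2% delta can be distinguished from noise.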
On 9/1/21 8:58 AM, Max Gurtovoy wrote: > > On 9/1/2021 5:50 PM, Michael S. Tsirkin wrote: >> On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote: >>> No need to pre-allocate a big buffer for the IO SGL anymore. If a device >>> has lots of deep queues, preallocation for the sg list can consume >>> substantial amounts of memory. For HW virtio-blk device, nr_hw_queues >>> can be 64 or 128 and each queue's depth might be 128. This means the >>> resulting preallocation for the data SGLs is big. >>> >>> Switch to runtime allocation for SGL for lists longer than 2 entries. >>> This is the approach used by NVMe drivers so it should be reasonable for >>> virtio block as well. Runtime SGL allocation has always been the case >>> for the legacy I/O path so this is nothing new. >>> >>> The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't >>> support SG_CHAIN, use only runtime allocation for the SGL. >>> >>> Re-organize the setup of the IO request to fit the new sg chain >>> mechanism. >>> >>> No performance degradation was seen (fio libaio engine with 16 jobs and >>> 128 iodepth): >>> >>> IO size IOPs Rand Read (before/after) IOPs Rand Write (before/after) >>> -------- --------------------------------- ---------------------------------- >>> 512B 318K/316K 329K/325K >>> >>> 4KB 323K/321K 353K/349K >>> >>> 16KB 199K/208K 250K/275K >>> >>> 128KB 36K/36.1K 39.2K/41.7K >>> >>> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> >>> Reviewed-by: Israel Rukshin <israelr@nvidia.com> >> Could you use something to give confidence intervals maybe? >> As it is it looks like a 1-2% regression for 512B and 4KB. > > 1%-2% is not a regression. It's a device/env/test variance. > > This is just one test results. I run it many times and got difference by > +/- 2%-3% in each run for each sides. > > Even if I run same driver without changes I get 2%-3% difference between > runs. > > If you have a perf test suite for virtio-blk it will be great if you can > run it, or maybe Feng Li has. You're adding an allocation to the hot path, and a free to the completion hot path. It's not unreasonable to expect that there could be performance implications associated with that. Which would be particularly evident with 1 segment requests, as the results would seem to indicate as well. Probably needs better testing. A profile of a peak run before and after and a diff of the two might also be interesting. The common idiom for situations like this is to have an inline part that holds 1-2 segments, and then only punt to alloc if you need more than that. As the number of segments grows, the cost per request matters less.
On 9/1/2021 6:27 PM, Jens Axboe wrote: > On 9/1/21 8:58 AM, Max Gurtovoy wrote: >> On 9/1/2021 5:50 PM, Michael S. Tsirkin wrote: >>> On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote: >>>> No need to pre-allocate a big buffer for the IO SGL anymore. If a device >>>> has lots of deep queues, preallocation for the sg list can consume >>>> substantial amounts of memory. For HW virtio-blk device, nr_hw_queues >>>> can be 64 or 128 and each queue's depth might be 128. This means the >>>> resulting preallocation for the data SGLs is big. >>>> >>>> Switch to runtime allocation for SGL for lists longer than 2 entries. >>>> This is the approach used by NVMe drivers so it should be reasonable for >>>> virtio block as well. Runtime SGL allocation has always been the case >>>> for the legacy I/O path so this is nothing new. >>>> >>>> The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't >>>> support SG_CHAIN, use only runtime allocation for the SGL. >>>> >>>> Re-organize the setup of the IO request to fit the new sg chain >>>> mechanism. >>>> >>>> No performance degradation was seen (fio libaio engine with 16 jobs and >>>> 128 iodepth): >>>> >>>> IO size IOPs Rand Read (before/after) IOPs Rand Write (before/after) >>>> -------- --------------------------------- ---------------------------------- >>>> 512B 318K/316K 329K/325K >>>> >>>> 4KB 323K/321K 353K/349K >>>> >>>> 16KB 199K/208K 250K/275K >>>> >>>> 128KB 36K/36.1K 39.2K/41.7K >>>> >>>> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> >>>> Reviewed-by: Israel Rukshin <israelr@nvidia.com> >>> Could you use something to give confidence intervals maybe? >>> As it is it looks like a 1-2% regression for 512B and 4KB. >> 1%-2% is not a regression. It's a device/env/test variance. >> >> This is just one test results. I run it many times and got difference by >> +/- 2%-3% in each run for each sides. >> >> Even if I run same driver without changes I get 2%-3% difference between >> runs. >> >> If you have a perf test suite for virtio-blk it will be great if you can >> run it, or maybe Feng Li has. > You're adding an allocation to the hot path, and a free to the > completion hot path. It's not unreasonable to expect that there could be > performance implications associated with that. Which would be > particularly evident with 1 segment requests, as the results would seem > to indicate as well. but for sg_nents <= 2 there is no dynamic allocation also in this patch exactly as we do in nvmf RDMA and FC for example. > > Probably needs better testing. A profile of a peak run before and after > and a diff of the two might also be interesting. I'll run ezfio test suite with stronger virtio-blk device that reach > 800KIOPs > > The common idiom for situations like this is to have an inline part that > holds 1-2 segments, and then only punt to alloc if you need more than > that. As the number of segments grows, the cost per request matters > less. isn't this the case here ? or am I missing something ?
On 9/1/21 4:25 PM, Max Gurtovoy wrote: > > On 9/1/2021 6:27 PM, Jens Axboe wrote: >> On 9/1/21 8:58 AM, Max Gurtovoy wrote: >>> On 9/1/2021 5:50 PM, Michael S. Tsirkin wrote: >>>> On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote: >>>>> No need to pre-allocate a big buffer for the IO SGL anymore. If a device >>>>> has lots of deep queues, preallocation for the sg list can consume >>>>> substantial amounts of memory. For HW virtio-blk device, nr_hw_queues >>>>> can be 64 or 128 and each queue's depth might be 128. This means the >>>>> resulting preallocation for the data SGLs is big. >>>>> >>>>> Switch to runtime allocation for SGL for lists longer than 2 entries. >>>>> This is the approach used by NVMe drivers so it should be reasonable for >>>>> virtio block as well. Runtime SGL allocation has always been the case >>>>> for the legacy I/O path so this is nothing new. >>>>> >>>>> The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't >>>>> support SG_CHAIN, use only runtime allocation for the SGL. >>>>> >>>>> Re-organize the setup of the IO request to fit the new sg chain >>>>> mechanism. >>>>> >>>>> No performance degradation was seen (fio libaio engine with 16 jobs and >>>>> 128 iodepth): >>>>> >>>>> IO size IOPs Rand Read (before/after) IOPs Rand Write (before/after) >>>>> -------- --------------------------------- ---------------------------------- >>>>> 512B 318K/316K 329K/325K >>>>> >>>>> 4KB 323K/321K 353K/349K >>>>> >>>>> 16KB 199K/208K 250K/275K >>>>> >>>>> 128KB 36K/36.1K 39.2K/41.7K >>>>> >>>>> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> >>>>> Reviewed-by: Israel Rukshin <israelr@nvidia.com> >>>> Could you use something to give confidence intervals maybe? >>>> As it is it looks like a 1-2% regression for 512B and 4KB. >>> 1%-2% is not a regression. It's a device/env/test variance. >>> >>> This is just one test results. I run it many times and got difference by >>> +/- 2%-3% in each run for each sides. >>> >>> Even if I run same driver without changes I get 2%-3% difference between >>> runs. >>> >>> If you have a perf test suite for virtio-blk it will be great if you can >>> run it, or maybe Feng Li has. >> You're adding an allocation to the hot path, and a free to the >> completion hot path. It's not unreasonable to expect that there could be >> performance implications associated with that. Which would be >> particularly evident with 1 segment requests, as the results would seem >> to indicate as well. > > but for sg_nents <= 2 there is no dynamic allocation also in this patch > exactly as we do in nvmf RDMA and FC for example. My quick read missed that, which is why you're using chaining. Then it looks very reasonable to me. >> Probably needs better testing. A profile of a peak run before and after >> and a diff of the two might also be interesting. > > I'll run ezfio test suite with stronger virtio-blk device that reach > > 800KIOPs That'd be better, and preferably a test with pinning etc so that you can show more consistent results. Just reading your table does indeed look like there's a performance degradation, even if you preface it by saying there is none. It would need better explaining, but preferably better testing. >> The common idiom for situations like this is to have an inline part that >> holds 1-2 segments, and then only punt to alloc if you need more than >> that. As the number of segments grows, the cost per request matters >> less. > > isn't this the case here ? or am I missing something ? 
it totally is, I was the one missing that.
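For readers following along, a minimal sketch of the inline-SGL idiom Jens describes and the patch implements: keep a couple of scatterlist entries inline in the per-request PDU and let sg_alloc_table_chained() fall back to the SG pool only for larger requests. The struct and function names below are illustrative, not from the driver; the sg_*_chained() helpers require SG_POOL, which is why the patch selects it in Kconfig.

#include <linux/scatterlist.h>

#define EXAMPLE_INLINE_SG_CNT	2

struct example_req {
	struct sg_table sg_table;
	struct scatterlist sg[EXAMPLE_INLINE_SG_CNT];	/* inline entries */
};

static int example_map(struct example_req *r, unsigned int nr_segs)
{
	r->sg_table.sgl = r->sg;
	/* No dynamic allocation when nr_segs <= EXAMPLE_INLINE_SG_CNT;
	 * larger tables are chained out of the SG pool at runtime. */
	return sg_alloc_table_chained(&r->sg_table, nr_segs,
				      r->sg_table.sgl, EXAMPLE_INLINE_SG_CNT);
}

static void example_unmap(struct example_req *r)
{
	/* Frees only what was chained; the inline entries are untouched. */
	sg_free_table_chained(&r->sg_table, EXAMPLE_INLINE_SG_CNT);
}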
On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote: > No need to pre-allocate a big buffer for the IO SGL anymore. If a device > has lots of deep queues, preallocation for the sg list can consume > substantial amounts of memory. For HW virtio-blk device, nr_hw_queues > can be 64 or 128 and each queue's depth might be 128. This means the > resulting preallocation for the data SGLs is big. > > Switch to runtime allocation for SGL for lists longer than 2 entries. > This is the approach used by NVMe drivers so it should be reasonable for > virtio block as well. Runtime SGL allocation has always been the case > for the legacy I/O path so this is nothing new. > > The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't > support SG_CHAIN, use only runtime allocation for the SGL. > > Re-organize the setup of the IO request to fit the new sg chain > mechanism. > > No performance degradation was seen (fio libaio engine with 16 jobs and > 128 iodepth): > > IO size IOPs Rand Read (before/after) IOPs Rand Write (before/after) > -------- --------------------------------- ---------------------------------- > 512B 318K/316K 329K/325K > > 4KB 323K/321K 353K/349K > > 16KB 199K/208K 250K/275K > > 128KB 36K/36.1K 39.2K/41.7K > > Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> > Reviewed-by: Israel Rukshin <israelr@nvidia.com> > --- > > changes from V2: > - initialize vbr->out_hdr.sector during virtblk_setup_cmd > > changes from V1: > - Kconfig update (from Christoph) > - Re-order cmd setup (from Christoph) > - use flexible sg pointer in the cmd (from Christoph) > - added perf numbers to commit msg (from Feng Li) > > --- > drivers/block/Kconfig | 1 + > drivers/block/virtio_blk.c | 155 +++++++++++++++++++++++-------------- > 2 files changed, 100 insertions(+), 56 deletions(-) Hi Max, I can run benchmark to give everyone more confidence about this change. Should I test this version or are you still planning to make code changes? Stefan
On 9/2/2021 3:21 PM, Stefan Hajnoczi wrote: > On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote: >> No need to pre-allocate a big buffer for the IO SGL anymore. If a device >> has lots of deep queues, preallocation for the sg list can consume >> substantial amounts of memory. For HW virtio-blk device, nr_hw_queues >> can be 64 or 128 and each queue's depth might be 128. This means the >> resulting preallocation for the data SGLs is big. >> >> Switch to runtime allocation for SGL for lists longer than 2 entries. >> This is the approach used by NVMe drivers so it should be reasonable for >> virtio block as well. Runtime SGL allocation has always been the case >> for the legacy I/O path so this is nothing new. >> >> The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't >> support SG_CHAIN, use only runtime allocation for the SGL. >> >> Re-organize the setup of the IO request to fit the new sg chain >> mechanism. >> >> No performance degradation was seen (fio libaio engine with 16 jobs and >> 128 iodepth): >> >> IO size IOPs Rand Read (before/after) IOPs Rand Write (before/after) >> -------- --------------------------------- ---------------------------------- >> 512B 318K/316K 329K/325K >> >> 4KB 323K/321K 353K/349K >> >> 16KB 199K/208K 250K/275K >> >> 128KB 36K/36.1K 39.2K/41.7K >> >> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> >> Reviewed-by: Israel Rukshin <israelr@nvidia.com> >> --- >> >> changes from V2: >> - initialize vbr->out_hdr.sector during virtblk_setup_cmd >> >> changes from V1: >> - Kconfig update (from Christoph) >> - Re-order cmd setup (from Christoph) >> - use flexible sg pointer in the cmd (from Christoph) >> - added perf numbers to commit msg (from Feng Li) >> >> --- >> drivers/block/Kconfig | 1 + >> drivers/block/virtio_blk.c | 155 +++++++++++++++++++++++-------------- >> 2 files changed, 100 insertions(+), 56 deletions(-) > Hi Max, > I can run benchmark to give everyone more confidence about this change. > Should I test this version or are you still planning to make code > changes? Yes you can test v3. Thanks, Max. > > Stefan
On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote: > No need to pre-allocate a big buffer for the IO SGL anymore. If a device > has lots of deep queues, preallocation for the sg list can consume > substantial amounts of memory. For HW virtio-blk device, nr_hw_queues > can be 64 or 128 and each queue's depth might be 128. This means the > resulting preallocation for the data SGLs is big. > > Switch to runtime allocation for SGL for lists longer than 2 entries. > This is the approach used by NVMe drivers so it should be reasonable for > virtio block as well. Runtime SGL allocation has always been the case > for the legacy I/O path so this is nothing new. > > The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't > support SG_CHAIN, use only runtime allocation for the SGL. > > Re-organize the setup of the IO request to fit the new sg chain > mechanism. > > No performance degradation was seen (fio libaio engine with 16 jobs and > 128 iodepth): > > IO size IOPs Rand Read (before/after) IOPs Rand Write (before/after) > -------- --------------------------------- ---------------------------------- > 512B 318K/316K 329K/325K > > 4KB 323K/321K 353K/349K > > 16KB 199K/208K 250K/275K > > 128KB 36K/36.1K 39.2K/41.7K I ran fio randread benchmarks with 4k, 16k, 64k, and 128k at iodepth 1, 8, and 64 on two vCPUs. The results look fine, there is no significant regression. iodepth=1 and iodepth=64 are very consistent. For some reason the iodepth=8 has significant variance but I don't think it's the fault of this patch. Fio results and the Jupyter notebook export are available here (check out benchmark.html to see the graphs): https://gitlab.com/stefanha/virt-playbooks/-/tree/virtio-blk-sgl-allocation-benchmark/notebook Guest: - Fedora 34 - Linux v5.14 - 2 vCPUs (pinned), 4 GB RAM (single host NUMA node) - 1 IOThread (pinned) - virtio-blk aio=native,cache=none,format=raw - QEMU 6.1.0 Host: - RHEL 8.3 - Linux 4.18.0-240.22.1.el8_3.x86_64 - Intel(R) Xeon(R) Silver 4214 CPU @ 2.20GHz - Intel Optane DC P4800X Stefan
On Mon, Sep 6, 2021 at 11:39 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote: > > No need to pre-allocate a big buffer for the IO SGL anymore. If a device > > has lots of deep queues, preallocation for the sg list can consume > > substantial amounts of memory. For HW virtio-blk device, nr_hw_queues > > can be 64 or 128 and each queue's depth might be 128. This means the > > resulting preallocation for the data SGLs is big. > > > > Switch to runtime allocation for SGL for lists longer than 2 entries. > > This is the approach used by NVMe drivers so it should be reasonable for > > virtio block as well. Runtime SGL allocation has always been the case > > for the legacy I/O path so this is nothing new. > > > > The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't > > support SG_CHAIN, use only runtime allocation for the SGL. > > > > Re-organize the setup of the IO request to fit the new sg chain > > mechanism. > > > > No performance degradation was seen (fio libaio engine with 16 jobs and > > 128 iodepth): > > > > IO size IOPs Rand Read (before/after) IOPs Rand Write (before/after) > > -------- --------------------------------- ---------------------------------- > > 512B 318K/316K 329K/325K > > > > 4KB 323K/321K 353K/349K > > > > 16KB 199K/208K 250K/275K > > > > 128KB 36K/36.1K 39.2K/41.7K > > I ran fio randread benchmarks with 4k, 16k, 64k, and 128k at iodepth 1, > 8, and 64 on two vCPUs. The results look fine, there is no significant > regression. > > iodepth=1 and iodepth=64 are very consistent. For some reason the > iodepth=8 has significant variance but I don't think it's the fault of > this patch. > > Fio results and the Jupyter notebook export are available here (check > out benchmark.html to see the graphs): > > https://gitlab.com/stefanha/virt-playbooks/-/tree/virtio-blk-sgl-allocation-benchmark/notebook > > Guest: > - Fedora 34 > - Linux v5.14 > - 2 vCPUs (pinned), 4 GB RAM (single host NUMA node) > - 1 IOThread (pinned) > - virtio-blk aio=native,cache=none,format=raw > - QEMU 6.1.0 > > Host: > - RHEL 8.3 > - Linux 4.18.0-240.22.1.el8_3.x86_64 > - Intel(R) Xeon(R) Silver 4214 CPU @ 2.20GHz > - Intel Optane DC P4800X > > Stefan Reviewed-by: Feng Li <lifeng1519@gmail.com>
On 9/6/2021 6:09 PM, Stefan Hajnoczi wrote: > On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote: >> No need to pre-allocate a big buffer for the IO SGL anymore. If a device >> has lots of deep queues, preallocation for the sg list can consume >> substantial amounts of memory. For HW virtio-blk device, nr_hw_queues >> can be 64 or 128 and each queue's depth might be 128. This means the >> resulting preallocation for the data SGLs is big. >> >> Switch to runtime allocation for SGL for lists longer than 2 entries. >> This is the approach used by NVMe drivers so it should be reasonable for >> virtio block as well. Runtime SGL allocation has always been the case >> for the legacy I/O path so this is nothing new. >> >> The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't >> support SG_CHAIN, use only runtime allocation for the SGL. >> >> Re-organize the setup of the IO request to fit the new sg chain >> mechanism. >> >> No performance degradation was seen (fio libaio engine with 16 jobs and >> 128 iodepth): >> >> IO size IOPs Rand Read (before/after) IOPs Rand Write (before/after) >> -------- --------------------------------- ---------------------------------- >> 512B 318K/316K 329K/325K >> >> 4KB 323K/321K 353K/349K >> >> 16KB 199K/208K 250K/275K >> >> 128KB 36K/36.1K 39.2K/41.7K > I ran fio randread benchmarks with 4k, 16k, 64k, and 128k at iodepth 1, > 8, and 64 on two vCPUs. The results look fine, there is no significant > regression. > > iodepth=1 and iodepth=64 are very consistent. For some reason the > iodepth=8 has significant variance but I don't think it's the fault of > this patch. > > Fio results and the Jupyter notebook export are available here (check > out benchmark.html to see the graphs): > > https://gitlab.com/stefanha/virt-playbooks/-/tree/virtio-blk-sgl-allocation-benchmark/notebook > > Guest: > - Fedora 34 > - Linux v5.14 > - 2 vCPUs (pinned), 4 GB RAM (single host NUMA node) > - 1 IOThread (pinned) > - virtio-blk aio=native,cache=none,format=raw > - QEMU 6.1.0 > > Host: > - RHEL 8.3 > - Linux 4.18.0-240.22.1.el8_3.x86_64 > - Intel(R) Xeon(R) Silver 4214 CPU @ 2.20GHz > - Intel Optane DC P4800X > > Stefan Thanks, Stefan. Would you like me to add some of the results in my commit msg ? or Tested-By sign ?
On Mon, Sep 13, 2021 at 05:50:21PM +0300, Max Gurtovoy wrote: > > On 9/6/2021 6:09 PM, Stefan Hajnoczi wrote: > > On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote: > > > No need to pre-allocate a big buffer for the IO SGL anymore. If a device > > > has lots of deep queues, preallocation for the sg list can consume > > > substantial amounts of memory. For HW virtio-blk device, nr_hw_queues > > > can be 64 or 128 and each queue's depth might be 128. This means the > > > resulting preallocation for the data SGLs is big. > > > > > > Switch to runtime allocation for SGL for lists longer than 2 entries. > > > This is the approach used by NVMe drivers so it should be reasonable for > > > virtio block as well. Runtime SGL allocation has always been the case > > > for the legacy I/O path so this is nothing new. > > > > > > The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't > > > support SG_CHAIN, use only runtime allocation for the SGL. > > > > > > Re-organize the setup of the IO request to fit the new sg chain > > > mechanism. > > > > > > No performance degradation was seen (fio libaio engine with 16 jobs and > > > 128 iodepth): > > > > > > IO size IOPs Rand Read (before/after) IOPs Rand Write (before/after) > > > -------- --------------------------------- ---------------------------------- > > > 512B 318K/316K 329K/325K > > > > > > 4KB 323K/321K 353K/349K > > > > > > 16KB 199K/208K 250K/275K > > > > > > 128KB 36K/36.1K 39.2K/41.7K > > I ran fio randread benchmarks with 4k, 16k, 64k, and 128k at iodepth 1, > > 8, and 64 on two vCPUs. The results look fine, there is no significant > > regression. > > > > iodepth=1 and iodepth=64 are very consistent. For some reason the > > iodepth=8 has significant variance but I don't think it's the fault of > > this patch. > > > > Fio results and the Jupyter notebook export are available here (check > > out benchmark.html to see the graphs): > > > > https://gitlab.com/stefanha/virt-playbooks/-/tree/virtio-blk-sgl-allocation-benchmark/notebook > > > > Guest: > > - Fedora 34 > > - Linux v5.14 > > - 2 vCPUs (pinned), 4 GB RAM (single host NUMA node) > > - 1 IOThread (pinned) > > - virtio-blk aio=native,cache=none,format=raw > > - QEMU 6.1.0 > > > > Host: > > - RHEL 8.3 > > - Linux 4.18.0-240.22.1.el8_3.x86_64 > > - Intel(R) Xeon(R) Silver 4214 CPU @ 2.20GHz > > - Intel Optane DC P4800X > > > > Stefan > > Thanks, Stefan. > > Would you like me to add some of the results in my commit msg ? or Tested-By > sign ? Thanks, there's no need to change the commit description. Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> Tested-by: Stefan Hajnoczi <stefanha@redhat.com>
Hi MST/Jens, Do we need more review here or are we ok with the code and the test matrix ? If we're ok, we need to decide if this goes through virtio PR or block PR. Cheers, -Max. On 9/14/2021 3:22 PM, Stefan Hajnoczi wrote: > On Mon, Sep 13, 2021 at 05:50:21PM +0300, Max Gurtovoy wrote: >> On 9/6/2021 6:09 PM, Stefan Hajnoczi wrote: >>> On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote: >>>> No need to pre-allocate a big buffer for the IO SGL anymore. If a device >>>> has lots of deep queues, preallocation for the sg list can consume >>>> substantial amounts of memory. For HW virtio-blk device, nr_hw_queues >>>> can be 64 or 128 and each queue's depth might be 128. This means the >>>> resulting preallocation for the data SGLs is big. >>>> >>>> Switch to runtime allocation for SGL for lists longer than 2 entries. >>>> This is the approach used by NVMe drivers so it should be reasonable for >>>> virtio block as well. Runtime SGL allocation has always been the case >>>> for the legacy I/O path so this is nothing new. >>>> >>>> The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't >>>> support SG_CHAIN, use only runtime allocation for the SGL. >>>> >>>> Re-organize the setup of the IO request to fit the new sg chain >>>> mechanism. >>>> >>>> No performance degradation was seen (fio libaio engine with 16 jobs and >>>> 128 iodepth): >>>> >>>> IO size IOPs Rand Read (before/after) IOPs Rand Write (before/after) >>>> -------- --------------------------------- ---------------------------------- >>>> 512B 318K/316K 329K/325K >>>> >>>> 4KB 323K/321K 353K/349K >>>> >>>> 16KB 199K/208K 250K/275K >>>> >>>> 128KB 36K/36.1K 39.2K/41.7K >>> I ran fio randread benchmarks with 4k, 16k, 64k, and 128k at iodepth 1, >>> 8, and 64 on two vCPUs. The results look fine, there is no significant >>> regression. >>> >>> iodepth=1 and iodepth=64 are very consistent. For some reason the >>> iodepth=8 has significant variance but I don't think it's the fault of >>> this patch. >>> >>> Fio results and the Jupyter notebook export are available here (check >>> out benchmark.html to see the graphs): >>> >>> https://gitlab.com/stefanha/virt-playbooks/-/tree/virtio-blk-sgl-allocation-benchmark/notebook >>> >>> Guest: >>> - Fedora 34 >>> - Linux v5.14 >>> - 2 vCPUs (pinned), 4 GB RAM (single host NUMA node) >>> - 1 IOThread (pinned) >>> - virtio-blk aio=native,cache=none,format=raw >>> - QEMU 6.1.0 >>> >>> Host: >>> - RHEL 8.3 >>> - Linux 4.18.0-240.22.1.el8_3.x86_64 >>> - Intel(R) Xeon(R) Silver 4214 CPU @ 2.20GHz >>> - Intel Optane DC P4800X >>> >>> Stefan >> Thanks, Stefan. >> >> Would you like me to add some of the results in my commit msg ? or Tested-By >> sign ? > Thanks, there's no need to change the commit description. > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> > Tested-by: Stefan Hajnoczi <stefanha@redhat.com>
OK by me. Acked-by: Michael S. Tsirkin <mst@redhat.com> I will queue it for the next kernel. Thanks! On Thu, Sep 23, 2021 at 04:40:56PM +0300, Max Gurtovoy wrote: > Hi MST/Jens, > > Do we need more review here or are we ok with the code and the test matrix ? > > If we're ok, we need to decide if this goes through virtio PR or block PR. > > Cheers, > > -Max. > > On 9/14/2021 3:22 PM, Stefan Hajnoczi wrote: > > On Mon, Sep 13, 2021 at 05:50:21PM +0300, Max Gurtovoy wrote: > > > On 9/6/2021 6:09 PM, Stefan Hajnoczi wrote: > > > > On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote: > > > > > No need to pre-allocate a big buffer for the IO SGL anymore. If a device > > > > > has lots of deep queues, preallocation for the sg list can consume > > > > > substantial amounts of memory. For HW virtio-blk device, nr_hw_queues > > > > > can be 64 or 128 and each queue's depth might be 128. This means the > > > > > resulting preallocation for the data SGLs is big. > > > > > > > > > > Switch to runtime allocation for SGL for lists longer than 2 entries. > > > > > This is the approach used by NVMe drivers so it should be reasonable for > > > > > virtio block as well. Runtime SGL allocation has always been the case > > > > > for the legacy I/O path so this is nothing new. > > > > > > > > > > The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't > > > > > support SG_CHAIN, use only runtime allocation for the SGL. > > > > > > > > > > Re-organize the setup of the IO request to fit the new sg chain > > > > > mechanism. > > > > > > > > > > No performance degradation was seen (fio libaio engine with 16 jobs and > > > > > 128 iodepth): > > > > > > > > > > IO size IOPs Rand Read (before/after) IOPs Rand Write (before/after) > > > > > -------- --------------------------------- ---------------------------------- > > > > > 512B 318K/316K 329K/325K > > > > > > > > > > 4KB 323K/321K 353K/349K > > > > > > > > > > 16KB 199K/208K 250K/275K > > > > > > > > > > 128KB 36K/36.1K 39.2K/41.7K > > > > I ran fio randread benchmarks with 4k, 16k, 64k, and 128k at iodepth 1, > > > > 8, and 64 on two vCPUs. The results look fine, there is no significant > > > > regression. > > > > > > > > iodepth=1 and iodepth=64 are very consistent. For some reason the > > > > iodepth=8 has significant variance but I don't think it's the fault of > > > > this patch. > > > > > > > > Fio results and the Jupyter notebook export are available here (check > > > > out benchmark.html to see the graphs): > > > > > > > > https://gitlab.com/stefanha/virt-playbooks/-/tree/virtio-blk-sgl-allocation-benchmark/notebook > > > > > > > > Guest: > > > > - Fedora 34 > > > > - Linux v5.14 > > > > - 2 vCPUs (pinned), 4 GB RAM (single host NUMA node) > > > > - 1 IOThread (pinned) > > > > - virtio-blk aio=native,cache=none,format=raw > > > > - QEMU 6.1.0 > > > > > > > > Host: > > > > - RHEL 8.3 > > > > - Linux 4.18.0-240.22.1.el8_3.x86_64 > > > > - Intel(R) Xeon(R) Silver 4214 CPU @ 2.20GHz > > > > - Intel Optane DC P4800X > > > > > > > > Stefan > > > Thanks, Stefan. > > > > > > Would you like me to add some of the results in my commit msg ? or Tested-By > > > sign ? > > Thanks, there's no need to change the commit description. > > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> > > Tested-by: Stefan Hajnoczi <stefanha@redhat.com>
Looks good,
Reviewed-by: Christoph Hellwig <hch@lst.de>
My tree is ok. Looks like your patch was developed on top of some other tree, not plan upstream linux, so git am fails. I applied it using patch and some manual tweaking, and it seems to work for me but please do test it in linux-next and confirm - will push to a linux-next branch in my tree soon. On Thu, Sep 23, 2021 at 04:40:56PM +0300, Max Gurtovoy wrote: > Hi MST/Jens, > > Do we need more review here or are we ok with the code and the test matrix ? > > If we're ok, we need to decide if this goes through virtio PR or block PR. > > Cheers, > > -Max. > > On 9/14/2021 3:22 PM, Stefan Hajnoczi wrote: > > On Mon, Sep 13, 2021 at 05:50:21PM +0300, Max Gurtovoy wrote: > > > On 9/6/2021 6:09 PM, Stefan Hajnoczi wrote: > > > > On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote: > > > > > No need to pre-allocate a big buffer for the IO SGL anymore. If a device > > > > > has lots of deep queues, preallocation for the sg list can consume > > > > > substantial amounts of memory. For HW virtio-blk device, nr_hw_queues > > > > > can be 64 or 128 and each queue's depth might be 128. This means the > > > > > resulting preallocation for the data SGLs is big. > > > > > > > > > > Switch to runtime allocation for SGL for lists longer than 2 entries. > > > > > This is the approach used by NVMe drivers so it should be reasonable for > > > > > virtio block as well. Runtime SGL allocation has always been the case > > > > > for the legacy I/O path so this is nothing new. > > > > > > > > > > The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't > > > > > support SG_CHAIN, use only runtime allocation for the SGL. > > > > > > > > > > Re-organize the setup of the IO request to fit the new sg chain > > > > > mechanism. > > > > > > > > > > No performance degradation was seen (fio libaio engine with 16 jobs and > > > > > 128 iodepth): > > > > > > > > > > IO size IOPs Rand Read (before/after) IOPs Rand Write (before/after) > > > > > -------- --------------------------------- ---------------------------------- > > > > > 512B 318K/316K 329K/325K > > > > > > > > > > 4KB 323K/321K 353K/349K > > > > > > > > > > 16KB 199K/208K 250K/275K > > > > > > > > > > 128KB 36K/36.1K 39.2K/41.7K > > > > I ran fio randread benchmarks with 4k, 16k, 64k, and 128k at iodepth 1, > > > > 8, and 64 on two vCPUs. The results look fine, there is no significant > > > > regression. > > > > > > > > iodepth=1 and iodepth=64 are very consistent. For some reason the > > > > iodepth=8 has significant variance but I don't think it's the fault of > > > > this patch. > > > > > > > > Fio results and the Jupyter notebook export are available here (check > > > > out benchmark.html to see the graphs): > > > > > > > > https://gitlab.com/stefanha/virt-playbooks/-/tree/virtio-blk-sgl-allocation-benchmark/notebook > > > > > > > > Guest: > > > > - Fedora 34 > > > > - Linux v5.14 > > > > - 2 vCPUs (pinned), 4 GB RAM (single host NUMA node) > > > > - 1 IOThread (pinned) > > > > - virtio-blk aio=native,cache=none,format=raw > > > > - QEMU 6.1.0 > > > > > > > > Host: > > > > - RHEL 8.3 > > > > - Linux 4.18.0-240.22.1.el8_3.x86_64 > > > > - Intel(R) Xeon(R) Silver 4214 CPU @ 2.20GHz > > > > - Intel Optane DC P4800X > > > > > > > > Stefan > > > Thanks, Stefan. > > > > > > Would you like me to add some of the results in my commit msg ? or Tested-By > > > sign ? > > Thanks, there's no need to change the commit description. > > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> > > Tested-by: Stefan Hajnoczi <stefanha@redhat.com>
On 10/22/2021 12:15 PM, Michael S. Tsirkin wrote: > My tree is ok. > Looks like your patch was developed on top of some other tree, > not plan upstream linux, so git am fails. I applied it using > patch and some manual tweaking, and it seems to work for > me but please do test it in linux-next and confirm - > will push to a linux-next branch in my tree soon. I've tested. looks fine. > > On Thu, Sep 23, 2021 at 04:40:56PM +0300, Max Gurtovoy wrote: >> Hi MST/Jens, >> >> Do we need more review here or are we ok with the code and the test matrix ? >> >> If we're ok, we need to decide if this goes through virtio PR or block PR. >> >> Cheers, >> >> -Max. >> >> On 9/14/2021 3:22 PM, Stefan Hajnoczi wrote: >>> On Mon, Sep 13, 2021 at 05:50:21PM +0300, Max Gurtovoy wrote: >>>> On 9/6/2021 6:09 PM, Stefan Hajnoczi wrote: >>>>> On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote: >>>>>> No need to pre-allocate a big buffer for the IO SGL anymore. If a device >>>>>> has lots of deep queues, preallocation for the sg list can consume >>>>>> substantial amounts of memory. For HW virtio-blk device, nr_hw_queues >>>>>> can be 64 or 128 and each queue's depth might be 128. This means the >>>>>> resulting preallocation for the data SGLs is big. >>>>>> >>>>>> Switch to runtime allocation for SGL for lists longer than 2 entries. >>>>>> This is the approach used by NVMe drivers so it should be reasonable for >>>>>> virtio block as well. Runtime SGL allocation has always been the case >>>>>> for the legacy I/O path so this is nothing new. >>>>>> >>>>>> The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't >>>>>> support SG_CHAIN, use only runtime allocation for the SGL. >>>>>> >>>>>> Re-organize the setup of the IO request to fit the new sg chain >>>>>> mechanism. >>>>>> >>>>>> No performance degradation was seen (fio libaio engine with 16 jobs and >>>>>> 128 iodepth): >>>>>> >>>>>> IO size IOPs Rand Read (before/after) IOPs Rand Write (before/after) >>>>>> -------- --------------------------------- ---------------------------------- >>>>>> 512B 318K/316K 329K/325K >>>>>> >>>>>> 4KB 323K/321K 353K/349K >>>>>> >>>>>> 16KB 199K/208K 250K/275K >>>>>> >>>>>> 128KB 36K/36.1K 39.2K/41.7K >>>>> I ran fio randread benchmarks with 4k, 16k, 64k, and 128k at iodepth 1, >>>>> 8, and 64 on two vCPUs. The results look fine, there is no significant >>>>> regression. >>>>> >>>>> iodepth=1 and iodepth=64 are very consistent. For some reason the >>>>> iodepth=8 has significant variance but I don't think it's the fault of >>>>> this patch. >>>>> >>>>> Fio results and the Jupyter notebook export are available here (check >>>>> out benchmark.html to see the graphs): >>>>> >>>>> https://gitlab.com/stefanha/virt-playbooks/-/tree/virtio-blk-sgl-allocation-benchmark/notebook >>>>> >>>>> Guest: >>>>> - Fedora 34 >>>>> - Linux v5.14 >>>>> - 2 vCPUs (pinned), 4 GB RAM (single host NUMA node) >>>>> - 1 IOThread (pinned) >>>>> - virtio-blk aio=native,cache=none,format=raw >>>>> - QEMU 6.1.0 >>>>> >>>>> Host: >>>>> - RHEL 8.3 >>>>> - Linux 4.18.0-240.22.1.el8_3.x86_64 >>>>> - Intel(R) Xeon(R) Silver 4214 CPU @ 2.20GHz >>>>> - Intel Optane DC P4800X >>>>> >>>>> Stefan >>>> Thanks, Stefan. >>>> >>>> Would you like me to add some of the results in my commit msg ? or Tested-By >>>> sign ? >>> Thanks, there's no need to change the commit description. >>> >>> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> >>> Tested-by: Stefan Hajnoczi <stefanha@redhat.com>
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 63056cfd4b62..ca25a122b8ee 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -395,6 +395,7 @@ config XEN_BLKDEV_BACKEND config VIRTIO_BLK tristate "Virtio block driver" depends on VIRTIO + select SG_POOL help This is the virtual block driver for virtio. It can be used with QEMU based VMMs (like KVM or Xen). Say Y or M. diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 9332fc4e9b31..bdd6d415bd20 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -24,6 +24,12 @@ /* The maximum number of sg elements that fit into a virtqueue */ #define VIRTIO_BLK_MAX_SG_ELEMS 32768 +#ifdef CONFIG_ARCH_NO_SG_CHAIN +#define VIRTIO_BLK_INLINE_SG_CNT 0 +#else +#define VIRTIO_BLK_INLINE_SG_CNT 2 +#endif + static int virtblk_queue_count_set(const char *val, const struct kernel_param *kp) { @@ -93,6 +99,7 @@ struct virtio_blk { struct virtblk_req { struct virtio_blk_outhdr out_hdr; u8 status; + struct sg_table sg_table; struct scatterlist sg[]; }; @@ -178,15 +185,94 @@ static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap) return 0; } -static inline void virtblk_request_done(struct request *req) +static void virtblk_unmap_data(struct request *req, struct virtblk_req *vbr) { - struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + if (blk_rq_nr_phys_segments(req)) + sg_free_table_chained(&vbr->sg_table, + VIRTIO_BLK_INLINE_SG_CNT); +} + +static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req, + struct virtblk_req *vbr) +{ + int err; + + if (!blk_rq_nr_phys_segments(req)) + return 0; + + vbr->sg_table.sgl = vbr->sg; + err = sg_alloc_table_chained(&vbr->sg_table, + blk_rq_nr_phys_segments(req), + vbr->sg_table.sgl, + VIRTIO_BLK_INLINE_SG_CNT); + if (unlikely(err)) + return -ENOMEM; + return blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl); +} + +static void virtblk_cleanup_cmd(struct request *req) +{ if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { kfree(page_address(req->special_vec.bv_page) + req->special_vec.bv_offset); } +} + +static int virtblk_setup_cmd(struct virtio_device *vdev, struct request *req, + struct virtblk_req *vbr) +{ + bool unmap = false; + u32 type; + + vbr->out_hdr.sector = 0; + + switch (req_op(req)) { + case REQ_OP_READ: + type = VIRTIO_BLK_T_IN; + vbr->out_hdr.sector = cpu_to_virtio64(vdev, + blk_rq_pos(req)); + break; + case REQ_OP_WRITE: + type = VIRTIO_BLK_T_OUT; + vbr->out_hdr.sector = cpu_to_virtio64(vdev, + blk_rq_pos(req)); + break; + case REQ_OP_FLUSH: + type = VIRTIO_BLK_T_FLUSH; + break; + case REQ_OP_DISCARD: + type = VIRTIO_BLK_T_DISCARD; + break; + case REQ_OP_WRITE_ZEROES: + type = VIRTIO_BLK_T_WRITE_ZEROES; + unmap = !(req->cmd_flags & REQ_NOUNMAP); + break; + case REQ_OP_DRV_IN: + type = VIRTIO_BLK_T_GET_ID; + break; + default: + WARN_ON_ONCE(1); + return BLK_STS_IOERR; + } + vbr->out_hdr.type = cpu_to_virtio32(vdev, type); + vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req)); + + if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) { + if (virtblk_setup_discard_write_zeroes(req, unmap)) + return BLK_STS_RESOURCE; + } + + return 0; +} + +static inline void virtblk_request_done(struct request *req) +{ + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + + virtblk_unmap_data(req, vbr); + virtblk_cleanup_cmd(req); blk_mq_end_request(req, virtblk_result(vbr)); } @@ -244,57 +330,23 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, int qid = hctx->queue_num; int err; bool 
notify = false; - bool unmap = false; - u32 type; BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); - switch (req_op(req)) { - case REQ_OP_READ: - case REQ_OP_WRITE: - type = 0; - break; - case REQ_OP_FLUSH: - type = VIRTIO_BLK_T_FLUSH; - break; - case REQ_OP_DISCARD: - type = VIRTIO_BLK_T_DISCARD; - break; - case REQ_OP_WRITE_ZEROES: - type = VIRTIO_BLK_T_WRITE_ZEROES; - unmap = !(req->cmd_flags & REQ_NOUNMAP); - break; - case REQ_OP_DRV_IN: - type = VIRTIO_BLK_T_GET_ID; - break; - default: - WARN_ON_ONCE(1); - return BLK_STS_IOERR; - } - - vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type); - vbr->out_hdr.sector = type ? - 0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req)); - vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req)); + err = virtblk_setup_cmd(vblk->vdev, req, vbr); + if (unlikely(err)) + return err; blk_mq_start_request(req); - if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) { - err = virtblk_setup_discard_write_zeroes(req, unmap); - if (err) - return BLK_STS_RESOURCE; - } - - num = blk_rq_map_sg(hctx->queue, req, vbr->sg); - if (num) { - if (rq_data_dir(req) == WRITE) - vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT); - else - vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN); + num = virtblk_map_data(hctx, req, vbr); + if (unlikely(num < 0)) { + virtblk_cleanup_cmd(req); + return BLK_STS_RESOURCE; } spin_lock_irqsave(&vblk->vqs[qid].lock, flags); - err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num); + err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg_table.sgl, num); if (err) { virtqueue_kick(vblk->vqs[qid].vq); /* Don't stop the queue if -ENOMEM: we may have failed to @@ -303,6 +355,8 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, if (err == -ENOSPC) blk_mq_stop_hw_queue(hctx); spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); + virtblk_unmap_data(req, vbr); + virtblk_cleanup_cmd(req); switch (err) { case -ENOSPC: return BLK_STS_DEV_RESOURCE; @@ -681,16 +735,6 @@ static const struct attribute_group *virtblk_attr_groups[] = { NULL, }; -static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq, - unsigned int hctx_idx, unsigned int numa_node) -{ - struct virtio_blk *vblk = set->driver_data; - struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq); - - sg_init_table(vbr->sg, vblk->sg_elems); - return 0; -} - static int virtblk_map_queues(struct blk_mq_tag_set *set) { struct virtio_blk *vblk = set->driver_data; @@ -703,7 +747,6 @@ static const struct blk_mq_ops virtio_mq_ops = { .queue_rq = virtio_queue_rq, .commit_rqs = virtio_commit_rqs, .complete = virtblk_request_done, - .init_request = virtblk_init_request, .map_queues = virtblk_map_queues, }; @@ -783,7 +826,7 @@ static int virtblk_probe(struct virtio_device *vdev) vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; vblk->tag_set.cmd_size = sizeof(struct virtblk_req) + - sizeof(struct scatterlist) * sg_elems; + sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT; vblk->tag_set.driver_data = vblk; vblk->tag_set.nr_hw_queues = vblk->num_vqs;