| Message ID | 20170505090044.28754-1-hch@lst.de (mailing list archive) |
|---|---|
| State | New, archived |
On 05/05/2017 11:00, Christoph Hellwig wrote:
> Signed-off-by: Keith Busch <keith.busch@intel.com>
> [hch: ported over from qemu-nvme.git to mainline]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  hw/block/nvme.c | 26 ++++++++++++++++++++++++++
>  hw/block/nvme.h | 1 +
>  2 files changed, 27 insertions(+)
>
> [...]
>
> +static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
> +    NvmeRequest *req)
> +{
> +    NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
> +    const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
> +    const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
> +    uint64_t slba = le64_to_cpu(rw->slba);
> +    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
> +    uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
> +    uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS);
> +
> +    if (slba + nlb > ns->id_ns.nsze) {
> +        return NVME_LBA_RANGE | NVME_DNR;
> +    }
> +
> +    req->has_sg = false;
> +    block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
> +                     BLOCK_ACCT_WRITE);
> +    req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, aio_slba, aio_nlb, 0,
> +                                       nvme_rw_cb, req);

Hi Christoph,

could you pass BDRV_REQ_MAY_UNMAP for the flags here if the deallocate
bit (dword 12 bit 25) is set?

Thanks,

Paolo

> +    return NVME_NO_COMPLETE;
> +}
>
> [...]
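For readers following the thread, a rough sketch of what Paolo's suggestion could look like on top of the patch is shown below. It is only an illustration, not a submitted change: reading the deallocate (DEAC) bit straight out of cmd->cdw12 with a hand-written (1 << 25) mask is an assumption about the wiring, not an existing QEMU macro.

```c
/*
 * Illustrative sketch only, not the submitted patch: a variant of
 * nvme_write_zeros() that forwards BDRV_REQ_MAY_UNMAP to the block layer
 * when the deallocate (DEAC) bit -- DWORD 12 bit 25 -- is set.
 */
static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
                                 NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
    const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
    const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
    uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS);
    int flags = 0;

    if (slba + nlb > ns->id_ns.nsze) {
        return NVME_LBA_RANGE | NVME_DNR;
    }

    /* DWORD 12 bit 25: the guest asked for the range to be deallocated,
     * not merely zeroed, so let the backend discard the blocks. */
    if (le32_to_cpu(cmd->cdw12) & (1 << 25)) {
        flags |= BDRV_REQ_MAY_UNMAP;
    }

    req->has_sg = false;
    block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
                     BLOCK_ACCT_WRITE);
    req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, aio_slba, aio_nlb, flags,
                                       nvme_rw_cb, req);
    return NVME_NO_COMPLETE;
}
```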
On Fri, May 05, 2017 at 11:30:11AM +0200, Paolo Bonzini wrote:
> could you pass BDRV_REQ_MAY_UNMAP for the flags here if the deallocate
> bit (dword 12 bit 25) is set?

In fact we should do that unconditionally.  The deallocate bit is new
in 1.3 (which we don't claim to support) and forces deallocating, but
NVMe already allows for this behavior without the flag (we in fact
explicitly clarified this in an ECN for 1.2.1).
On 05/05/2017 11:51, Christoph Hellwig wrote:
>> could you pass BDRV_REQ_MAY_UNMAP for the flags here if the deallocate
>> bit (dword 12 bit 25) is set?
>
> In fact we should do that unconditionally.  The deallocate bit is new
> in 1.3 (which we don't claim to support) and forces deallocating, but
> NVMe already allows for this behavior without the flag (we in fact
> explicitly clarified this in an ECN for 1.2.1).

While that's allowed and indeed makes sense on SSDs, for QEMU's typical
usage it can lead to fragmentation and worse performance.  On
extent-based file systems, write zeroes without deallocate can be
implemented very efficiently with FALLOC_FL_ZERO_RANGE, and there's no
reason not to do it.

Is there anything backwards-incompatible in 1.3 that would make it hard
to just bump the supported version?

Paolo
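To illustrate the distinction Paolo is drawing, here is a small host-side sketch (hypothetical helper names, not QEMU code) of the two fallocate() modes a file-backed image could use: FALLOC_FL_ZERO_RANGE zeroes the range while keeping it allocated, whereas punching a hole deallocates it, which is what BDRV_REQ_MAY_UNMAP would permit the block layer to do.

```c
/*
 * Host-side illustration only (hypothetical helpers, not QEMU code):
 * two ways a file backend can satisfy a write-zeroes request.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

/* Zero the byte range but keep the extents allocated: efficient on
 * extent-based file systems and avoids fragmenting the image file. */
static int zero_range_keep_allocated(int fd, off_t offset, off_t len)
{
    return fallocate(fd, FALLOC_FL_ZERO_RANGE, offset, len);
}

/* Deallocate the range: it still reads back as zeroes, but the blocks
 * are released -- the behavior BDRV_REQ_MAY_UNMAP allows. */
static int zero_range_deallocate(int fd, off_t offset, off_t len)
{
    return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                     offset, len);
}
```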
On Fri, May 05, 2017 at 12:03:40PM +0200, Paolo Bonzini wrote:
> While that's allowed and indeed makes sense on SSDs, for QEMU's typical
> usage it can lead to fragmentation and worse performance.  On
> extent-based file systems, write zeroes without deallocate can be
> implemented very efficiently with FALLOC_FL_ZERO_RANGE, and there's no
> reason not to do it.

Ok..

> Is there anything backwards-incompatible in 1.3 that would make it hard
> to just bump the supported version?

1.2.1 requires a valid NQN in Identify Controller, and 1.3 requires the
new Identify CNS 03h for controller identification.  Otherwise it's
mostly fine tuning corner conditions for which I'd have to audit the
code.
```diff
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index ae303d44e5..3f4d2bf2ba 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -227,6 +227,29 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     return NVME_NO_COMPLETE;
 }
 
+static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
+    NvmeRequest *req)
+{
+    NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
+    const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+    const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
+    uint64_t slba = le64_to_cpu(rw->slba);
+    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
+    uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
+    uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS);
+
+    if (slba + nlb > ns->id_ns.nsze) {
+        return NVME_LBA_RANGE | NVME_DNR;
+    }
+
+    req->has_sg = false;
+    block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
+                     BLOCK_ACCT_WRITE);
+    req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, aio_slba, aio_nlb, 0,
+                                       nvme_rw_cb, req);
+    return NVME_NO_COMPLETE;
+}
+
 static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     NvmeRequest *req)
 {
@@ -279,6 +302,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     switch (cmd->opcode) {
     case NVME_CMD_FLUSH:
         return nvme_flush(n, ns, cmd, req);
+    case NVME_CMD_WRITE_ZEROS:
+        return nvme_write_zeros(n, ns, cmd, req);
     case NVME_CMD_WRITE:
     case NVME_CMD_READ:
         return nvme_rw(n, ns, cmd, req);
@@ -895,6 +920,7 @@ static int nvme_init(PCIDevice *pci_dev)
     id->sqes = (0x6 << 4) | 0x6;
     id->cqes = (0x4 << 4) | 0x4;
     id->nn = cpu_to_le32(n->num_namespaces);
+    id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS);
     id->psd[0].mp = cpu_to_le16(0x9c4);
     id->psd[0].enlat = cpu_to_le32(0x10);
     id->psd[0].exlat = cpu_to_le32(0x4);
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 8fb0c10756..a0d15649f9 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -179,6 +179,7 @@ enum NvmeIoCommands {
     NVME_CMD_READ = 0x02,
     NVME_CMD_WRITE_UNCOR = 0x04,
     NVME_CMD_COMPARE = 0x05,
+    NVME_CMD_WRITE_ZEROS = 0x08,
     NVME_CMD_DSM = 0x09,
 };
 
```