diff mbox series

[v2,08/18] hw/block/nvme: Make Zoned NS Command Set definitions

Message ID 20200617213415.22417-9-dmitry.fomichev@wdc.com (mailing list archive)
State New, archived
Headers show
Series hw/block/nvme: Support Namespace Types and Zoned Namespace Command Set | expand

Commit Message

Dmitry Fomichev June 17, 2020, 9:34 p.m. UTC
Define values and structures that are needed to support Zoned
Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator.

All new protocol definitions are located in include/block/nvme.h
and everything added that is specific to this implementation is kept
in hw/block/nvme.h.

In order to improve scalability, all open, closed and full zones
are organized in separate linked lists. Consequently, almost all
zone operations don't require scanning of the entire zone array
(which potentially can be quite large) - it is only necessary to
enumerate one or more zone lists. Zone lists are designed to be
position-independent as they can be persisted to the backing file
as a part of zone metadata. NvmeZoneList struct defined in this patch
serves as a head of every zone list.

The NvmeZone structure encapsulates NvmeZoneDescr, the Zone Descriptor
defined in the Zoned Namespace Command Set specification, and adds a few
more fields that are internal to this implementation.

Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
Signed-off-by: Matias Bjorling <matias.bjorling@wdc.com>
Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Signed-off-by: Alexey Bogoslavsky <alexey.bogoslavsky@wdc.com>
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
---
 hw/block/nvme.h      | 130 +++++++++++++++++++++++++++++++++++++++++++
 include/block/nvme.h | 119 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 248 insertions(+), 1 deletion(-)

Comments

Klaus Jensen June 30, 2020, 11:44 a.m. UTC | #1
On Jun 18 06:34, Dmitry Fomichev wrote:
> Define values and structures that are needed to support Zoned
> Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator.
> 
> All new protocol definitions are located in include/block/nvme.h
> and everything added that is specific to this implementation is kept
> in hw/block/nvme.h.
> 
> In order to improve scalability, all open, closed and full zones
> are organized in separate linked lists. Consequently, almost all
> zone operations don't require scanning of the entire zone array
> (which potentially can be quite large) - it is only necessary to
> enumerate one or more zone lists. Zone lists are designed to be
> position-independent as they can be persisted to the backing file
> as a part of zone metadata. NvmeZoneList struct defined in this patch
> serves as a head of every zone list.
> 
> NvmeZone structure encapsulates NvmeZoneDescriptor defined in Zoned
> Command Set specification and adds a few more fields that are
> internal to this implementation.
> 
> Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
> Signed-off-by: Matias Bjorling <matias.bjorling@wdc.com>
> Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> Signed-off-by: Alexey Bogoslavsky <alexey.bogoslavsky@wdc.com>
> Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> ---
>  hw/block/nvme.h      | 130 +++++++++++++++++++++++++++++++++++++++++++
>  include/block/nvme.h | 119 ++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 248 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index 0d29f75475..2c932b5e29 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -3,12 +3,22 @@
>  
>  #include "block/nvme.h"
>  
> +#define NVME_DEFAULT_ZONE_SIZE   128 /* MiB */
> +#define NVME_DEFAULT_MAX_ZA_SIZE 128 /* KiB */
> +
>  typedef struct NvmeParams {
>      char     *serial;
>      uint32_t num_queues; /* deprecated since 5.1 */
>      uint32_t max_ioqpairs;
>      uint16_t msix_qsize;
>      uint32_t cmb_size_mb;
> +
> +    bool        zoned;
> +    bool        cross_zone_read;
> +    uint8_t     fill_pattern;
> +    uint32_t    zamds_bs;

Rename to zasl.

> +    uint64_t    zone_size;
> +    uint64_t    zone_capacity;
>  } NvmeParams;
>  
>  typedef struct NvmeAsyncEvent {
> @@ -17,6 +27,8 @@ typedef struct NvmeAsyncEvent {
>  
>  enum NvmeRequestFlags {
>      NVME_REQ_FLG_HAS_SG   = 1 << 0,
> +    NVME_REQ_FLG_FILL     = 1 << 1,
> +    NVME_REQ_FLG_APPEND   = 1 << 2,
>  };
>  
>  typedef struct NvmeRequest {
> @@ -24,6 +36,7 @@ typedef struct NvmeRequest {
>      BlockAIOCB              *aiocb;
>      uint16_t                status;
>      uint16_t                flags;
> +    uint64_t                fill_ofs;
>      NvmeCqe                 cqe;
>      BlockAcctCookie         acct;
>      QEMUSGList              qsg;
> @@ -61,11 +74,35 @@ typedef struct NvmeCQueue {
>      QTAILQ_HEAD(, NvmeRequest) req_list;
>  } NvmeCQueue;
>  
> +typedef struct NvmeZone {
> +    NvmeZoneDescr   d;
> +    uint64_t        tstamp;
> +    uint32_t        next;
> +    uint32_t        prev;
> +    uint8_t         rsvd80[8];
> +} NvmeZone;
> +
> +#define NVME_ZONE_LIST_NIL    UINT_MAX
> +
> +typedef struct NvmeZoneList {
> +    uint32_t        head;
> +    uint32_t        tail;
> +    uint32_t        size;
> +    uint8_t         rsvd12[4];
> +} NvmeZoneList;
> +
>  typedef struct NvmeNamespace {
>      NvmeIdNs        id_ns;
>      uint32_t        nsid;
>      uint8_t         csi;
>      QemuUUID        uuid;
> +
> +    NvmeIdNsZoned   *id_ns_zoned;
> +    NvmeZone        *zone_array;
> +    NvmeZoneList    *exp_open_zones;
> +    NvmeZoneList    *imp_open_zones;
> +    NvmeZoneList    *closed_zones;
> +    NvmeZoneList    *full_zones;
>  } NvmeNamespace;
>  
>  static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns)
> @@ -100,6 +137,7 @@ typedef struct NvmeCtrl {
>      uint32_t    num_namespaces;
>      uint32_t    max_q_ents;
>      uint64_t    ns_size;
> +
>      uint8_t     *cmbuf;
>      uint32_t    irq_status;
>      uint64_t    host_timestamp;                 /* Timestamp sent by the host */
> @@ -107,6 +145,12 @@ typedef struct NvmeCtrl {
>  
>      HostMemoryBackend *pmrdev;
>  
> +    int             zone_file_fd;
> +    uint32_t        num_zones;
> +    uint64_t        zone_size_bs;
> +    uint64_t        zone_array_size;
> +    uint8_t         zamds;

Rename to zasl.

> +
>      NvmeNamespace   *namespaces;
>      NvmeSQueue      **sq;
>      NvmeCQueue      **cq;
> @@ -121,6 +165,86 @@ static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, NvmeNamespace *ns)
>      return n->ns_size >> nvme_ns_lbads(ns);
>  }
>  
> +static inline uint8_t nvme_get_zone_state(NvmeZone *zone)
> +{
> +    return zone->d.zs >> 4;
> +}
> +
> +static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState state)
> +{
> +    zone->d.zs = state << 4;
> +}
> +
> +static inline uint64_t nvme_zone_rd_boundary(NvmeCtrl *n, NvmeZone *zone)
> +{
> +    return zone->d.zslba + n->params.zone_size;
> +}
> +
> +static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
> +{
> +    return zone->d.zslba + zone->d.zcap;
> +}

Everything working on zone->d needs leXX_to_cpu() conversions.

> +
> +static inline bool nvme_wp_is_valid(NvmeZone *zone)
> +{
> +    uint8_t st = nvme_get_zone_state(zone);
> +
> +    return st != NVME_ZONE_STATE_FULL &&
> +           st != NVME_ZONE_STATE_READ_ONLY &&
> +           st != NVME_ZONE_STATE_OFFLINE;
> +}
> +
> +/*
> + * Initialize a zone list head.
> + */
> +static inline void nvme_init_zone_list(NvmeZoneList *zl)
> +{
> +    zl->head = NVME_ZONE_LIST_NIL;
> +    zl->tail = NVME_ZONE_LIST_NIL;
> +    zl->size = 0;
> +}
> +
> +/*
> + * Initialize the number of entries contained in a zone list.
> + */
> +static inline uint32_t nvme_zone_list_size(NvmeZoneList *zl)
> +{
> +    return zl->size;
> +}
> +
> +/*
> + * Check if the zone is not currently included into any zone list.
> + */
> +static inline bool nvme_zone_not_in_list(NvmeZone *zone)
> +{
> +    return (bool)(zone->prev == 0 && zone->next == 0);
> +}
> +
> +/*
> + * Return the zone at the head of zone list or NULL if the list is empty.
> + */
> +static inline NvmeZone *nvme_peek_zone_head(NvmeNamespace *ns, NvmeZoneList *zl)
> +{
> +    if (zl->head == NVME_ZONE_LIST_NIL) {
> +        return NULL;
> +    }
> +    return &ns->zone_array[zl->head];
> +}
> +
> +/*
> + * Return the next zone in the list.
> + */
> +static inline NvmeZone *nvme_next_zone_in_list(NvmeNamespace *ns, NvmeZone *z,
> +    NvmeZoneList *zl)
> +{
> +    assert(!nvme_zone_not_in_list(z));
> +
> +    if (z->next == NVME_ZONE_LIST_NIL) {
> +        return NULL;
> +    }
> +    return &ns->zone_array[z->next];
> +}
> +
>  static inline int nvme_ilog2(uint64_t i)
>  {
>      int log = -1;
> @@ -132,4 +256,10 @@ static inline int nvme_ilog2(uint64_t i)
>      return log;
>  }
>  
> +static inline void _hw_nvme_check_size(void)
> +{
> +    QEMU_BUILD_BUG_ON(sizeof(NvmeZoneList) != 16);
> +    QEMU_BUILD_BUG_ON(sizeof(NvmeZone) != 88);
> +}
> +
>  #endif /* HW_NVME_H */
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 5a1e5e137c..596c39162b 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -446,6 +446,9 @@ enum NvmeIoCommands {
>      NVME_CMD_COMPARE            = 0x05,
>      NVME_CMD_WRITE_ZEROS        = 0x08,
>      NVME_CMD_DSM                = 0x09,
> +    NVME_CMD_ZONE_MGMT_SEND     = 0x79,
> +    NVME_CMD_ZONE_MGMT_RECV     = 0x7a,
> +    NVME_CMD_ZONE_APND          = 0x7d,
>  };
>  
>  typedef struct NvmeDeleteQ {
> @@ -539,6 +542,7 @@ enum NvmeNidLength {
>  
>  enum NvmeCsi {
>      NVME_CSI_NVM                = 0x00,
> +    NVME_CSI_ZONED              = 0x02,
>  };
>  
>  #define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi)))
> @@ -661,6 +665,7 @@ enum NvmeStatusCodes {
>      NVME_INVALID_NSID           = 0x000b,
>      NVME_CMD_SEQ_ERROR          = 0x000c,
>      NVME_CMD_SET_CMB_REJECTED   = 0x002b,
> +    NVME_INVALID_CMD_SET        = 0x002c,
>      NVME_LBA_RANGE              = 0x0080,
>      NVME_CAP_EXCEEDED           = 0x0081,
>      NVME_NS_NOT_READY           = 0x0082,
> @@ -684,6 +689,14 @@ enum NvmeStatusCodes {
>      NVME_CONFLICTING_ATTRS      = 0x0180,
>      NVME_INVALID_PROT_INFO      = 0x0181,
>      NVME_WRITE_TO_RO            = 0x0182,
> +    NVME_ZONE_BOUNDARY_ERROR    = 0x01b8,
> +    NVME_ZONE_FULL              = 0x01b9,
> +    NVME_ZONE_READ_ONLY         = 0x01ba,
> +    NVME_ZONE_OFFLINE           = 0x01bb,
> +    NVME_ZONE_INVALID_WRITE     = 0x01bc,
> +    NVME_ZONE_TOO_MANY_ACTIVE   = 0x01bd,
> +    NVME_ZONE_TOO_MANY_OPEN     = 0x01be,
> +    NVME_ZONE_INVAL_TRANSITION  = 0x01bf,
>      NVME_WRITE_FAULT            = 0x0280,
>      NVME_UNRECOVERED_READ       = 0x0281,
>      NVME_E2E_GUARD_ERROR        = 0x0282,
> @@ -807,7 +820,17 @@ typedef struct NvmeIdCtrl {
>      uint8_t     ieee[3];
>      uint8_t     cmic;
>      uint8_t     mdts;
> -    uint8_t     rsvd255[178];
> +    uint16_t    cntlid;
> +    uint32_t    ver;
> +    uint32_t    rtd3r;
> +    uint32_t    rtd3e;
> +    uint32_t    oaes;
> +    uint32_t    ctratt;
> +    uint8_t     rsvd100[28];
> +    uint16_t    crdt1;
> +    uint16_t    crdt2;
> +    uint16_t    crdt3;
> +    uint8_t     rsvd134[122];

Would be nice in a separate patch, see my "bump to ..." patches.

>      uint16_t    oacs;
>      uint8_t     acl;
>      uint8_t     aerl;
> @@ -832,6 +855,11 @@ typedef struct NvmeIdCtrl {
>      uint8_t     vs[1024];
>  } NvmeIdCtrl;
>  
> +typedef struct NvmeIdCtrlZoned {
> +    uint8_t     zamds;

zasl.

> +    uint8_t     rsvd1[4095];
> +} NvmeIdCtrlZoned;
> +
>  enum NvmeIdCtrlOacs {
>      NVME_OACS_SECURITY  = 1 << 0,
>      NVME_OACS_FORMAT    = 1 << 1,
> @@ -908,6 +936,12 @@ typedef struct NvmeLBAF {
>      uint8_t     rp;
>  } NvmeLBAF;
>  
> +typedef struct NvmeLBAFE {
> +    uint64_t    zsze;
> +    uint8_t     zdes;
> +    uint8_t     rsvd9[7];
> +} NvmeLBAFE;
> +
>  typedef struct NvmeIdNs {
>      uint64_t    nsze;
>      uint64_t    ncap;
> @@ -930,6 +964,19 @@ typedef struct NvmeIdNs {
>      uint8_t     vs[3712];
>  } NvmeIdNs;
>  
> +typedef struct NvmeIdNsZoned {
> +    uint16_t    zoc;
> +    uint16_t    ozcs;
> +    uint32_t    mar;
> +    uint32_t    mor;
> +    uint32_t    rrl;
> +    uint32_t    frl;
> +    uint8_t     rsvd20[2796];
> +    NvmeLBAFE   lbafe[16];
> +    uint8_t     rsvd3072[768];
> +    uint8_t     vs[256];
> +} NvmeIdNsZoned;
> +
>  
>  /*Deallocate Logical Block Features*/
>  #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)       ((dlfeat) & 0x10)
> @@ -962,6 +1009,71 @@ enum NvmeIdNsDps {
>      DPS_FIRST_EIGHT = 8,
>  };
>  
> +enum NvmeZoneAttr {
> +    NVME_ZA_FINISHED_BY_CTLR         = 1 << 0,
> +    NVME_ZA_FINISH_RECOMMENDED       = 1 << 1,
> +    NVME_ZA_RESET_RECOMMENDED        = 1 << 2,
> +    NVME_ZA_ZD_EXT_VALID             = 1 << 7,
> +};
> +
> +typedef struct NvmeZoneReportHeader {
> +    uint64_t    nr_zones;
> +    uint8_t     rsvd[56];
> +} NvmeZoneReportHeader;
> +
> +enum NvmeZoneReceiveAction {
> +    NVME_ZONE_REPORT                 = 0,
> +    NVME_ZONE_REPORT_EXTENDED        = 1,
> +};
> +
> +enum NvmeZoneReportType {
> +    NVME_ZONE_REPORT_ALL             = 0,
> +    NVME_ZONE_REPORT_EMPTY           = 1,
> +    NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2,
> +    NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3,
> +    NVME_ZONE_REPORT_CLOSED          = 4,
> +    NVME_ZONE_REPORT_FULL            = 5,
> +    NVME_ZONE_REPORT_READ_ONLY       = 6,
> +    NVME_ZONE_REPORT_OFFLINE         = 7,
> +};
> +
> +typedef struct NvmeZoneDescr {
> +    uint8_t     zt;
> +    uint8_t     zs;
> +    uint8_t     za;
> +    uint8_t     rsvd3[5];
> +    uint64_t    zcap;
> +    uint64_t    zslba;
> +    uint64_t    wp;
> +    uint8_t     rsvd32[32];
> +} NvmeZoneDescr;
> +
> +enum NvmeZoneState {
> +    NVME_ZONE_STATE_RESERVED         = 0x00,
> +    NVME_ZONE_STATE_EMPTY            = 0x01,
> +    NVME_ZONE_STATE_IMPLICITLY_OPEN  = 0x02,
> +    NVME_ZONE_STATE_EXPLICITLY_OPEN  = 0x03,
> +    NVME_ZONE_STATE_CLOSED           = 0x04,
> +    NVME_ZONE_STATE_READ_ONLY        = 0x0D,
> +    NVME_ZONE_STATE_FULL             = 0x0E,
> +    NVME_ZONE_STATE_OFFLINE          = 0x0F,
> +};
> +
> +enum NvmeZoneType {
> +    NVME_ZONE_TYPE_RESERVED          = 0x00,
> +    NVME_ZONE_TYPE_SEQ_WRITE         = 0x02,
> +};
> +
> +enum NvmeZoneSendAction {
> +    NVME_ZONE_ACTION_RSD             = 0x00,
> +    NVME_ZONE_ACTION_CLOSE           = 0x01,
> +    NVME_ZONE_ACTION_FINISH          = 0x02,
> +    NVME_ZONE_ACTION_OPEN            = 0x03,
> +    NVME_ZONE_ACTION_RESET           = 0x04,
> +    NVME_ZONE_ACTION_OFFLINE         = 0x05,
> +    NVME_ZONE_ACTION_SET_ZD_EXT      = 0x10,
> +};
> +
>  static inline void _nvme_check_size(void)
>  {
>      QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16);
> @@ -978,8 +1090,13 @@ static inline void _nvme_check_size(void)
>      QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
>      QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
>      QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
> +    QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlZoned) != 4096);
>      QEMU_BUILD_BUG_ON(sizeof(NvmeNsIdDesc) != 4);
> +    QEMU_BUILD_BUG_ON(sizeof(NvmeLBAF) != 4);
> +    QEMU_BUILD_BUG_ON(sizeof(NvmeLBAFE) != 16);
>      QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
> +    QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsZoned) != 4096);
>      QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096);
> +    QEMU_BUILD_BUG_ON(sizeof(NvmeZoneDescr) != 64);
>  }
>  #endif
> -- 
> 2.21.0
> 
>
Klaus Jensen June 30, 2020, 12:08 p.m. UTC | #2
On Jun 30 13:44, Klaus Jensen wrote:
> On Jun 18 06:34, Dmitry Fomichev wrote:
> > Define values and structures that are needed to support Zoned
> > Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator.
> > 
> > All new protocol definitions are located in include/block/nvme.h
> > and everything added that is specific to this implementation is kept
> > in hw/block/nvme.h.
> > 
> > In order to improve scalability, all open, closed and full zones
> > are organized in separate linked lists. Consequently, almost all
> > zone operations don't require scanning of the entire zone array
> > (which potentially can be quite large) - it is only necessary to
> > enumerate one or more zone lists. Zone lists are designed to be
> > position-independent as they can be persisted to the backing file
> > as a part of zone metadata. NvmeZoneList struct defined in this patch
> > serves as a head of every zone list.
> > 
> > NvmeZone structure encapsulates NvmeZoneDescriptor defined in Zoned
> > Command Set specification and adds a few more fields that are
> > internal to this implementation.
> > 
> > Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
> > Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> > Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
> > Signed-off-by: Matias Bjorling <matias.bjorling@wdc.com>
> > Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> > Signed-off-by: Alexey Bogoslavsky <alexey.bogoslavsky@wdc.com>
> > Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> > ---
> >  hw/block/nvme.h      | 130 +++++++++++++++++++++++++++++++++++++++++++
> >  include/block/nvme.h | 119 ++++++++++++++++++++++++++++++++++++++-
> >  2 files changed, 248 insertions(+), 1 deletion(-)
> > 
> > diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> > index 0d29f75475..2c932b5e29 100644
> > --- a/hw/block/nvme.h
> > +++ b/hw/block/nvme.h
> > @@ -121,6 +165,86 @@ static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, NvmeNamespace *ns)
> >      return n->ns_size >> nvme_ns_lbads(ns);
> >  }
> >  
> > +static inline uint8_t nvme_get_zone_state(NvmeZone *zone)
> > +{
> > +    return zone->d.zs >> 4;
> > +}
> > +
> > +static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState state)
> > +{
> > +    zone->d.zs = state << 4;
> > +}
> > +
> > +static inline uint64_t nvme_zone_rd_boundary(NvmeCtrl *n, NvmeZone *zone)
> > +{
> > +    return zone->d.zslba + n->params.zone_size;
> > +}
> > +
> > +static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
> > +{
> > +    return zone->d.zslba + zone->d.zcap;
> > +}
> 
> Everything working on zone->d needs leXX_to_cpu() conversions.

Disregard this. I see from the following patches that you keep zone->d
in CPU endianness and convert on Zone Management Receive.

Sorry!
Alistair Francis June 30, 2020, 10:11 p.m. UTC | #3
On Wed, Jun 17, 2020 at 2:51 PM Dmitry Fomichev <dmitry.fomichev@wdc.com> wrote:
>
> Define values and structures that are needed to support Zoned
> Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator.
>
> All new protocol definitions are located in include/block/nvme.h
> and everything added that is specific to this implementation is kept
> in hw/block/nvme.h.
>
> In order to improve scalability, all open, closed and full zones
> are organized in separate linked lists. Consequently, almost all
> zone operations don't require scanning of the entire zone array
> (which potentially can be quite large) - it is only necessary to
> enumerate one or more zone lists. Zone lists are designed to be
> position-independent as they can be persisted to the backing file
> as a part of zone metadata. NvmeZoneList struct defined in this patch
> serves as a head of every zone list.
>
> NvmeZone structure encapsulates NvmeZoneDescriptor defined in Zoned
> Command Set specification and adds a few more fields that are
> internal to this implementation.
>
> Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
> Signed-off-by: Matias Bjorling <matias.bjorling@wdc.com>
> Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> Signed-off-by: Alexey Bogoslavsky <alexey.bogoslavsky@wdc.com>
> Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> ---
>  hw/block/nvme.h      | 130 +++++++++++++++++++++++++++++++++++++++++++
>  include/block/nvme.h | 119 ++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 248 insertions(+), 1 deletion(-)
>
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index 0d29f75475..2c932b5e29 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -3,12 +3,22 @@
>
>  #include "block/nvme.h"
>
> +#define NVME_DEFAULT_ZONE_SIZE   128 /* MiB */
> +#define NVME_DEFAULT_MAX_ZA_SIZE 128 /* KiB */
> +
>  typedef struct NvmeParams {
>      char     *serial;
>      uint32_t num_queues; /* deprecated since 5.1 */
>      uint32_t max_ioqpairs;
>      uint16_t msix_qsize;
>      uint32_t cmb_size_mb;
> +
> +    bool        zoned;
> +    bool        cross_zone_read;
> +    uint8_t     fill_pattern;
> +    uint32_t    zamds_bs;
> +    uint64_t    zone_size;
> +    uint64_t    zone_capacity;
>  } NvmeParams;
>
>  typedef struct NvmeAsyncEvent {
> @@ -17,6 +27,8 @@ typedef struct NvmeAsyncEvent {
>
>  enum NvmeRequestFlags {
>      NVME_REQ_FLG_HAS_SG   = 1 << 0,
> +    NVME_REQ_FLG_FILL     = 1 << 1,
> +    NVME_REQ_FLG_APPEND   = 1 << 2,
>  };
>
>  typedef struct NvmeRequest {
> @@ -24,6 +36,7 @@ typedef struct NvmeRequest {
>      BlockAIOCB              *aiocb;
>      uint16_t                status;
>      uint16_t                flags;
> +    uint64_t                fill_ofs;
>      NvmeCqe                 cqe;
>      BlockAcctCookie         acct;
>      QEMUSGList              qsg;
> @@ -61,11 +74,35 @@ typedef struct NvmeCQueue {
>      QTAILQ_HEAD(, NvmeRequest) req_list;
>  } NvmeCQueue;
>
> +typedef struct NvmeZone {
> +    NvmeZoneDescr   d;
> +    uint64_t        tstamp;
> +    uint32_t        next;
> +    uint32_t        prev;
> +    uint8_t         rsvd80[8];
> +} NvmeZone;
> +
> +#define NVME_ZONE_LIST_NIL    UINT_MAX
> +
> +typedef struct NvmeZoneList {
> +    uint32_t        head;
> +    uint32_t        tail;
> +    uint32_t        size;
> +    uint8_t         rsvd12[4];
> +} NvmeZoneList;
> +
>  typedef struct NvmeNamespace {
>      NvmeIdNs        id_ns;
>      uint32_t        nsid;
>      uint8_t         csi;
>      QemuUUID        uuid;
> +
> +    NvmeIdNsZoned   *id_ns_zoned;
> +    NvmeZone        *zone_array;
> +    NvmeZoneList    *exp_open_zones;
> +    NvmeZoneList    *imp_open_zones;
> +    NvmeZoneList    *closed_zones;
> +    NvmeZoneList    *full_zones;
>  } NvmeNamespace;
>
>  static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns)
> @@ -100,6 +137,7 @@ typedef struct NvmeCtrl {
>      uint32_t    num_namespaces;
>      uint32_t    max_q_ents;
>      uint64_t    ns_size;
> +
>      uint8_t     *cmbuf;
>      uint32_t    irq_status;
>      uint64_t    host_timestamp;                 /* Timestamp sent by the host */
> @@ -107,6 +145,12 @@ typedef struct NvmeCtrl {
>
>      HostMemoryBackend *pmrdev;
>
> +    int             zone_file_fd;
> +    uint32_t        num_zones;
> +    uint64_t        zone_size_bs;
> +    uint64_t        zone_array_size;
> +    uint8_t         zamds;
> +
>      NvmeNamespace   *namespaces;
>      NvmeSQueue      **sq;
>      NvmeCQueue      **cq;
> @@ -121,6 +165,86 @@ static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, NvmeNamespace *ns)
>      return n->ns_size >> nvme_ns_lbads(ns);
>  }
>
> +static inline uint8_t nvme_get_zone_state(NvmeZone *zone)
> +{
> +    return zone->d.zs >> 4;
> +}
> +
> +static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState state)
> +{
> +    zone->d.zs = state << 4;
> +}
> +
> +static inline uint64_t nvme_zone_rd_boundary(NvmeCtrl *n, NvmeZone *zone)
> +{
> +    return zone->d.zslba + n->params.zone_size;
> +}
> +
> +static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
> +{
> +    return zone->d.zslba + zone->d.zcap;
> +}
> +
> +static inline bool nvme_wp_is_valid(NvmeZone *zone)
> +{
> +    uint8_t st = nvme_get_zone_state(zone);
> +
> +    return st != NVME_ZONE_STATE_FULL &&
> +           st != NVME_ZONE_STATE_READ_ONLY &&
> +           st != NVME_ZONE_STATE_OFFLINE;
> +}
> +
> +/*
> + * Initialize a zone list head.
> + */
> +static inline void nvme_init_zone_list(NvmeZoneList *zl)
> +{
> +    zl->head = NVME_ZONE_LIST_NIL;
> +    zl->tail = NVME_ZONE_LIST_NIL;
> +    zl->size = 0;
> +}
> +
> +/*
> + * Initialize the number of entries contained in a zone list.
> + */

The comment above should say "Retrieve" (or something similar) instead of "Initialize".

> +static inline uint32_t nvme_zone_list_size(NvmeZoneList *zl)
> +{
> +    return zl->size;
> +}
> +
> +/*
> + * Check if the zone is not currently included into any zone list.
> + */
> +static inline bool nvme_zone_not_in_list(NvmeZone *zone)
> +{
> +    return (bool)(zone->prev == 0 && zone->next == 0);

You don't need the cast to bool.

Besides that it looks good. I didn't check every value against the spec though.

Acked-by: Alistair Francis <alistair.francis@wdc.com>

Alistair

> +}
> +
> +/*
> + * Return the zone at the head of zone list or NULL if the list is empty.
> + */
> +static inline NvmeZone *nvme_peek_zone_head(NvmeNamespace *ns, NvmeZoneList *zl)
> +{
> +    if (zl->head == NVME_ZONE_LIST_NIL) {
> +        return NULL;
> +    }
> +    return &ns->zone_array[zl->head];
> +}
> +
> +/*
> + * Return the next zone in the list.
> + */
> +static inline NvmeZone *nvme_next_zone_in_list(NvmeNamespace *ns, NvmeZone *z,
> +    NvmeZoneList *zl)
> +{
> +    assert(!nvme_zone_not_in_list(z));
> +
> +    if (z->next == NVME_ZONE_LIST_NIL) {
> +        return NULL;
> +    }
> +    return &ns->zone_array[z->next];
> +}
> +
>  static inline int nvme_ilog2(uint64_t i)
>  {
>      int log = -1;
> @@ -132,4 +256,10 @@ static inline int nvme_ilog2(uint64_t i)
>      return log;
>  }
>
> +static inline void _hw_nvme_check_size(void)
> +{
> +    QEMU_BUILD_BUG_ON(sizeof(NvmeZoneList) != 16);
> +    QEMU_BUILD_BUG_ON(sizeof(NvmeZone) != 88);
> +}
> +
>  #endif /* HW_NVME_H */
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 5a1e5e137c..596c39162b 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -446,6 +446,9 @@ enum NvmeIoCommands {
>      NVME_CMD_COMPARE            = 0x05,
>      NVME_CMD_WRITE_ZEROS        = 0x08,
>      NVME_CMD_DSM                = 0x09,
> +    NVME_CMD_ZONE_MGMT_SEND     = 0x79,
> +    NVME_CMD_ZONE_MGMT_RECV     = 0x7a,
> +    NVME_CMD_ZONE_APND          = 0x7d,
>  };
>
>  typedef struct NvmeDeleteQ {
> @@ -539,6 +542,7 @@ enum NvmeNidLength {
>
>  enum NvmeCsi {
>      NVME_CSI_NVM                = 0x00,
> +    NVME_CSI_ZONED              = 0x02,
>  };
>
>  #define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi)))
> @@ -661,6 +665,7 @@ enum NvmeStatusCodes {
>      NVME_INVALID_NSID           = 0x000b,
>      NVME_CMD_SEQ_ERROR          = 0x000c,
>      NVME_CMD_SET_CMB_REJECTED   = 0x002b,
> +    NVME_INVALID_CMD_SET        = 0x002c,
>      NVME_LBA_RANGE              = 0x0080,
>      NVME_CAP_EXCEEDED           = 0x0081,
>      NVME_NS_NOT_READY           = 0x0082,
> @@ -684,6 +689,14 @@ enum NvmeStatusCodes {
>      NVME_CONFLICTING_ATTRS      = 0x0180,
>      NVME_INVALID_PROT_INFO      = 0x0181,
>      NVME_WRITE_TO_RO            = 0x0182,
> +    NVME_ZONE_BOUNDARY_ERROR    = 0x01b8,
> +    NVME_ZONE_FULL              = 0x01b9,
> +    NVME_ZONE_READ_ONLY         = 0x01ba,
> +    NVME_ZONE_OFFLINE           = 0x01bb,
> +    NVME_ZONE_INVALID_WRITE     = 0x01bc,
> +    NVME_ZONE_TOO_MANY_ACTIVE   = 0x01bd,
> +    NVME_ZONE_TOO_MANY_OPEN     = 0x01be,
> +    NVME_ZONE_INVAL_TRANSITION  = 0x01bf,
>      NVME_WRITE_FAULT            = 0x0280,
>      NVME_UNRECOVERED_READ       = 0x0281,
>      NVME_E2E_GUARD_ERROR        = 0x0282,
> @@ -807,7 +820,17 @@ typedef struct NvmeIdCtrl {
>      uint8_t     ieee[3];
>      uint8_t     cmic;
>      uint8_t     mdts;
> -    uint8_t     rsvd255[178];
> +    uint16_t    cntlid;
> +    uint32_t    ver;
> +    uint32_t    rtd3r;
> +    uint32_t    rtd3e;
> +    uint32_t    oaes;
> +    uint32_t    ctratt;
> +    uint8_t     rsvd100[28];
> +    uint16_t    crdt1;
> +    uint16_t    crdt2;
> +    uint16_t    crdt3;
> +    uint8_t     rsvd134[122];
>      uint16_t    oacs;
>      uint8_t     acl;
>      uint8_t     aerl;
> @@ -832,6 +855,11 @@ typedef struct NvmeIdCtrl {
>      uint8_t     vs[1024];
>  } NvmeIdCtrl;
>
> +typedef struct NvmeIdCtrlZoned {
> +    uint8_t     zamds;
> +    uint8_t     rsvd1[4095];
> +} NvmeIdCtrlZoned;
> +
>  enum NvmeIdCtrlOacs {
>      NVME_OACS_SECURITY  = 1 << 0,
>      NVME_OACS_FORMAT    = 1 << 1,
> @@ -908,6 +936,12 @@ typedef struct NvmeLBAF {
>      uint8_t     rp;
>  } NvmeLBAF;
>
> +typedef struct NvmeLBAFE {
> +    uint64_t    zsze;
> +    uint8_t     zdes;
> +    uint8_t     rsvd9[7];
> +} NvmeLBAFE;
> +
>  typedef struct NvmeIdNs {
>      uint64_t    nsze;
>      uint64_t    ncap;
> @@ -930,6 +964,19 @@ typedef struct NvmeIdNs {
>      uint8_t     vs[3712];
>  } NvmeIdNs;
>
> +typedef struct NvmeIdNsZoned {
> +    uint16_t    zoc;
> +    uint16_t    ozcs;
> +    uint32_t    mar;
> +    uint32_t    mor;
> +    uint32_t    rrl;
> +    uint32_t    frl;
> +    uint8_t     rsvd20[2796];
> +    NvmeLBAFE   lbafe[16];
> +    uint8_t     rsvd3072[768];
> +    uint8_t     vs[256];
> +} NvmeIdNsZoned;
> +
>
>  /*Deallocate Logical Block Features*/
>  #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)       ((dlfeat) & 0x10)
> @@ -962,6 +1009,71 @@ enum NvmeIdNsDps {
>      DPS_FIRST_EIGHT = 8,
>  };
>
> +enum NvmeZoneAttr {
> +    NVME_ZA_FINISHED_BY_CTLR         = 1 << 0,
> +    NVME_ZA_FINISH_RECOMMENDED       = 1 << 1,
> +    NVME_ZA_RESET_RECOMMENDED        = 1 << 2,
> +    NVME_ZA_ZD_EXT_VALID             = 1 << 7,
> +};
> +
> +typedef struct NvmeZoneReportHeader {
> +    uint64_t    nr_zones;
> +    uint8_t     rsvd[56];
> +} NvmeZoneReportHeader;
> +
> +enum NvmeZoneReceiveAction {
> +    NVME_ZONE_REPORT                 = 0,
> +    NVME_ZONE_REPORT_EXTENDED        = 1,
> +};
> +
> +enum NvmeZoneReportType {
> +    NVME_ZONE_REPORT_ALL             = 0,
> +    NVME_ZONE_REPORT_EMPTY           = 1,
> +    NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2,
> +    NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3,
> +    NVME_ZONE_REPORT_CLOSED          = 4,
> +    NVME_ZONE_REPORT_FULL            = 5,
> +    NVME_ZONE_REPORT_READ_ONLY       = 6,
> +    NVME_ZONE_REPORT_OFFLINE         = 7,
> +};
> +
> +typedef struct NvmeZoneDescr {
> +    uint8_t     zt;
> +    uint8_t     zs;
> +    uint8_t     za;
> +    uint8_t     rsvd3[5];
> +    uint64_t    zcap;
> +    uint64_t    zslba;
> +    uint64_t    wp;
> +    uint8_t     rsvd32[32];
> +} NvmeZoneDescr;
> +
> +enum NvmeZoneState {
> +    NVME_ZONE_STATE_RESERVED         = 0x00,
> +    NVME_ZONE_STATE_EMPTY            = 0x01,
> +    NVME_ZONE_STATE_IMPLICITLY_OPEN  = 0x02,
> +    NVME_ZONE_STATE_EXPLICITLY_OPEN  = 0x03,
> +    NVME_ZONE_STATE_CLOSED           = 0x04,
> +    NVME_ZONE_STATE_READ_ONLY        = 0x0D,
> +    NVME_ZONE_STATE_FULL             = 0x0E,
> +    NVME_ZONE_STATE_OFFLINE          = 0x0F,
> +};
> +
> +enum NvmeZoneType {
> +    NVME_ZONE_TYPE_RESERVED          = 0x00,
> +    NVME_ZONE_TYPE_SEQ_WRITE         = 0x02,
> +};
> +
> +enum NvmeZoneSendAction {
> +    NVME_ZONE_ACTION_RSD             = 0x00,
> +    NVME_ZONE_ACTION_CLOSE           = 0x01,
> +    NVME_ZONE_ACTION_FINISH          = 0x02,
> +    NVME_ZONE_ACTION_OPEN            = 0x03,
> +    NVME_ZONE_ACTION_RESET           = 0x04,
> +    NVME_ZONE_ACTION_OFFLINE         = 0x05,
> +    NVME_ZONE_ACTION_SET_ZD_EXT      = 0x10,
> +};
> +
>  static inline void _nvme_check_size(void)
>  {
>      QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16);
> @@ -978,8 +1090,13 @@ static inline void _nvme_check_size(void)
>      QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
>      QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
>      QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
> +    QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlZoned) != 4096);
>      QEMU_BUILD_BUG_ON(sizeof(NvmeNsIdDesc) != 4);
> +    QEMU_BUILD_BUG_ON(sizeof(NvmeLBAF) != 4);
> +    QEMU_BUILD_BUG_ON(sizeof(NvmeLBAFE) != 16);
>      QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
> +    QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsZoned) != 4096);
>      QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096);
> +    QEMU_BUILD_BUG_ON(sizeof(NvmeZoneDescr) != 64);
>  }
>  #endif
> --
> 2.21.0
>
>
diff mbox series

Patch

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 0d29f75475..2c932b5e29 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -3,12 +3,22 @@ 
 
 #include "block/nvme.h"
 
+#define NVME_DEFAULT_ZONE_SIZE   128 /* MiB */
+#define NVME_DEFAULT_MAX_ZA_SIZE 128 /* KiB */
+
 typedef struct NvmeParams {
     char     *serial;
     uint32_t num_queues; /* deprecated since 5.1 */
     uint32_t max_ioqpairs;
     uint16_t msix_qsize;
     uint32_t cmb_size_mb;
+    /* Zoned namespace parameters */
+    bool        zoned;
+    bool        cross_zone_read;
+    uint8_t     fill_pattern;
+    uint32_t    zamds_bs;
+    uint64_t    zone_size;     /* added to zslba in nvme_zone_rd_boundary() */
+    uint64_t    zone_capacity;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
@@ -17,6 +27,8 @@  typedef struct NvmeAsyncEvent {
 
 enum NvmeRequestFlags {
     NVME_REQ_FLG_HAS_SG   = 1 << 0,
+    NVME_REQ_FLG_FILL     = 1 << 1,
+    NVME_REQ_FLG_APPEND   = 1 << 2,
 };
 
 typedef struct NvmeRequest {
@@ -24,6 +36,7 @@  typedef struct NvmeRequest {
     BlockAIOCB              *aiocb;
     uint16_t                status;
     uint16_t                flags;
+    uint64_t                fill_ofs;
     NvmeCqe                 cqe;
     BlockAcctCookie         acct;
     QEMUSGList              qsg;
@@ -61,11 +74,35 @@  typedef struct NvmeCQueue {
     QTAILQ_HEAD(, NvmeRequest) req_list;
 } NvmeCQueue;
 
+typedef struct NvmeZone {      /* a zone descriptor plus internal fields */
+    NvmeZoneDescr   d;         /* descriptor in the Zoned Command Set layout */
+    uint64_t        tstamp;
+    uint32_t        next;      /* zone_array index of the next zone, or NIL */
+    uint32_t        prev;      /* previous-zone link - TODO confirm encoding */
+    uint8_t         rsvd80[8]; /* bytes 80..87; struct size is 88 */
+} NvmeZone;
+
+#define NVME_ZONE_LIST_NIL    UINT32_MAX /* "no zone" in uint32_t list links */
+
+typedef struct NvmeZoneList {  /* head of one zone list */
+    uint32_t        head;      /* zone_array index of first zone, or NIL */
+    uint32_t        tail;      /* set to NIL when the list is initialized */
+    uint32_t        size;      /* number of zones in the list */
+    uint8_t         rsvd12[4]; /* bytes 12..15; struct size is 16 */
+} NvmeZoneList;
+
 typedef struct NvmeNamespace {
     NvmeIdNs        id_ns;
     uint32_t        nsid;
     uint8_t         csi;
     QemuUUID        uuid;
+    /* Zoned namespace state */
+    NvmeIdNsZoned   *id_ns_zoned;
+    NvmeZone        *zone_array;     /* indexed by zone list links */
+    NvmeZoneList    *exp_open_zones;
+    NvmeZoneList    *imp_open_zones;
+    NvmeZoneList    *closed_zones;
+    NvmeZoneList    *full_zones;
 } NvmeNamespace;
 
 static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns)
@@ -100,6 +137,7 @@  typedef struct NvmeCtrl {
     uint32_t    num_namespaces;
     uint32_t    max_q_ents;
     uint64_t    ns_size;
+
     uint8_t     *cmbuf;
     uint32_t    irq_status;
     uint64_t    host_timestamp;                 /* Timestamp sent by the host */
@@ -107,6 +145,12 @@  typedef struct NvmeCtrl {
 
     HostMemoryBackend *pmrdev;
 
+    int             zone_file_fd;
+    uint32_t        num_zones;
+    uint64_t        zone_size_bs;
+    uint64_t        zone_array_size;
+    uint8_t         zamds;
+
     NvmeNamespace   *namespaces;
     NvmeSQueue      **sq;
     NvmeCQueue      **cq;
@@ -121,6 +165,86 @@  static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, NvmeNamespace *ns)
     return n->ns_size >> nvme_ns_lbads(ns);
 }
 
+static inline uint8_t nvme_get_zone_state(NvmeZone *zone)
+{
+    return zone->d.zs >> 4; /* state is kept in the upper 4 bits of zs */
+}
+
+static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState state)
+{
+    zone->d.zs = state << 4; /* overwrites zs; its low 4 bits become 0 */
+}
+
+static inline uint64_t nvme_zone_rd_boundary(NvmeCtrl *n, NvmeZone *zone)
+{   /* NOTE(review): assumes params.zone_size is in LBAs - confirm */
+    return zone->d.zslba + n->params.zone_size; /* zone start + zone size */
+}
+
+static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
+{
+    return zone->d.zslba + zone->d.zcap; /* zone start + zone capacity */
+}
+
+static inline bool nvme_wp_is_valid(NvmeZone *zone)
+{
+    uint8_t st = nvme_get_zone_state(zone);
+    /* The WP is treated as invalid only in Full, Read Only and Offline. */
+    return st != NVME_ZONE_STATE_FULL &&
+           st != NVME_ZONE_STATE_READ_ONLY &&
+           st != NVME_ZONE_STATE_OFFLINE;
+}
+
+/*
+ * Initialize a zone list head to the empty state (no head, no tail, size 0).
+ */
+static inline void nvme_init_zone_list(NvmeZoneList *zl)
+{
+    zl->head = NVME_ZONE_LIST_NIL;
+    zl->tail = NVME_ZONE_LIST_NIL;
+    zl->size = 0; /* rsvd12 is not touched here */
+}
+
+/*
+ * Return the number of entries contained in a zone list.
+ */
+static inline uint32_t nvme_zone_list_size(NvmeZoneList *zl)
+{
+    return zl->size;
+}
+
+/*
+ * Check if the zone is unlinked, i.e. both its prev and next links are 0.
+ */
+static inline bool nvme_zone_not_in_list(NvmeZone *zone)
+{
+    return (bool)(zone->prev == 0 && zone->next == 0); /* 0, not LIST_NIL */
+}
+
+/*
+ * Return the zone at the head of zone list or NULL if the list is empty.
+ */
+static inline NvmeZone *nvme_peek_zone_head(NvmeNamespace *ns, NvmeZoneList *zl)
+{
+    if (zl->head == NVME_ZONE_LIST_NIL) {
+        return NULL;
+    }
+    return &ns->zone_array[zl->head]; /* head is a zone_array index */
+}
+
+/*
+ * Return the zone after z in its list, or NULL if z is last; zl is unused.
+ */
+static inline NvmeZone *nvme_next_zone_in_list(NvmeNamespace *ns, NvmeZone *z,
+    NvmeZoneList *zl)
+{
+    assert(!nvme_zone_not_in_list(z)); /* z must currently be linked */
+
+    if (z->next == NVME_ZONE_LIST_NIL) {
+        return NULL;
+    }
+    return &ns->zone_array[z->next]; /* next is a zone_array index */
+}
+
 static inline int nvme_ilog2(uint64_t i)
 {
     int log = -1;
@@ -132,4 +256,10 @@  static inline int nvme_ilog2(uint64_t i)
     return log;
 }
 
+static inline void _hw_nvme_check_size(void)
+{   /* Fail the build if these implementation structs change size. */
+    QEMU_BUILD_BUG_ON(sizeof(NvmeZoneList) != 16);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeZone) != 88);
+}
+
 #endif /* HW_NVME_H */
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 5a1e5e137c..596c39162b 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -446,6 +446,9 @@  enum NvmeIoCommands {
     NVME_CMD_COMPARE            = 0x05,
     NVME_CMD_WRITE_ZEROS        = 0x08,
     NVME_CMD_DSM                = 0x09,
+    NVME_CMD_ZONE_MGMT_SEND     = 0x79,
+    NVME_CMD_ZONE_MGMT_RECV     = 0x7a,
+    NVME_CMD_ZONE_APND          = 0x7d,
 };
 
 typedef struct NvmeDeleteQ {
@@ -539,6 +542,7 @@  enum NvmeNidLength {
 
 enum NvmeCsi {
     NVME_CSI_NVM                = 0x00,
+    NVME_CSI_ZONED              = 0x02,
 };
 
 #define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi)))
@@ -661,6 +665,7 @@  enum NvmeStatusCodes {
     NVME_INVALID_NSID           = 0x000b,
     NVME_CMD_SEQ_ERROR          = 0x000c,
     NVME_CMD_SET_CMB_REJECTED   = 0x002b,
+    NVME_INVALID_CMD_SET        = 0x002c,
     NVME_LBA_RANGE              = 0x0080,
     NVME_CAP_EXCEEDED           = 0x0081,
     NVME_NS_NOT_READY           = 0x0082,
@@ -684,6 +689,14 @@  enum NvmeStatusCodes {
     NVME_CONFLICTING_ATTRS      = 0x0180,
     NVME_INVALID_PROT_INFO      = 0x0181,
     NVME_WRITE_TO_RO            = 0x0182,
+    NVME_ZONE_BOUNDARY_ERROR    = 0x01b8,
+    NVME_ZONE_FULL              = 0x01b9,
+    NVME_ZONE_READ_ONLY         = 0x01ba,
+    NVME_ZONE_OFFLINE           = 0x01bb,
+    NVME_ZONE_INVALID_WRITE     = 0x01bc,
+    NVME_ZONE_TOO_MANY_ACTIVE   = 0x01bd,
+    NVME_ZONE_TOO_MANY_OPEN     = 0x01be,
+    NVME_ZONE_INVAL_TRANSITION  = 0x01bf,
     NVME_WRITE_FAULT            = 0x0280,
     NVME_UNRECOVERED_READ       = 0x0281,
     NVME_E2E_GUARD_ERROR        = 0x0282,
@@ -807,7 +820,17 @@  typedef struct NvmeIdCtrl {
     uint8_t     ieee[3];
     uint8_t     cmic;
     uint8_t     mdts;
-    uint8_t     rsvd255[178];
+    uint16_t    cntlid;
+    uint32_t    ver;
+    uint32_t    rtd3r;
+    uint32_t    rtd3e;
+    uint32_t    oaes;
+    uint32_t    ctratt;
+    uint8_t     rsvd100[28];
+    uint16_t    crdt1;
+    uint16_t    crdt2;
+    uint16_t    crdt3;
+    uint8_t     rsvd134[122];
     uint16_t    oacs;
     uint8_t     acl;
     uint8_t     aerl;
@@ -832,6 +855,11 @@  typedef struct NvmeIdCtrl {
     uint8_t     vs[1024];
 } NvmeIdCtrl;
 
+typedef struct NvmeIdCtrlZoned { /* 4096 bytes in total */
+    uint8_t     zamds;
+    uint8_t     rsvd1[4095];   /* bytes 1..4095 */
+} NvmeIdCtrlZoned;
+
 enum NvmeIdCtrlOacs {
     NVME_OACS_SECURITY  = 1 << 0,
     NVME_OACS_FORMAT    = 1 << 1,
@@ -908,6 +936,12 @@  typedef struct NvmeLBAF {
     uint8_t     rp;
 } NvmeLBAF;
 
+typedef struct NvmeLBAFE {     /* 16 bytes in total */
+    uint64_t    zsze;
+    uint8_t     zdes;
+    uint8_t     rsvd9[7];      /* bytes 9..15 */
+} NvmeLBAFE;
+
 typedef struct NvmeIdNs {
     uint64_t    nsze;
     uint64_t    ncap;
@@ -930,6 +964,19 @@  typedef struct NvmeIdNs {
     uint8_t     vs[3712];
 } NvmeIdNs;
 
+typedef struct NvmeIdNsZoned { /* 4096 bytes in total */
+    uint16_t    zoc;
+    uint16_t    ozcs;
+    uint32_t    mar;
+    uint32_t    mor;
+    uint32_t    rrl;
+    uint32_t    frl;
+    uint8_t     rsvd20[2796];  /* bytes 20..2815 */
+    NvmeLBAFE   lbafe[16];     /* bytes 2816..3071, 16 bytes per entry */
+    uint8_t     rsvd3072[768]; /* bytes 3072..3839 */
+    uint8_t     vs[256];       /* bytes 3840..4095 */
+} NvmeIdNsZoned;
+
 
 /*Deallocate Logical Block Features*/
 #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)       ((dlfeat) & 0x10)
@@ -962,6 +1009,71 @@  enum NvmeIdNsDps {
     DPS_FIRST_EIGHT = 8,
 };
 
+enum NvmeZoneAttr {
+    NVME_ZA_FINISHED_BY_CTLR         = 1 << 0,
+    NVME_ZA_FINISH_RECOMMENDED       = 1 << 1,
+    NVME_ZA_RESET_RECOMMENDED        = 1 << 2,
+    NVME_ZA_ZD_EXT_VALID             = 1 << 7,
+};
+
+typedef struct NvmeZoneReportHeader { /* 8 + 56 = 64 bytes */
+    uint64_t    nr_zones;
+    uint8_t     rsvd[56];      /* bytes 8..63 */
+} NvmeZoneReportHeader;
+
+enum NvmeZoneReceiveAction {
+    NVME_ZONE_REPORT                 = 0,
+    NVME_ZONE_REPORT_EXTENDED        = 1,
+};
+
+enum NvmeZoneReportType {
+    NVME_ZONE_REPORT_ALL             = 0,
+    NVME_ZONE_REPORT_EMPTY           = 1,
+    NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2,
+    NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3,
+    NVME_ZONE_REPORT_CLOSED          = 4,
+    NVME_ZONE_REPORT_FULL            = 5,
+    NVME_ZONE_REPORT_READ_ONLY       = 6,
+    NVME_ZONE_REPORT_OFFLINE         = 7,
+};
+
+typedef struct NvmeZoneDescr { /* 64 bytes in total */
+    uint8_t     zt;
+    uint8_t     zs;            /* zone state is held in bits 7:4 */
+    uint8_t     za;
+    uint8_t     rsvd3[5];      /* bytes 3..7 */
+    uint64_t    zcap;
+    uint64_t    zslba;
+    uint64_t    wp;
+    uint8_t     rsvd32[32];    /* bytes 32..63 */
+} NvmeZoneDescr;
+
+enum NvmeZoneState {           /* all values fit in 4 bits */
+    NVME_ZONE_STATE_RESERVED         = 0x00,
+    NVME_ZONE_STATE_EMPTY            = 0x01,
+    NVME_ZONE_STATE_IMPLICITLY_OPEN  = 0x02,
+    NVME_ZONE_STATE_EXPLICITLY_OPEN  = 0x03,
+    NVME_ZONE_STATE_CLOSED           = 0x04,
+    NVME_ZONE_STATE_READ_ONLY        = 0x0D, /* 0x05..0x0C not defined here */
+    NVME_ZONE_STATE_FULL             = 0x0E,
+    NVME_ZONE_STATE_OFFLINE          = 0x0F,
+};
+
+enum NvmeZoneType {
+    NVME_ZONE_TYPE_RESERVED          = 0x00,
+    NVME_ZONE_TYPE_SEQ_WRITE         = 0x02,
+};
+
+enum NvmeZoneSendAction {
+    NVME_ZONE_ACTION_RSD             = 0x00,
+    NVME_ZONE_ACTION_CLOSE           = 0x01,
+    NVME_ZONE_ACTION_FINISH          = 0x02,
+    NVME_ZONE_ACTION_OPEN            = 0x03,
+    NVME_ZONE_ACTION_RESET           = 0x04,
+    NVME_ZONE_ACTION_OFFLINE         = 0x05,
+    NVME_ZONE_ACTION_SET_ZD_EXT      = 0x10,
+};
+
 static inline void _nvme_check_size(void)
 {
     QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16);
@@ -978,8 +1090,13 @@  static inline void _nvme_check_size(void)
     QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
     QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
     QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlZoned) != 4096);
     QEMU_BUILD_BUG_ON(sizeof(NvmeNsIdDesc) != 4);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeLBAF) != 4);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeLBAFE) != 16);
     QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsZoned) != 4096);
     QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeZoneDescr) != 64);
 }
 #endif