
[v7,3/4] qcow2: add zoned emulation capability

Message ID 20240122184830.40094-4-faithilikerun@gmail.com (mailing list archive)
State New, archived
Series Add full zoned storage emulation to qcow2 driver

Commit Message

Sam Li Jan. 22, 2024, 6:48 p.m. UTC
By adding zone operations and zoned metadata, the zoned emulation
capability enables full emulation of a zoned device using a qcow2
file. The zoned device metadata includes the zone type, zone state
and write pointer of each zone, stored in an array of unsigned
integers.

Each zone of a zoned device makes state transitions following the
zone state machine, which mainly describes five states: IMPLICIT
OPEN, EXPLICIT OPEN, FULL, EMPTY and CLOSED. The READ ONLY and
OFFLINE states are generally triggered by device-internal events.
Operations on zones cause the corresponding state changes.

Zoned devices have a limit on zone resources, which constrains write
operations into zones. This limit is managed by active zone lists
following an LRU policy.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 block/qcow2.c        | 791 ++++++++++++++++++++++++++++++++++++++++++-
 block/trace-events   |   2 +
 include/qemu/queue.h |   1 +
 3 files changed, 792 insertions(+), 2 deletions(-)

Comments

Stefan Hajnoczi March 12, 2024, 6:30 p.m. UTC | #1
On Mon, Jan 22, 2024 at 07:48:29PM +0100, Sam Li wrote:
> By adding zone operations and zoned metadata, the zoned emulation
> capability enables full emulation of a zoned device using a qcow2
> file. The zoned device metadata includes the zone type, zone state
> and write pointer of each zone, stored in an array of unsigned
> integers.
> 
> Each zone of a zoned device makes state transitions following the
> zone state machine, which mainly describes five states: IMPLICIT
> OPEN, EXPLICIT OPEN, FULL, EMPTY and CLOSED. The READ ONLY and
> OFFLINE states are generally triggered by device-internal events.
> Operations on zones cause the corresponding state changes.
> 
> Zoned devices have a limit on zone resources, which constrains write
> operations into zones. This limit is managed by active zone lists
> following an LRU policy.
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/qcow2.c        | 791 ++++++++++++++++++++++++++++++++++++++++++-
>  block/trace-events   |   2 +
>  include/qemu/queue.h |   1 +
>  3 files changed, 792 insertions(+), 2 deletions(-)
> 
> diff --git a/block/qcow2.c b/block/qcow2.c
> index b987f1e751..db28585b82 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -195,6 +195,274 @@ qcow2_extract_crypto_opts(QemuOpts *opts, const char *fmt, Error **errp)
>      return cryptoopts_qdict;
>  }
>  
> +#define QCOW2_ZT_IS_CONV(wp)    (wp & 1ULL << 59)
> +
> +/*
> + * To emulate a real zoned device, closed, empty and full states are
> + * preserved after a power cycle. The open states are in-memory and will
> + * be lost after closing the device. Read-only and offline states are
> + * device-internal events, which are not considered for simplicity.
> + */
> +static inline BlockZoneState qcow2_get_zone_state(BlockDriverState *bs,
> +                                                  uint32_t index)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> +    uint64_t zone_wp = bs->wps->wp[index];
> +    uint64_t zone_start;
> +
> +    if (QCOW2_ZT_IS_CONV(zone_wp)) {
> +        return BLK_ZS_NOT_WP;
> +    }
> +
> +    if (QLIST_IS_INSERTED(zone_entry, exp_open_zone_entry)) {
> +        return BLK_ZS_EOPEN;
> +    }
> +    if (QLIST_IS_INSERTED(zone_entry, imp_open_zone_entry)) {
> +        return BLK_ZS_IOPEN;
> +    }
> +
> +    zone_start = index * bs->bl.zone_size;
> +    if (zone_wp == zone_start) {
> +        return BLK_ZS_EMPTY;
> +    }
> +    if (zone_wp >= zone_start + bs->bl.zone_capacity) {
> +        return BLK_ZS_FULL;
> +    }
> +    if (zone_wp > zone_start) {
> +        if (!QLIST_IS_INSERTED(zone_entry, closed_zone_entry)) {
> +            /*
> +             * The number of closed zones is not always updated in time when
> +             * the device is closed. However, it only matters when doing
> +             * zone report. Refresh the count and list of closed zones to
> +             * provide correct zone states for zone report.
> +             */
> +            QLIST_INSERT_HEAD(&s->closed_zones, zone_entry, closed_zone_entry);
> +            s->nr_zones_closed++;
> +        }
> +        return BLK_ZS_CLOSED;
> +    }
> +    return BLK_ZS_NOT_WP;
> +}
> +
> +static void qcow2_rm_exp_open_zone(BDRVQcow2State *s,
> +                                   uint32_t index)
> +{
> +    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> +
> +    QLIST_REMOVE(zone_entry, exp_open_zone_entry);
> +    s->nr_zones_exp_open--;
> +}
> +
> +static void qcow2_rm_imp_open_zone(BDRVQcow2State *s,
> +                                   int32_t index)
> +{
> +    Qcow2ZoneListEntry *zone_entry;
> +    if (index < 0) {
> +        /* Apply LRU when the index is not specified. */
> +        zone_entry = QLIST_LAST(&s->imp_open_zones, imp_open_zone_entry);
> +    } else {
> +        zone_entry = &s->zone_list_entries[index];
> +    }
> +
> +    QLIST_REMOVE(zone_entry, imp_open_zone_entry);
> +    s->nr_zones_imp_open--;
> +}
> +
> +static void qcow2_rm_open_zone(BDRVQcow2State *s,
> +                               uint32_t index)
> +{
> +    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> +
> +    if (QLIST_IS_INSERTED(zone_entry, exp_open_zone_entry)) {
> +        qcow2_rm_exp_open_zone(s, index);
> +    } else if (QLIST_IS_INSERTED(zone_entry, imp_open_zone_entry)) {
> +        qcow2_rm_imp_open_zone(s, index);
> +    }
> +}
> +
> +static void qcow2_rm_closed_zone(BDRVQcow2State *s,
> +                                 uint32_t index)
> +{
> +    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> +
> +    QLIST_REMOVE(zone_entry, closed_zone_entry);
> +    s->nr_zones_closed--;
> +}
> +
> +static void qcow2_do_imp_open_zone(BDRVQcow2State *s,
> +                                   uint32_t index,
> +                                   BlockZoneState zs)
> +{
> +    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> +
> +    switch (zs) {
> +    case BLK_ZS_EMPTY:
> +        break;
> +    case BLK_ZS_CLOSED:
> +        s->nr_zones_closed--;
> +        break;

Does the zone entry need to be removed from the closed zone list? If
yes, then qcow2_rm_closed_zone() could be called here instead of
decrementing nr_zones_closed ourselves.
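
Something like this (untested):

    case BLK_ZS_CLOSED:
        qcow2_rm_closed_zone(s, index);
        break;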

> +    case BLK_ZS_IOPEN:
> +        /*
> +         * The LRU policy: update the zone that is most recently
> +         * used to the head of the zone list
> +         */
> +        if (zone_entry == QLIST_FIRST(&s->imp_open_zones)) {
> +            return;
> +        }
> +        QLIST_REMOVE(zone_entry, imp_open_zone_entry);
> +        s->nr_zones_imp_open--;
> +        break;
> +    default:
> +        return;
> +    }
> +
> +    QLIST_INSERT_HEAD(&s->imp_open_zones, zone_entry, imp_open_zone_entry);
> +    s->nr_zones_imp_open++;
> +}
> +
> +static void qcow2_do_exp_open_zone(BDRVQcow2State *s,
> +                                   uint32_t index)
> +{
> +    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> +
> +    QLIST_INSERT_HEAD(&s->exp_open_zones, zone_entry, exp_open_zone_entry);
> +    s->nr_zones_exp_open++;
> +}
> +
> +/*
> + * The list of zones is managed using an LRU policy: the last
> + * zone of the list is always the one that was least recently used
> + * for writing and is chosen as the zone to close to be able to
> + * implicitly open another zone.
> + *
> + * We can only close the open zones. The index is not specified
> + * when it is less than 0.
> + */
> +static void qcow2_do_close_zone(BlockDriverState *bs,
> +                                int32_t index,
> +                                BlockZoneState zs)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    Qcow2ZoneListEntry *zone_entry;
> +
> +    if (index < 0 && zs == BLK_ZS_IOPEN) {
> +        /* before removal of the last implicitly open zone */
> +        zone_entry = QLIST_LAST(&s->imp_open_zones, imp_open_zone_entry);
> +        qcow2_rm_imp_open_zone(s, index);
> +        goto close_zone;
> +    }
> +
> +    if (index >= 0 && zs == BLK_ZS_EOPEN) {
> +        zone_entry = &s->zone_list_entries[index];
> +        qcow2_rm_exp_open_zone(s, index);
> +        /*
> +         * The zone state changes when the zone is removed from the list of
> +         * open zones (explicitly open -> empty)
> +         */
> +        qcow2_get_zone_state(bs, index);
> +    }
> +    return;
> +
> +close_zone:
> +    QLIST_INSERT_HEAD(&s->closed_zones, zone_entry, closed_zone_entry);
> +    s->nr_zones_closed++;
> +}
> +
> +/*
> + * Write the new wp value to the dedicated location of the image file.
> + */
> +static int coroutine_fn GRAPH_RDLOCK
> +qcow2_write_wp_at(BlockDriverState *bs, uint64_t *wp,
> +                             int32_t index) {
> +    BDRVQcow2State *s = bs->opaque;
> +    uint64_t wpv = *wp;
> +    int ret;
> +
> +    ret = bdrv_pwrite(bs->file, s->zoned_header.zonedmeta_offset
> +        + sizeof(uint64_t) * index, sizeof(uint64_t), wp, 0);
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +    trace_qcow2_wp_tracking(index, *wp >> BDRV_SECTOR_BITS);
> +    return ret;
> +
> +exit:
> +    *wp = wpv;
> +    error_report("Failed to write metadata with file");
> +    return ret;
> +}
> +
> +static bool qcow2_can_activate_zone(BlockDriverState *bs)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +
> +    /* When the max active zone is zero, there is no limit on active zones */
> +    if (!s->zoned_header.max_active_zones) {
> +        return true;
> +    }
> +
> +    /* Active zones are zones that are open or closed */
> +    return s->nr_zones_exp_open + s->nr_zones_imp_open + s->nr_zones_closed
> +        < s->zoned_header.max_active_zones;
> +}
> +
> +/*
> + * This function manages open zones under active zones limit. It checks
> + * if a zone can transition to open state while maintaining max open and
> + * active zone limits.
> + */
> +static bool qcow2_can_open_zone(BlockDriverState *bs)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +
> +    /* When the max open zone is zero, there is no limit on open zones */
> +    if (!s->zoned_header.max_open_zones) {
> +        return true;
> +    }
> +
> +    /*
> +     * The open zones are zones with the states of explicitly and
> +     * implicitly open.
> +     */
> +    if (s->nr_zones_imp_open + s->nr_zones_exp_open <
> +        s->zoned_header.max_open_zones) {
> +        return true;
> +    }
> +
> +    /*
> +     * Zones are managed one at a time. Thus, the number of implicitly open
> +     * zone can never be over the open zone limit. When the active zone limit
> +     * is not reached, close only one implicitly open zone.
> +     */
> +    if (qcow2_can_activate_zone(bs)) {
> +        qcow2_do_close_zone(bs, -1, BLK_ZS_IOPEN);
> +        trace_qcow2_imp_open_zones(0x23, s->nr_zones_imp_open);
> +        return true;
> +    }
> +    return false;
> +}
> +
> +static inline int coroutine_fn GRAPH_RDLOCK
> +qcow2_refresh_zonedmeta(BlockDriverState *bs)
> +{
> +    int ret;
> +    BDRVQcow2State *s = bs->opaque;
> +    uint64_t wps_size = s->zoned_header.zonedmeta_size;
> +    g_autofree uint64_t *temp;
> +
> +    temp = g_new(uint64_t, wps_size);
> +    ret = bdrv_pread(bs->file, s->zoned_header.zonedmeta_offset,
> +                     wps_size, temp, 0);
> +    if (ret < 0) {
> +        error_report("Can not read metadata");
> +        return ret;
> +    }
> +
> +    memcpy(bs->wps->wp, temp, wps_size);
> +    return 0;
> +}
> +
>  /*
>   * Passing by the zoned device configurations by a zoned_header struct, check
>   * if the zone device options are under constraints. Return false when some
> @@ -521,7 +789,23 @@ qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
>                  be32_to_cpu(zoned_ext.max_active_zones);
>              zoned_ext.max_append_bytes =
>                  be32_to_cpu(zoned_ext.max_append_bytes);
> +            zoned_ext.zonedmeta_offset =
> +                be64_to_cpu(zoned_ext.zonedmeta_offset);
> +            zoned_ext.zonedmeta_size = be64_to_cpu(zoned_ext.zonedmeta_size);
>              s->zoned_header = zoned_ext;
> +            bs->wps = g_malloc(sizeof(BlockZoneWps)
> +                + s->zoned_header.zonedmeta_size);
> +            ret = qcow2_refresh_zonedmeta(bs);
> +            if (ret < 0) {
> +                return ret;
> +            }
> +
> +            s->zone_list_entries = g_new0(Qcow2ZoneListEntry,
> +                                          zoned_ext.nr_zones);
> +            QLIST_INIT(&s->exp_open_zones);
> +            QLIST_INIT(&s->imp_open_zones);
> +            QLIST_INIT(&s->closed_zones);
> +            qemu_co_mutex_init(&bs->wps->colock);
>  
>              /* refuse to open broken images */
>              if (zoned_ext.nr_zones != DIV_ROUND_UP(bs->total_sectors *
> @@ -2789,9 +3073,68 @@ qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
>      uint64_t host_offset;
>      QCowL2Meta *l2meta = NULL;
>      AioTaskPool *aio = NULL;
> +    int64_t start_offset, start_bytes;
> +    BlockZoneState zs;
> +    int64_t end_zone, end_offset;
> +    uint64_t *wp;
> +    int64_t zone_size = bs->bl.zone_size;
> +    int64_t zone_capacity = bs->bl.zone_capacity;
> +    int index;
>  
>      trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);
>  
> +    start_offset = offset;
> +    start_bytes = bytes;
> +    end_offset = start_offset + start_bytes;
> +    if (bs->bl.zoned == BLK_Z_HM) {
> +        index = start_offset / zone_size;
> +        wp = &bs->wps->wp[index];

This function must be thread-safe. The write pointer probably needs to
be protected by bs->wps->colock because we are checking and modifying
it.
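
An untested sketch of what I mean (not the actual patch code):

    qemu_co_mutex_lock(&bs->wps->colock);
    wp = &bs->wps->wp[index];
    if (!QCOW2_ZT_IS_CONV(*wp) && offset != *wp) {
        qemu_co_mutex_unlock(&bs->wps->colock);
        return -EINVAL;
    }
    /* ... zone state checks and list updates, still under colock ... */
    qemu_co_mutex_unlock(&bs->wps->colock);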

> +        if (!QCOW2_ZT_IS_CONV(*wp)) {
> +            /* The write offset must be equal to the zone write pointer */
> +            if (offset != *wp) {
> +                return -EINVAL;
> +            }
> +
> +            /* Only allow writes when there are zone resources left */
> +            zs = qcow2_get_zone_state(bs, index);
> +            if (zs == BLK_ZS_CLOSED || zs == BLK_ZS_EMPTY) {
> +                if (!qcow2_can_open_zone(bs)) {
> +                    return -EINVAL;
> +                }
> +            }
> +
> +            /*
> +             * Align up (start_offset, zone_size), the start offset is not
> +             * necessarily power of two.
> +             */
> +            end_zone = index * zone_size + zone_capacity;
> +            /* Write cannot exceed the zone capacity. */
> +            if (end_offset > end_zone) {
> +                return -EINVAL;
> +            }
> +
> +            /*
> +             * Real drives change states before it can write to the zone. If
> +             * the write fails, the zone state may have changed.
> +             *
> +             * The zone state transitions to implicit open when the original
> +             * state is empty or closed. When the wp reaches the end, the
> +             * open states (explicit open, implicit open) become full.
> +             */
> +            zs = qcow2_get_zone_state(bs, index);
> +            if (!(end_offset & (zone_capacity - 1))) {
> +                /* Being aligned to zone capacity implies full state */
> +                qcow2_rm_open_zone(s, index);
> +                trace_qcow2_imp_open_zones(0x24,
> +                                           s->nr_zones_imp_open);
> +            } else {
> +                qcow2_do_imp_open_zone(s, index, zs);
> +                trace_qcow2_imp_open_zones(0x24,
> +                                           s->nr_zones_imp_open);
> +            }
> +        }
> +    }
> +
>      while (bytes != 0 && aio_task_pool_status(aio) == 0) {
>  
>          l2meta = NULL;
> @@ -2837,6 +3180,19 @@ qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
>          qiov_offset += cur_bytes;
>          trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
>      }
> +
> +    if (bs->bl.zoned == BLK_Z_HM) {
> +        index = start_offset / zone_size;
> +        wp = &bs->wps->wp[index];
> +        if (!QCOW2_ZT_IS_CONV(*wp)) {
> +            /* Advance the write pointer when the write completes */

Updating the write pointer after I/O does not prevent other write
requests from beginning at the same offset as this request. Multiple
write request coroutines can run concurrently and only the first one
should succeed. The others should fail if they are using the same
offset.

The comment above says "Real drives change states before it can write to
the zone" and I think it's appropriate to update the write pointer
before performing the write too. The qcow2 zone emulation code is
different from the file-posix.c passthrough code. We are responsible for
maintaining zoned metadata state and cannot wait for the result of the
I/O to tell us what happened.
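
In other words, claim the range under the lock before submitting the
I/O. A rough sketch (untested):

    qemu_co_mutex_lock(&bs->wps->colock);
    if (offset != *wp) {
        qemu_co_mutex_unlock(&bs->wps->colock);
        return -EINVAL;
    }
    *wp = end_offset;    /* reserve [offset, end_offset) for this request */
    qemu_co_mutex_unlock(&bs->wps->colock);

Then a concurrent request arriving with the same offset fails the wp
check instead of racing with this one.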

> +            *wp = end_offset;
> +            ret = qcow2_write_wp_at(bs, wp, index);
> +            if (ret < 0) {
> +                goto fail_nometa;
> +            }
> +        }
> +    }
>      ret = 0;
>  
>      qemu_co_mutex_lock(&s->lock);
> @@ -2895,6 +3251,25 @@ static int GRAPH_RDLOCK qcow2_inactivate(BlockDriverState *bs)
>      return result;
>  }
>  
> +static void qcow2_do_close_all_zone(BDRVQcow2State *s)
> +{
> +    Qcow2ZoneListEntry *zone_entry, *next;
> +
> +    QLIST_FOREACH_SAFE(zone_entry, &s->imp_open_zones, imp_open_zone_entry,
> +                       next) {
> +        QLIST_REMOVE(zone_entry, imp_open_zone_entry);
> +        s->nr_zones_imp_open--;
> +    }
> +
> +    QLIST_FOREACH_SAFE(zone_entry, &s->exp_open_zones, exp_open_zone_entry,
> +                       next) {
> +        QLIST_REMOVE(zone_entry, exp_open_zone_entry);
> +        s->nr_zones_exp_open--;
> +    }
> +
> +    assert(s->nr_zones_imp_open + s->nr_zones_exp_open == 0);
> +}
> +
>  static void coroutine_mixed_fn GRAPH_RDLOCK
>  qcow2_do_close(BlockDriverState *bs, bool close_data_file)
>  {
> @@ -2934,6 +3309,8 @@ qcow2_do_close(BlockDriverState *bs, bool close_data_file)
>  
>      qcow2_refcount_close(bs);
>      qcow2_free_snapshots(bs);
> +    qcow2_do_close_all_zone(s);
> +    g_free(bs->wps);
>  }
>  
>  static void GRAPH_UNLOCKED qcow2_close(BlockDriverState *bs)
> @@ -3248,7 +3625,10 @@ int qcow2_update_header(BlockDriverState *bs)
>              .max_active_zones   =
>                  cpu_to_be32(s->zoned_header.max_active_zones),
>              .max_append_bytes =
> -                cpu_to_be32(s->zoned_header.max_append_bytes)
> +                cpu_to_be32(s->zoned_header.max_append_bytes),
> +            .zonedmeta_offset   =
> +                cpu_to_be64(s->zoned_header.zonedmeta_offset),
> +            .zonedmeta_size     = cpu_to_be64(s->zoned_header.zonedmeta_size),
>          };
>          ret = header_ext_add(buf, QCOW2_EXT_MAGIC_ZONED_FORMAT,
>                               &zoned_header, sizeof(zoned_header),
> @@ -3656,7 +4036,8 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
>      int version;
>      int refcount_order;
>      uint64_t *refcount_table;
> -    int ret;
> +    uint64_t zoned_meta_size, zoned_clusterlen;
> +    int ret, offset, i;
>      uint8_t compression_type = QCOW2_COMPRESSION_TYPE_ZLIB;
>  
>      assert(create_options->driver == BLOCKDEV_DRIVER_QCOW2);
> @@ -3997,6 +4378,46 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
>              ret = -EINVAL;
>              goto unlock;
>          }
> +
> +        uint32_t nrz = s->zoned_header.nr_zones;
> +        zoned_meta_size =  sizeof(uint64_t) * nrz;
> +        g_autofree uint64_t *meta = NULL;
> +        meta = g_new0(uint64_t, nrz);
> +
> +        for (i = 0; i < s->zoned_header.conventional_zones; ++i) {

Is it necessary to store conventional_zones in the header? This option
is only used during image creation time.

> +            meta[i] = i * s->zoned_header.zone_size;
> +            meta[i] |= 1ULL << 59;
> +        }
> +
> +        for (; i < nrz; ++i) {
> +            meta[i] = i * s->zoned_header.zone_size;
> +        }
> +
> +        offset = qcow2_alloc_clusters(blk_bs(blk), zoned_meta_size);
> +        if (offset < 0) {
> +            error_setg_errno(errp, -offset, "Could not allocate clusters "
> +                                            "for zoned metadata size");

Missing:
ret = offset;

> +            goto unlock;
> +        }
> +        s->zoned_header.zonedmeta_offset = offset;
> +        s->zoned_header.zonedmeta_size = zoned_meta_size;
> +
> +        zoned_clusterlen = size_to_clusters(s, zoned_meta_size)
> +                * s->cluster_size;
> +        assert(qcow2_pre_write_overlap_check(bs, 0, offset,
> +                                             zoned_clusterlen,false) == 0);
> +        ret = bdrv_pwrite_zeroes(blk_bs(blk)->file, offset,
> +                                 zoned_clusterlen, 0);

I don't think it's necessary to zero zonedmeta since it's about to be
overwritten. There may be some padding until the end of the last
cluster, but there is no need to zero it.

> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "Could not zero fill zoned metadata");
> +            goto unlock;
> +        }
> +        ret = bdrv_pwrite(blk_bs(blk)->file, offset, zoned_meta_size, meta, 0);
> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "Could not write zoned metadata "
> +                                         "to disk");
> +            goto unlock;
> +        }
>      } else {
>          s->zoned_header.zoned = BLK_Z_NONE;
>      }
> @@ -4339,6 +4760,368 @@ qcow2_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
>      return ret;
>  }
>  
> +static int coroutine_fn
> +qcow2_co_zone_report(BlockDriverState *bs, int64_t offset,
> +                     unsigned int *nr_zones, BlockZoneDescriptor *zones)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    uint64_t zone_size = s->zoned_header.zone_size;
> +    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
> +    int64_t size = bs->bl.nr_zones * zone_size;
> +    unsigned int nrz;
> +    int i = 0;
> +    int si;
> +
> +    if (offset >= capacity) {
> +        error_report("offset %" PRId64 " is equal to or greater than the "
> +                     "device capacity %" PRId64 "", offset, capacity);
> +        return -EINVAL;
> +    }
> +
> +    nrz = ((*nr_zones) < bs->bl.nr_zones) ? (*nr_zones) : bs->bl.nr_zones;
> +    si = offset / zone_size; /* Zone size cannot be 0 for zoned device */
> +    qemu_co_mutex_lock(&bs->wps->colock);
> +    for (; i < nrz; ++i) {
> +        if (i + si >= bs->bl.nr_zones) {
> +            break;
> +        }
> +
> +        zones[i].start = (si + i) * zone_size;
> +
> +        /* The last zone can be smaller than the zone size */
> +        if ((si + i + 1) == bs->bl.nr_zones && size > capacity) {
> +            uint32_t l = zone_size - (size - capacity);
> +            zones[i].length = l;
> +            zones[i].cap = l;
> +        } else {
> +            zones[i].length = zone_size;
> +            zones[i].cap = zone_size;
> +        }
> +
> +        uint64_t wp = bs->wps->wp[si + i];
> +        if (QCOW2_ZT_IS_CONV(wp)) {
> +            zones[i].type = BLK_ZT_CONV;
> +            zones[i].state = BLK_ZS_NOT_WP;
> +            /* Clear the zone type bit */
> +            wp &= ~(1ULL << 59);
> +        } else {
> +            zones[i].type = BLK_ZT_SWR;
> +            zones[i].state = qcow2_get_zone_state(bs, si + i);
> +        }
> +        zones[i].wp = wp;
> +    }
> +    qemu_co_mutex_unlock(&bs->wps->colock);
> +    *nr_zones = i;
> +    return 0;
> +}
> +
> +static int coroutine_fn GRAPH_RDLOCK
> +qcow2_open_zone(BlockDriverState *bs, uint32_t index) {
> +    BDRVQcow2State *s = bs->opaque;
> +    int ret;
> +
> +    qemu_co_mutex_lock(&bs->wps->colock);
> +    BlockZoneState zs = qcow2_get_zone_state(bs, index);
> +    trace_qcow2_imp_open_zones(BLK_ZO_OPEN, s->nr_zones_imp_open);
> +
> +    switch(zs) {
> +    case BLK_ZS_EMPTY:
> +        if (!qcow2_can_activate_zone(bs)) {
> +            ret = -EBUSY;
> +            goto unlock;
> +        }
> +        break;
> +    case BLK_ZS_IOPEN:
> +        qcow2_rm_imp_open_zone(s, index);
> +        break;
> +    case BLK_ZS_EOPEN:
> +        return 0;
> +    case BLK_ZS_CLOSED:
> +        if (!qcow2_can_open_zone(bs)) {
> +            ret = -EINVAL;
> +            goto unlock;
> +        }
> +        s->nr_zones_closed--;

Does the zone entry need to be removed from the closed zone list?

> +        break;
> +    case BLK_ZS_FULL:
> +        break;
> +    default:
> +        ret = -EINVAL;
> +        goto unlock;
> +    }
> +
> +    qcow2_do_exp_open_zone(s, index);
> +    ret = 0;
> +
> +unlock:
> +    qemu_co_mutex_unlock(&bs->wps->colock);
> +    return ret;
> +}
> +
> +static int qcow2_close_zone(BlockDriverState *bs, uint32_t index) {
> +    int ret;
> +
> +    qemu_co_mutex_lock(&bs->wps->colock);
> +    BlockZoneState zs = qcow2_get_zone_state(bs, index);
> +
> +    switch(zs) {
> +    case BLK_ZS_EMPTY:
> +        break;
> +    case BLK_ZS_IOPEN:
> +        break;
> +    case BLK_ZS_EOPEN:
> +        break;
> +    case BLK_ZS_CLOSED:
> +        /* Closing a closed zone is not an error */
> +        ret = 0;
> +        goto unlock;
> +    case BLK_ZS_FULL:
> +        break;
> +    default:
> +        ret = -EINVAL;
> +        goto unlock;
> +    }
> +    qcow2_do_close_zone(bs, index, zs);
> +    ret = 0;
> +
> +unlock:
> +    qemu_co_mutex_unlock(&bs->wps->colock);
> +    return ret;
> +}
> +
> +static int coroutine_fn GRAPH_RDLOCK
> +qcow2_finish_zone(BlockDriverState *bs, uint32_t index) {
> +    BDRVQcow2State *s = bs->opaque;
> +    int ret;
> +
> +    qemu_co_mutex_lock(&bs->wps->colock);
> +    uint64_t *wp = &bs->wps->wp[index];
> +    BlockZoneState zs = qcow2_get_zone_state(bs, index);
> +
> +    switch(zs) {
> +    case BLK_ZS_EMPTY:
> +        if (!qcow2_can_activate_zone(bs)) {
> +            ret = -EBUSY;
> +            goto unlock;
> +        }
> +        break;
> +    case BLK_ZS_IOPEN:
> +        qcow2_rm_imp_open_zone(s, index);
> +        trace_qcow2_imp_open_zones(BLK_ZO_FINISH, s->nr_zones_imp_open);
> +        break;
> +    case BLK_ZS_EOPEN:
> +        qcow2_rm_exp_open_zone(s, index);
> +        break;
> +    case BLK_ZS_CLOSED:
> +        if (!qcow2_can_open_zone(bs)) {
> +            ret = -EINVAL;
> +            goto unlock;
> +        }
> +        qcow2_rm_closed_zone(s, index);
> +        break;
> +    case BLK_ZS_FULL:
> +        ret = 0;
> +        goto unlock;
> +    default:
> +        ret = -EINVAL;
> +        goto unlock;
> +    }
> +
> +    *wp = ((uint64_t)index + 1) * s->zoned_header.zone_size;
> +    ret = qcow2_write_wp_at(bs, wp, index);
> +
> +unlock:
> +    qemu_co_mutex_unlock(&bs->wps->colock);
> +    return ret;
> +}
> +
> +static int coroutine_fn GRAPH_RDLOCK
> +qcow2_reset_zone(BlockDriverState *bs, uint32_t index,
> +                            int64_t len) {
> +    BDRVQcow2State *s = bs->opaque;
> +    int nrz = bs->bl.nr_zones;
> +    int zone_size = bs->bl.zone_size;
> +    int n, ret = 0;
> +
> +    qemu_co_mutex_lock(&bs->wps->colock);
> +    uint64_t *wp = &bs->wps->wp[index];
> +    if (len == bs->total_sectors << BDRV_SECTOR_BITS) {
> +        n = nrz;
> +        index = 0;
> +    } else {
> +        n = len / zone_size;
> +    }
> +
> +    for (int i = 0; i < n; ++i) {
> +        uint64_t *wp_i = (uint64_t *)(wp + i);
> +        uint64_t wpi_v = *wp_i;
> +        if (QCOW2_ZT_IS_CONV(wpi_v)) {
> +            continue;
> +        }
> +        
> +        BlockZoneState zs = qcow2_get_zone_state(bs, index + i);
> +        switch (zs) {
> +        case BLK_ZS_EMPTY:
> +            break;
> +        case BLK_ZS_IOPEN:
> +            qcow2_rm_imp_open_zone(s, index + i);
> +            trace_qcow2_imp_open_zones(BLK_ZO_RESET, s->nr_zones_imp_open);
> +            break;
> +        case BLK_ZS_EOPEN:
> +            qcow2_rm_exp_open_zone(s, index + i);
> +            break;
> +        case BLK_ZS_CLOSED:
> +            qcow2_rm_closed_zone(s, index + i);
> +            break;
> +        case BLK_ZS_FULL:
> +            break;
> +        default:
> +            ret = -EINVAL;
> +            goto unlock;
> +        }
> +
> +        if (zs == BLK_ZS_EMPTY) {
> +            continue;
> +        }
> +
> +        *wp_i = ((uint64_t)index + i) * zone_size;
> +        ret = qcow2_write_wp_at(bs, wp_i, index + i);
> +        if (ret < 0) {
> +            goto unlock;
> +        }
> +        /* clear data */
> +        ret = qcow2_co_pwrite_zeroes(bs, *wp_i, zone_size, 0);
> +        if (ret < 0) {
> +            error_report("Failed to reset zone at 0x%" PRIx64 "", *wp_i);
> +        }
> +    }
> +
> +unlock:
> +    qemu_co_mutex_unlock(&bs->wps->colock);
> +    return ret;
> +}
> +
> +static int coroutine_fn GRAPH_RDLOCK
> +qcow2_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> +                                           int64_t offset, int64_t len)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    int ret = 0;
> +    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
> +    int64_t zone_size = s->zoned_header.zone_size;
> +    int64_t zone_size_mask = zone_size - 1;
> +    uint32_t index = offset / zone_size;
> +    BlockZoneWps *wps = bs->wps;
> +
> +    if (offset >= capacity) {
> +        error_report("offset %" PRId64 " is equal to or greater than the"
> +                     "device capacity %" PRId64 "", offset, capacity);
> +        return -EINVAL;
> +    }
> +
> +    if (offset & zone_size_mask) {
> +        error_report("sector offset %" PRId64 " is not aligned to zone size"
> +                     " %" PRId64 "", offset / 512, zone_size / 512);
> +        return -EINVAL;
> +    }
> +
> +    if (((offset + len) < capacity && len & zone_size_mask) ||
> +        offset + len > capacity) {
> +        error_report("number of sectors %" PRId64 " is not aligned to zone"
> +                     " size %" PRId64 "", len / 512, zone_size / 512);
> +        return -EINVAL;
> +    }
> +
> +    qemu_co_mutex_lock(&wps->colock);
> +    uint64_t wpv = wps->wp[index];
> +    if (QCOW2_ZT_IS_CONV(wpv) && len != capacity) {
> +        error_report("zone mgmt operations are not allowed for "
> +                     "conventional zones");
> +        ret = -EIO;
> +        goto unlock;
> +    }
> +    qemu_co_mutex_unlock(&wps->colock);
> +
> +    switch(op) {
> +    case BLK_ZO_OPEN:
> +        ret = qcow2_open_zone(bs, index);
> +        break;
> +    case BLK_ZO_CLOSE:
> +        ret = qcow2_close_zone(bs, index);
> +        break;
> +    case BLK_ZO_FINISH:
> +        ret = qcow2_finish_zone(bs, index);
> +        break;
> +    case BLK_ZO_RESET:
> +        ret = qcow2_reset_zone(bs, index, len);
> +        break;
> +    default:
> +        error_report("Unsupported zone op: 0x%x", op);
> +        ret = -ENOTSUP;
> +        break;
> +    }
> +    return ret;
> +
> +unlock:
> +    qemu_co_mutex_unlock(&wps->colock);
> +    return ret;
> +}
> +
> +static int coroutine_fn GRAPH_RDLOCK
> +qcow2_co_zone_append(BlockDriverState *bs, int64_t *offset, QEMUIOVector *qiov,
> +                     BdrvRequestFlags flags)
> +{
> +    assert(flags == 0);
> +    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
> +    uint32_t index;
> +    int ret;
> +    int64_t zone_size_mask = bs->bl.zone_size - 1;
> +    int64_t iov_len = 0;
> +    int64_t len = 0;
> +
> +    if (*offset >= capacity) {
> +        error_report("*offset %" PRId64 " is equal to or greater than the"
> +                     "device capacity %" PRId64 "", *offset, capacity);
> +        return -EINVAL;
> +    }
> +
> +    /* offset + len should not pass the end of that zone starting from offset */
> +    if (*offset & zone_size_mask) {
> +        error_report("sector offset %" PRId64 " is not aligned to zone size "
> +                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
> +        return -EINVAL;
> +    }
> +
> +    int64_t wg = bs->bl.write_granularity;
> +    int64_t wg_mask = wg - 1;
> +    for (int i = 0; i < qiov->niov; i++) {
> +        iov_len = qiov->iov[i].iov_len;
> +        if (iov_len & wg_mask) {
> +            error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
> +                         "block size %" PRId64 "", i, iov_len, wg);
> +            return -EINVAL;
> +        }
> +    }
> +    len = qiov->size;
> +    index = *offset / bs->bl.zone_size;
> +
> +    if ((len >> BDRV_SECTOR_BITS) > bs->bl.max_append_sectors) {
> +        return -ENOTSUP;
> +    }
> +
> +    qemu_co_mutex_lock(&bs->wps->colock);
> +    uint64_t wp_i = bs->wps->wp[index];
> +    ret = qcow2_co_pwritev_part(bs, wp_i, len, qiov, 0, 0);

Is it correct to use wp_i directly without masking bits (e.g. bit 59 for
conventional zones)?
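
i.e. whether something like this is needed before using it as a write
offset (untested):

    wp_i &= ~(1ULL << 59);    /* clear the zone type bit */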

> +    if (ret == 0) {
> +        *offset = wp_i;
> +    } else {
> +        error_report("qcow2: zap failed");
> +    }
> +
> +    qemu_co_mutex_unlock(&bs->wps->colock);
> +    return ret;
> +}
> +
>  static int coroutine_fn GRAPH_RDLOCK
>  qcow2_co_copy_range_from(BlockDriverState *bs,
>                           BdrvChild *src, int64_t src_offset,
> @@ -6398,6 +7181,10 @@ BlockDriver bdrv_qcow2 = {
>      .bdrv_co_pwritev_compressed_part    = qcow2_co_pwritev_compressed_part,
>      .bdrv_make_empty                    = qcow2_make_empty,
>  
> +    .bdrv_co_zone_report    = qcow2_co_zone_report,
> +    .bdrv_co_zone_mgmt    = qcow2_co_zone_mgmt,
> +    .bdrv_co_zone_append    = qcow2_co_zone_append,
> +
>      .bdrv_snapshot_create               = qcow2_snapshot_create,
>      .bdrv_snapshot_goto                 = qcow2_snapshot_goto,
>      .bdrv_snapshot_delete               = qcow2_snapshot_delete,
> diff --git a/block/trace-events b/block/trace-events
> index 8e789e1f12..e35222e079 100644
> --- a/block/trace-events
> +++ b/block/trace-events
> @@ -82,6 +82,8 @@ qcow2_writev_data(void *co, uint64_t offset) "co %p offset 0x%" PRIx64
>  qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int64_t bytes) "co %p offset 0x%" PRIx64 " bytes %" PRId64
>  qcow2_pwrite_zeroes(void *co, int64_t offset, int64_t bytes) "co %p offset 0x%" PRIx64 " bytes %" PRId64
>  qcow2_skip_cow(void *co, uint64_t offset, int nb_clusters) "co %p offset 0x%" PRIx64 " nb_clusters %d"
> +qcow2_wp_tracking(int index, uint64_t wp) "wps[%d]: 0x%" PRIx64
> +qcow2_imp_open_zones(uint8_t op, int nrz) "nr_imp_open_zones after op 0x%x: %d"
>  
>  # qcow2-cluster.c
>  qcow2_alloc_clusters_offset(void *co, uint64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d"
> diff --git a/include/qemu/queue.h b/include/qemu/queue.h
> index e029e7bf66..3f0a48740e 100644
> --- a/include/qemu/queue.h
> +++ b/include/qemu/queue.h
> @@ -179,6 +179,7 @@ struct {                                                                \
>  #define QLIST_EMPTY(head)                ((head)->lh_first == NULL)
>  #define QLIST_FIRST(head)                ((head)->lh_first)
>  #define QLIST_NEXT(elm, field)           ((elm)->field.le_next)
> +#define QLIST_LAST(head, field)          (*(head)->lh_first->field.le_prev)

This assumes QLIST is a circular linked list, but it is not circular:

  #define QLIST_INSERT_HEAD(head, elm, field) do {                        \
          if (((elm)->field.le_next = (head)->lh_first) != NULL)          \
                  (head)->lh_first->field.le_prev = &(elm)->field.le_next;\
          (head)->lh_first = (elm);                                       \
          (elm)->field.le_prev = &(head)->lh_first;                       \
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  } while (/*CONSTCOND*/0)

head->lh_first->field.le_prev points to head's list head lh_first field,
not to the last node.

Take a look at QTAILQ or QSIMPLEQ if you need a LAST() API.
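
For example, with QTAILQ the LRU list could look like this (untested,
assuming imp_open_zone_entry becomes a QTAILQ_ENTRY() field):

    /* most recently used zones are inserted at the head */
    QTAILQ_INSERT_HEAD(&s->imp_open_zones, zone_entry, imp_open_zone_entry);

    /* the least recently used zone is then the tail */
    zone_entry = QTAILQ_LAST(&s->imp_open_zones);
    QTAILQ_REMOVE(&s->imp_open_zones, zone_entry, imp_open_zone_entry);
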
Sam Li Sept. 23, 2024, 11:06 a.m. UTC | #2
Hi Stefan,

I apologize for the late reply.

Stefan Hajnoczi <stefanha@redhat.com> wrote on Tuesday, March 12, 2024 at 20:02:
>
> On Mon, Jan 22, 2024 at 07:48:29PM +0100, Sam Li wrote:
> > By adding zone operations and zoned metadata, the zoned emulation
> > capability enables full emulation of a zoned device using a qcow2
> > file. The zoned device metadata includes the zone type, zone state
> > and write pointer of each zone, stored in an array of unsigned
> > integers.
> >
> > Each zone of a zoned device makes state transitions following the
> > zone state machine, which mainly describes five states: IMPLICIT
> > OPEN, EXPLICIT OPEN, FULL, EMPTY and CLOSED. The READ ONLY and
> > OFFLINE states are generally triggered by device-internal events.
> > Operations on zones cause the corresponding state changes.
> >
> > Zoned devices have a limit on zone resources, which constrains write
> > operations into zones. This limit is managed by active zone lists
> > following an LRU policy.
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> >  block/qcow2.c        | 791 ++++++++++++++++++++++++++++++++++++++++++-
> >  block/trace-events   |   2 +
> >  include/qemu/queue.h |   1 +
> >  3 files changed, 792 insertions(+), 2 deletions(-)
> >
> > diff --git a/block/qcow2.c b/block/qcow2.c
> > index b987f1e751..db28585b82 100644
> > --- a/block/qcow2.c
> > +++ b/block/qcow2.c
> > @@ -195,6 +195,274 @@ qcow2_extract_crypto_opts(QemuOpts *opts, const char *fmt, Error **errp)
> >      return cryptoopts_qdict;
> >  }
> >
> > +#define QCOW2_ZT_IS_CONV(wp)    (wp & 1ULL << 59)
> > +
> > +/*
> > + * To emulate a real zoned device, closed, empty and full states are
> > + * preserved after a power cycle. The open states are in-memory and will
> > + * be lost after closing the device. Read-only and offline states are
> > + * device-internal events, which are not considered for simplicity.
> > + */
> > +static inline BlockZoneState qcow2_get_zone_state(BlockDriverState *bs,
> > +                                                  uint32_t index)
> > +{
> > +    BDRVQcow2State *s = bs->opaque;
> > +    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> > +    uint64_t zone_wp = bs->wps->wp[index];
> > +    uint64_t zone_start;
> > +
> > +    if (QCOW2_ZT_IS_CONV(zone_wp)) {
> > +        return BLK_ZS_NOT_WP;
> > +    }
> > +
> > +    if (QLIST_IS_INSERTED(zone_entry, exp_open_zone_entry)) {
> > +        return BLK_ZS_EOPEN;
> > +    }
> > +    if (QLIST_IS_INSERTED(zone_entry, imp_open_zone_entry)) {
> > +        return BLK_ZS_IOPEN;
> > +    }
> > +
> > +    zone_start = index * bs->bl.zone_size;
> > +    if (zone_wp == zone_start) {
> > +        return BLK_ZS_EMPTY;
> > +    }
> > +    if (zone_wp >= zone_start + bs->bl.zone_capacity) {
> > +        return BLK_ZS_FULL;
> > +    }
> > +    if (zone_wp > zone_start) {
> > +        if (!QLIST_IS_INSERTED(zone_entry, closed_zone_entry)) {
> > +            /*
> > +             * The number of closed zones is not always updated in time when
> > +             * the device is closed. However, it only matters when doing
> > +             * zone report. Refresh the count and list of closed zones to
> > +             * provide correct zone states for zone report.
> > +             */
> > +            QLIST_INSERT_HEAD(&s->closed_zones, zone_entry, closed_zone_entry);
> > +            s->nr_zones_closed++;
> > +        }
> > +        return BLK_ZS_CLOSED;
> > +    }
> > +    return BLK_ZS_NOT_WP;
> > +}
> > +
> > +static void qcow2_rm_exp_open_zone(BDRVQcow2State *s,
> > +                                   uint32_t index)
> > +{
> > +    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> > +
> > +    QLIST_REMOVE(zone_entry, exp_open_zone_entry);
> > +    s->nr_zones_exp_open--;
> > +}
> > +
> > +static void qcow2_rm_imp_open_zone(BDRVQcow2State *s,
> > +                                   int32_t index)
> > +{
> > +    Qcow2ZoneListEntry *zone_entry;
> > +    if (index < 0) {
> > +        /* Apply LRU when the index is not specified. */
> > +        zone_entry = QLIST_LAST(&s->imp_open_zones, imp_open_zone_entry);
> > +    } else {
> > +        zone_entry = &s->zone_list_entries[index];
> > +    }
> > +
> > +    QLIST_REMOVE(zone_entry, imp_open_zone_entry);
> > +    s->nr_zones_imp_open--;
> > +}
> > +
> > +static void qcow2_rm_open_zone(BDRVQcow2State *s,
> > +                               uint32_t index)
> > +{
> > +    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> > +
> > +    if (QLIST_IS_INSERTED(zone_entry, exp_open_zone_entry)) {
> > +        qcow2_rm_exp_open_zone(s, index);
> > +    } else if (QLIST_IS_INSERTED(zone_entry, imp_open_zone_entry)) {
> > +        qcow2_rm_imp_open_zone(s, index);
> > +    }
> > +}
> > +
> > +static void qcow2_rm_closed_zone(BDRVQcow2State *s,
> > +                                 uint32_t index)
> > +{
> > +    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> > +
> > +    QLIST_REMOVE(zone_entry, closed_zone_entry);
> > +    s->nr_zones_closed--;
> > +}
> > +
> > +static void qcow2_do_imp_open_zone(BDRVQcow2State *s,
> > +                                   uint32_t index,
> > +                                   BlockZoneState zs)
> > +{
> > +    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> > +
> > +    switch (zs) {
> > +    case BLK_ZS_EMPTY:
> > +        break;
> > +    case BLK_ZS_CLOSED:
> > +        s->nr_zones_closed--;
> > +        break;
>
> Does the zone entry need to be removed from the closed zone list? If
> yes, then qcow2_rm_closed_zone() could be called here instead of
> decrementing nr_zones_closed ourselves.

Yes, will fix it.

>
> > +    case BLK_ZS_IOPEN:
> > +        /*
> > +         * The LRU policy: update the zone that is most recently
> > +         * used to the head of the zone list
> > +         */
> > +        if (zone_entry == QLIST_FIRST(&s->imp_open_zones)) {
> > +            return;
> > +        }
> > +        QLIST_REMOVE(zone_entry, imp_open_zone_entry);
> > +        s->nr_zones_imp_open--;
> > +        break;
> > +    default:
> > +        return;
> > +    }
> > +
> > +    QLIST_INSERT_HEAD(&s->imp_open_zones, zone_entry, imp_open_zone_entry);
> > +    s->nr_zones_imp_open++;
> > +}
> > +
> > +static void qcow2_do_exp_open_zone(BDRVQcow2State *s,
> > +                                   uint32_t index)
> > +{
> > +    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> > +
> > +    QLIST_INSERT_HEAD(&s->exp_open_zones, zone_entry, exp_open_zone_entry);
> > +    s->nr_zones_exp_open++;
> > +}
> > +
> > +/*
> > + * The list of zones is managed using an LRU policy: the last
> > + * zone of the list is always the one that was least recently used
> > + * for writing and is chosen as the zone to close to be able to
> > + * implicitly open another zone.
> > + *
> > + * We can only close the open zones. The index is not specified
> > + * when it is less than 0.
> > + */
> > +static void qcow2_do_close_zone(BlockDriverState *bs,
> > +                                int32_t index,
> > +                                BlockZoneState zs)
> > +{
> > +    BDRVQcow2State *s = bs->opaque;
> > +    Qcow2ZoneListEntry *zone_entry;
> > +
> > +    if (index < 0 && zs == BLK_ZS_IOPEN) {
> > +        /* before removal of the last implicitly open zone */
> > +        zone_entry = QLIST_LAST(&s->imp_open_zones, imp_open_zone_entry);
> > +        qcow2_rm_imp_open_zone(s, index);
> > +        goto close_zone;
> > +    }
> > +
> > +    if (index >= 0 && zs == BLK_ZS_EOPEN) {
> > +        zone_entry = &s->zone_list_entries[index];
> > +        qcow2_rm_exp_open_zone(s, index);
> > +        /*
> > +         * The zone state changes when the zone is removed from the list of
> > +         * open zones (explicitly open -> empty)
> > +         */
> > +        qcow2_get_zone_state(bs, index);
> > +    }
> > +    return;
> > +
> > +close_zone:
> > +    QLIST_INSERT_HEAD(&s->closed_zones, zone_entry, closed_zone_entry);
> > +    s->nr_zones_closed++;
> > +}
> > +
> > +/*
> > + * Write the new wp value to the dedicated location of the image file.
> > + */
> > +static int coroutine_fn GRAPH_RDLOCK
> > +qcow2_write_wp_at(BlockDriverState *bs, uint64_t *wp,
> > +                             int32_t index) {
> > +    BDRVQcow2State *s = bs->opaque;
> > +    uint64_t wpv = *wp;
> > +    int ret;
> > +
> > +    ret = bdrv_pwrite(bs->file, s->zoned_header.zonedmeta_offset
> > +        + sizeof(uint64_t) * index, sizeof(uint64_t), wp, 0);
> > +    if (ret < 0) {
> > +        goto exit;
> > +    }
> > +    trace_qcow2_wp_tracking(index, *wp >> BDRV_SECTOR_BITS);
> > +    return ret;
> > +
> > +exit:
> > +    *wp = wpv;
> > +    error_report("Failed to write metadata with file");
> > +    return ret;
> > +}
> > +
> > +static bool qcow2_can_activate_zone(BlockDriverState *bs)
> > +{
> > +    BDRVQcow2State *s = bs->opaque;
> > +
> > +    /* When the max active zone is zero, there is no limit on active zones */
> > +    if (!s->zoned_header.max_active_zones) {
> > +        return true;
> > +    }
> > +
> > +    /* Active zones are zones that are open or closed */
> > +    return s->nr_zones_exp_open + s->nr_zones_imp_open + s->nr_zones_closed
> > +        < s->zoned_header.max_active_zones;
> > +}
> > +
> > +/*
> > + * This function manages open zones under active zones limit. It checks
> > + * if a zone can transition to open state while maintaining max open and
> > + * active zone limits.
> > + */
> > +static bool qcow2_can_open_zone(BlockDriverState *bs)
> > +{
> > +    BDRVQcow2State *s = bs->opaque;
> > +
> > +    /* When the max open zone is zero, there is no limit on open zones */
> > +    if (!s->zoned_header.max_open_zones) {
> > +        return true;
> > +    }
> > +
> > +    /*
> > +     * The open zones are zones with the states of explicitly and
> > +     * implicitly open.
> > +     */
> > +    if (s->nr_zones_imp_open + s->nr_zones_exp_open <
> > +        s->zoned_header.max_open_zones) {
> > +        return true;
> > +    }
> > +
> > +    /*
> > +     * Zones are managed one at a time. Thus, the number of implicitly open
> > +     * zone can never be over the open zone limit. When the active zone limit
> > +     * is not reached, close only one implicitly open zone.
> > +     */
> > +    if (qcow2_can_activate_zone(bs)) {
> > +        qcow2_do_close_zone(bs, -1, BLK_ZS_IOPEN);
> > +        trace_qcow2_imp_open_zones(0x23, s->nr_zones_imp_open);
> > +        return true;
> > +    }
> > +    return false;
> > +}
> > +
> > +static inline int coroutine_fn GRAPH_RDLOCK
> > +qcow2_refresh_zonedmeta(BlockDriverState *bs)
> > +{
> > +    int ret;
> > +    BDRVQcow2State *s = bs->opaque;
> > +    uint64_t wps_size = s->zoned_header.zonedmeta_size;
> > +    g_autofree uint64_t *temp;
> > +
> > +    temp = g_new(uint64_t, wps_size);
> > +    ret = bdrv_pread(bs->file, s->zoned_header.zonedmeta_offset,
> > +                     wps_size, temp, 0);
> > +    if (ret < 0) {
> > +        error_report("Can not read metadata");
> > +        return ret;
> > +    }
> > +
> > +    memcpy(bs->wps->wp, temp, wps_size);
> > +    return 0;
> > +}
> > +
> >  /*
> >   * Passing by the zoned device configurations by a zoned_header struct, check
> >   * if the zone device options are under constraints. Return false when some
> > @@ -521,7 +789,23 @@ qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
> >                  be32_to_cpu(zoned_ext.max_active_zones);
> >              zoned_ext.max_append_bytes =
> >                  be32_to_cpu(zoned_ext.max_append_bytes);
> > +            zoned_ext.zonedmeta_offset =
> > +                be64_to_cpu(zoned_ext.zonedmeta_offset);
> > +            zoned_ext.zonedmeta_size = be64_to_cpu(zoned_ext.zonedmeta_size);
> >              s->zoned_header = zoned_ext;
> > +            bs->wps = g_malloc(sizeof(BlockZoneWps)
> > +                + s->zoned_header.zonedmeta_size);
> > +            ret = qcow2_refresh_zonedmeta(bs);
> > +            if (ret < 0) {
> > +                return ret;
> > +            }
> > +
> > +            s->zone_list_entries = g_new0(Qcow2ZoneListEntry,
> > +                                          zoned_ext.nr_zones);
> > +            QLIST_INIT(&s->exp_open_zones);
> > +            QLIST_INIT(&s->imp_open_zones);
> > +            QLIST_INIT(&s->closed_zones);
> > +            qemu_co_mutex_init(&bs->wps->colock);
> >
> >              /* refuse to open broken images */
> >              if (zoned_ext.nr_zones != DIV_ROUND_UP(bs->total_sectors *
> > @@ -2789,9 +3073,68 @@ qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
> >      uint64_t host_offset;
> >      QCowL2Meta *l2meta = NULL;
> >      AioTaskPool *aio = NULL;
> > +    int64_t start_offset, start_bytes;
> > +    BlockZoneState zs;
> > +    int64_t end_zone, end_offset;
> > +    uint64_t *wp;
> > +    int64_t zone_size = bs->bl.zone_size;
> > +    int64_t zone_capacity = bs->bl.zone_capacity;
> > +    int index;
> >
> >      trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);
> >
> > +    start_offset = offset;
> > +    start_bytes = bytes;
> > +    end_offset = start_offset + start_bytes;
> > +    if (bs->bl.zoned == BLK_Z_HM) {
> > +        index = start_offset / zone_size;
> > +        wp = &bs->wps->wp[index];
>
> This function must be thread-safe. The write pointer probably needs to
> be protected by bs->wps->colock because we are checking and modifying
> it.

True. The mutex lock should be held while accessing the wps.

>
> > +        if (!QCOW2_ZT_IS_CONV(*wp)) {
> > +            /* The write offset must be equal to the zone write pointer */
> > +            if (offset != *wp) {
> > +                return -EINVAL;
> > +            }
> > +
> > +            /* Only allow writes when there are zone resources left */
> > +            zs = qcow2_get_zone_state(bs, index);
> > +            if (zs == BLK_ZS_CLOSED || zs == BLK_ZS_EMPTY) {
> > +                if (!qcow2_can_open_zone(bs)) {
> > +                    return -EINVAL;
> > +                }
> > +            }
> > +
> > +            /*
> > +             * Align up (start_offset, zone_size), the start offset is not
> > +             * necessarily power of two.
> > +             */
> > +            end_zone = index * zone_size + zone_capacity;
> > +            /* Write cannot exceed the zone capacity. */
> > +            if (end_offset > end_zone) {
> > +                return -EINVAL;
> > +            }
> > +
> > +            /*
> > +             * Real drives change states before it can write to the zone. If
> > +             * the write fails, the zone state may have changed.
> > +             *
> > +             * The zone state transitions to implicit open when the original
> > +             * state is empty or closed. When the wp reaches the end, the
> > +             * open states (explicit open, implicit open) become full.
> > +             */
> > +            zs = qcow2_get_zone_state(bs, index);
> > +            if (!(end_offset & (zone_capacity - 1))) {
> > +                /* Being aligned to zone capacity implies full state */
> > +                qcow2_rm_open_zone(s, index);
> > +                trace_qcow2_imp_open_zones(0x24,
> > +                                           s->nr_zones_imp_open);
> > +            } else {
> > +                qcow2_do_imp_open_zone(s, index, zs);
> > +                trace_qcow2_imp_open_zones(0x24,
> > +                                           s->nr_zones_imp_open);
> > +            }
> > +        }
> > +    }
> > +
> >      while (bytes != 0 && aio_task_pool_status(aio) == 0) {
> >
> >          l2meta = NULL;
> > @@ -2837,6 +3180,19 @@ qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
> >          qiov_offset += cur_bytes;
> >          trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
> >      }
> > +
> > +    if (bs->bl.zoned == BLK_Z_HM) {
> > +        index = start_offset / zone_size;
> > +        wp = &bs->wps->wp[index];
> > +        if (!QCOW2_ZT_IS_CONV(*wp)) {
> > +            /* Advance the write pointer when the write completes */
>
> Updating the write pointer after I/O does not prevent other write
> requests from beginning at the same offset as this request. Multiple
> write request coroutines can run concurrently and only the first one
> should succeed. The others should fail if they are using the same
> offset.
>
> The comment above says "Real drives change states before it can write to
> the zone" and I think it's appropriate to update the write pointer
> before performing the write too. The qcow2 zone emulation code is
> different from the file-posix.c passthrough code. We are responsible for
> maintaining zoned metadata state and cannot wait for the result of the
> I/O to tell us what happened.

The problem with updating the write pointer before I/O completion is
the failure case: we cannot predict in advance whether an I/O will
fail, and when a write I/O fails the wp should not be advanced.

The alternative is to hold the wps lock across the write, which is
needed for wp access anyway. Then only one of multiple concurrent
write requests at the same offset can succeed.
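
Something along these lines (untested sketch):

    qemu_co_mutex_lock(&bs->wps->colock);
    if (offset != *wp) {
        /* another request already advanced wp past this offset */
        qemu_co_mutex_unlock(&bs->wps->colock);
        return -EINVAL;
    }
    /* ... issue the write while still holding colock ... */
    if (ret == 0) {
        *wp = end_offset;
        ret = qcow2_write_wp_at(bs, wp, index);
    }
    qemu_co_mutex_unlock(&bs->wps->colock);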

>
> > +            *wp = end_offset;
> > +            ret = qcow2_write_wp_at(bs, wp, index);
> > +            if (ret < 0) {
> > +                goto fail_nometa;
> > +            }
> > +        }
> > +    }
> >      ret = 0;
> >
> >      qemu_co_mutex_lock(&s->lock);
> > @@ -2895,6 +3251,25 @@ static int GRAPH_RDLOCK qcow2_inactivate(BlockDriverState *bs)
> >      return result;
> >  }
> >
> > +static void qcow2_do_close_all_zone(BDRVQcow2State *s)
> > +{
> > +    Qcow2ZoneListEntry *zone_entry, *next;
> > +
> > +    QLIST_FOREACH_SAFE(zone_entry, &s->imp_open_zones, imp_open_zone_entry,
> > +                       next) {
> > +        QLIST_REMOVE(zone_entry, imp_open_zone_entry);
> > +        s->nr_zones_imp_open--;
> > +    }
> > +
> > +    QLIST_FOREACH_SAFE(zone_entry, &s->exp_open_zones, exp_open_zone_entry,
> > +                       next) {
> > +        QLIST_REMOVE(zone_entry, exp_open_zone_entry);
> > +        s->nr_zones_exp_open--;
> > +    }
> > +
> > +    assert(s->nr_zones_imp_open + s->nr_zones_exp_open == 0);
> > +}
> > +
> >  static void coroutine_mixed_fn GRAPH_RDLOCK
> >  qcow2_do_close(BlockDriverState *bs, bool close_data_file)
> >  {
> > @@ -2934,6 +3309,8 @@ qcow2_do_close(BlockDriverState *bs, bool close_data_file)
> >
> >      qcow2_refcount_close(bs);
> >      qcow2_free_snapshots(bs);
> > +    qcow2_do_close_all_zone(s);
> > +    g_free(bs->wps);
> >  }
> >
> >  static void GRAPH_UNLOCKED qcow2_close(BlockDriverState *bs)
> > @@ -3248,7 +3625,10 @@ int qcow2_update_header(BlockDriverState *bs)
> >              .max_active_zones   =
> >                  cpu_to_be32(s->zoned_header.max_active_zones),
> >              .max_append_bytes =
> > -                cpu_to_be32(s->zoned_header.max_append_bytes)
> > +                cpu_to_be32(s->zoned_header.max_append_bytes),
> > +            .zonedmeta_offset   =
> > +                cpu_to_be64(s->zoned_header.zonedmeta_offset),
> > +            .zonedmeta_size     = cpu_to_be64(s->zoned_header.zonedmeta_size),
> >          };
> >          ret = header_ext_add(buf, QCOW2_EXT_MAGIC_ZONED_FORMAT,
> >                               &zoned_header, sizeof(zoned_header),
> > @@ -3656,7 +4036,8 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
> >      int version;
> >      int refcount_order;
> >      uint64_t *refcount_table;
> > -    int ret;
> > +    uint64_t zoned_meta_size, zoned_clusterlen;
> > +    int ret, offset, i;
> >      uint8_t compression_type = QCOW2_COMPRESSION_TYPE_ZLIB;
> >
> >      assert(create_options->driver == BLOCKDEV_DRIVER_QCOW2);
> > @@ -3997,6 +4378,46 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
> >              ret = -EINVAL;
> >              goto unlock;
> >          }
> > +
> > +        uint32_t nrz = s->zoned_header.nr_zones;
> > +        zoned_meta_size =  sizeof(uint64_t) * nrz;
> > +        g_autofree uint64_t *meta = NULL;
> > +        meta = g_new0(uint64_t, nrz);
> > +
> > +        for (i = 0; i < s->zoned_header.conventional_zones; ++i) {
>
> Is it necessary to store conventional_zones in the header? This option
> is only used during image creation time.

Yes, it is. conventional_zones is also used when opening an image to
check that the zoned options are within their constraints. Since the
number of sequential zones is not stored in the header, it is needed
to calculate that value.
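
For illustration, the open-time check can derive it as follows (a
sketch using the extension field names from this patch):

    /* sequential zones are whatever is not conventional */
    uint32_t sequential_zones =
        zoned_ext.nr_zones - zoned_ext.conventional_zones;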

>
> > +            meta[i] = i * s->zoned_header.zone_size;
> > +            meta[i] |= 1ULL << 59;
> > +        }
> > +
> > +        for (; i < nrz; ++i) {
> > +            meta[i] = i * s->zoned_header.zone_size;
> > +        }
> > +
> > +        offset = qcow2_alloc_clusters(blk_bs(blk), zoned_meta_size);
> > +        if (offset < 0) {
> > +            error_setg_errno(errp, -offset, "Could not allocate clusters "
> > +                                            "for zoned metadata size");
>
> Missing:
> ret = offset;
>
> > +            goto unlock;
> > +        }
> > +        s->zoned_header.zonedmeta_offset = offset;
> > +        s->zoned_header.zonedmeta_size = zoned_meta_size;
> > +
> > +        zoned_clusterlen = size_to_clusters(s, zoned_meta_size)
> > +                * s->cluster_size;
> > +        assert(qcow2_pre_write_overlap_check(bs, 0, offset,
> > +                                             zoned_clusterlen,false) == 0);
> > +        ret = bdrv_pwrite_zeroes(blk_bs(blk)->file, offset,
> > +                                 zoned_clusterlen, 0);
>
> I don't think it's necessary to zero zonedmeta since it's about to be
> overwritten. There may be some padding until the end of the last
> cluster, but there is no need to zero it.
>
> > +        if (ret < 0) {
> > +            error_setg_errno(errp, -ret, "Could not zero fill zoned metadata");
> > +            goto unlock;
> > +        }
> > +        ret = bdrv_pwrite(blk_bs(blk)->file, offset, zoned_meta_size, meta, 0);
> > +        if (ret < 0) {
> > +            error_setg_errno(errp, -ret, "Could not write zoned metadata "
> > +                                         "to disk");
> > +            goto unlock;
> > +        }
> >      } else {
> >          s->zoned_header.zoned = BLK_Z_NONE;
> >      }
> > @@ -4339,6 +4760,368 @@ qcow2_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
> >      return ret;
> >  }
> >
> > +static int coroutine_fn
> > +qcow2_co_zone_report(BlockDriverState *bs, int64_t offset,
> > +                     unsigned int *nr_zones, BlockZoneDescriptor *zones)
> > +{
> > +    BDRVQcow2State *s = bs->opaque;
> > +    uint64_t zone_size = s->zoned_header.zone_size;
> > +    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
> > +    int64_t size = bs->bl.nr_zones * zone_size;
> > +    unsigned int nrz;
> > +    int i = 0;
> > +    int si;
> > +
> > +    if (offset >= capacity) {
> > +        error_report("offset %" PRId64 " is equal to or greater than the "
> > +                     "device capacity %" PRId64 "", offset, capacity);
> > +        return -EINVAL;
> > +    }
> > +
> > +    nrz = ((*nr_zones) < bs->bl.nr_zones) ? (*nr_zones) : bs->bl.nr_zones;
> > +    si = offset / zone_size; /* Zone size cannot be 0 for zoned device */
> > +    qemu_co_mutex_lock(&bs->wps->colock);
> > +    for (; i < nrz; ++i) {
> > +        if (i + si >= bs->bl.nr_zones) {
> > +            break;
> > +        }
> > +
> > +        zones[i].start = (si + i) * zone_size;
> > +
> > +        /* The last zone can be smaller than the zone size */
> > +        if ((si + i + 1) == bs->bl.nr_zones && size > capacity) {
> > +            uint32_t l = zone_size - (size - capacity);
> > +            zones[i].length = l;
> > +            zones[i].cap = l;
> > +        } else {
> > +            zones[i].length = zone_size;
> > +            zones[i].cap = zone_size;
> > +        }
> > +
> > +        uint64_t wp = bs->wps->wp[si + i];
> > +        if (QCOW2_ZT_IS_CONV(wp)) {
> > +            zones[i].type = BLK_ZT_CONV;
> > +            zones[i].state = BLK_ZS_NOT_WP;
> > +            /* Clear the zone type bit */
> > +            wp &= ~(1ULL << 59);
> > +        } else {
> > +            zones[i].type = BLK_ZT_SWR;
> > +            zones[i].state = qcow2_get_zone_state(bs, si + i);
> > +        }
> > +        zones[i].wp = wp;
> > +    }
> > +    qemu_co_mutex_unlock(&bs->wps->colock);
> > +    *nr_zones = i;
> > +    return 0;
> > +}
> > +
> > +static int coroutine_fn GRAPH_RDLOCK
> > +qcow2_open_zone(BlockDriverState *bs, uint32_t index) {
> > +    BDRVQcow2State *s = bs->opaque;
> > +    int ret;
> > +
> > +    qemu_co_mutex_lock(&bs->wps->colock);
> > +    BlockZoneState zs = qcow2_get_zone_state(bs, index);
> > +    trace_qcow2_imp_open_zones(BLK_ZO_OPEN, s->nr_zones_imp_open);
> > +
> > +    switch(zs) {
> > +    case BLK_ZS_EMPTY:
> > +        if (!qcow2_can_activate_zone(bs)) {
> > +            ret = -EBUSY;
> > +            goto unlock;
> > +        }
> > +        break;
> > +    case BLK_ZS_IOPEN:
> > +        qcow2_rm_imp_open_zone(s, index);
> > +        break;
> > +    case BLK_ZS_EOPEN:
> > +        return 0;
> > +    case BLK_ZS_CLOSED:
> > +        if (!qcow2_can_open_zone(bs)) {
> > +            ret = -EINVAL;
> > +            goto unlock;
> > +        }
> > +        s->nr_zones_closed--;
>
> Does the zone entry need to be removed from the closed zone list?

Yes, will fix it.

>
> > +        break;
> > +    case BLK_ZS_FULL:
> > +        break;
> > +    default:
> > +        ret = -EINVAL;
> > +        goto unlock;
> > +    }
> > +
> > +    qcow2_do_exp_open_zone(s, index);
> > +    ret = 0;
> > +
> > +unlock:
> > +    qemu_co_mutex_unlock(&bs->wps->colock);
> > +    return ret;
> > +}
> > +
> > +static int qcow2_close_zone(BlockDriverState *bs, uint32_t index) {
> > +    int ret;
> > +
> > +    qemu_co_mutex_lock(&bs->wps->colock);
> > +    BlockZoneState zs = qcow2_get_zone_state(bs, index);
> > +
> > +    switch(zs) {
> > +    case BLK_ZS_EMPTY:
> > +        break;
> > +    case BLK_ZS_IOPEN:
> > +        break;
> > +    case BLK_ZS_EOPEN:
> > +        break;
> > +    case BLK_ZS_CLOSED:
> > +        /* Closing a closed zone is not an error */
> > +        ret = 0;
> > +        goto unlock;
> > +    case BLK_ZS_FULL:
> > +        break;
> > +    default:
> > +        ret = -EINVAL;
> > +        goto unlock;
> > +    }
> > +    qcow2_do_close_zone(bs, index, zs);
> > +    ret = 0;
> > +
> > +unlock:
> > +    qemu_co_mutex_unlock(&bs->wps->colock);
> > +    return ret;
> > +}
> > +
> > +static int coroutine_fn GRAPH_RDLOCK
> > +qcow2_finish_zone(BlockDriverState *bs, uint32_t index) {
> > +    BDRVQcow2State *s = bs->opaque;
> > +    int ret;
> > +
> > +    qemu_co_mutex_lock(&bs->wps->colock);
> > +    uint64_t *wp = &bs->wps->wp[index];
> > +    BlockZoneState zs = qcow2_get_zone_state(bs, index);
> > +
> > +    switch(zs) {
> > +    case BLK_ZS_EMPTY:
> > +        if (!qcow2_can_activate_zone(bs)) {
> > +            ret = -EBUSY;
> > +            goto unlock;
> > +        }
> > +        break;
> > +    case BLK_ZS_IOPEN:
> > +        qcow2_rm_imp_open_zone(s, index);
> > +        trace_qcow2_imp_open_zones(BLK_ZO_FINISH, s->nr_zones_imp_open);
> > +        break;
> > +    case BLK_ZS_EOPEN:
> > +        qcow2_rm_exp_open_zone(s, index);
> > +        break;
> > +    case BLK_ZS_CLOSED:
> > +        if (!qcow2_can_open_zone(bs)) {
> > +            ret = -EINVAL;
> > +            goto unlock;
> > +        }
> > +        qcow2_rm_closed_zone(s, index);
> > +        break;
> > +    case BLK_ZS_FULL:
> > +        ret = 0;
> > +        goto unlock;
> > +    default:
> > +        ret = -EINVAL;
> > +        goto unlock;
> > +    }
> > +
> > +    *wp = ((uint64_t)index + 1) * s->zoned_header.zone_size;
> > +    ret = qcow2_write_wp_at(bs, wp, index);
> > +
> > +unlock:
> > +    qemu_co_mutex_unlock(&bs->wps->colock);
> > +    return ret;
> > +}
> > +
> > +static int coroutine_fn GRAPH_RDLOCK
> > +qcow2_reset_zone(BlockDriverState *bs, uint32_t index,
> > +                            int64_t len) {
> > +    BDRVQcow2State *s = bs->opaque;
> > +    int nrz = bs->bl.nr_zones;
> > +    int zone_size = bs->bl.zone_size;
> > +    int n, ret = 0;
> > +
> > +    qemu_co_mutex_lock(&bs->wps->colock);
> > +    uint64_t *wp = &bs->wps->wp[index];
> > +    if (len == bs->total_sectors << BDRV_SECTOR_BITS) {
> > +        n = nrz;
> > +        index = 0;
> > +    } else {
> > +        n = len / zone_size;
> > +    }
> > +
> > +    for (int i = 0; i < n; ++i) {
> > +        uint64_t *wp_i = (uint64_t *)(wp + i);
> > +        uint64_t wpi_v = *wp_i;
> > +        if (QCOW2_ZT_IS_CONV(wpi_v)) {
> > +            continue;
> > +        }
> > +
> > +        BlockZoneState zs = qcow2_get_zone_state(bs, index + i);
> > +        switch (zs) {
> > +        case BLK_ZS_EMPTY:
> > +            break;
> > +        case BLK_ZS_IOPEN:
> > +            qcow2_rm_imp_open_zone(s, index + i);
> > +            trace_qcow2_imp_open_zones(BLK_ZO_RESET, s->nr_zones_imp_open);
> > +            break;
> > +        case BLK_ZS_EOPEN:
> > +            qcow2_rm_exp_open_zone(s, index + i);
> > +            break;
> > +        case BLK_ZS_CLOSED:
> > +            qcow2_rm_closed_zone(s, index + i);
> > +            break;
> > +        case BLK_ZS_FULL:
> > +            break;
> > +        default:
> > +            ret = -EINVAL;
> > +            goto unlock;
> > +        }
> > +
> > +        if (zs == BLK_ZS_EMPTY) {
> > +            continue;
> > +        }
> > +
> > +        *wp_i = ((uint64_t)index + i) * zone_size;
> > +        ret = qcow2_write_wp_at(bs, wp_i, index + i);
> > +        if (ret < 0) {
> > +            goto unlock;
> > +        }
> > +        /* clear data */
> > +        ret = qcow2_co_pwrite_zeroes(bs, *wp_i, zone_size, 0);
> > +        if (ret < 0) {
> > +            error_report("Failed to reset zone at 0x%" PRIx64 "", *wp_i);
> > +        }
> > +    }
> > +
> > +unlock:
> > +    qemu_co_mutex_unlock(&bs->wps->colock);
> > +    return ret;
> > +}
> > +
> > +static int coroutine_fn GRAPH_RDLOCK
> > +qcow2_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> > +                                           int64_t offset, int64_t len)
> > +{
> > +    BDRVQcow2State *s = bs->opaque;
> > +    int ret = 0;
> > +    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
> > +    int64_t zone_size = s->zoned_header.zone_size;
> > +    int64_t zone_size_mask = zone_size - 1;
> > +    uint32_t index = offset / zone_size;
> > +    BlockZoneWps *wps = bs->wps;
> > +
> > +    if (offset >= capacity) {
> > +        error_report("offset %" PRId64 " is equal to or greater than the"
> > +                     "device capacity %" PRId64 "", offset, capacity);
> > +        return -EINVAL;
> > +    }
> > +
> > +    if (offset & zone_size_mask) {
> > +        error_report("sector offset %" PRId64 " is not aligned to zone size"
> > +                     " %" PRId64 "", offset / 512, zone_size / 512);
> > +        return -EINVAL;
> > +    }
> > +
> > +    if (((offset + len) < capacity && len & zone_size_mask) ||
> > +        offset + len > capacity) {
> > +        error_report("number of sectors %" PRId64 " is not aligned to zone"
> > +                     " size %" PRId64 "", len / 512, zone_size / 512);
> > +        return -EINVAL;
> > +    }
> > +
> > +    qemu_co_mutex_lock(&wps->colock);
> > +    uint64_t wpv = wps->wp[index];
> > +    if (QCOW2_ZT_IS_CONV(wpv) && len != capacity) {
> > +        error_report("zone mgmt operations are not allowed for "
> > +                     "conventional zones");
> > +        ret = -EIO;
> > +        goto unlock;
> > +    }
> > +    qemu_co_mutex_unlock(&wps->colock);
> > +
> > +    switch(op) {
> > +    case BLK_ZO_OPEN:
> > +        ret = qcow2_open_zone(bs, index);
> > +        break;
> > +    case BLK_ZO_CLOSE:
> > +        ret = qcow2_close_zone(bs, index);
> > +        break;
> > +    case BLK_ZO_FINISH:
> > +        ret = qcow2_finish_zone(bs, index);
> > +        break;
> > +    case BLK_ZO_RESET:
> > +        ret = qcow2_reset_zone(bs, index, len);
> > +        break;
> > +    default:
> > +        error_report("Unsupported zone op: 0x%x", op);
> > +        ret = -ENOTSUP;
> > +        break;
> > +    }
> > +    return ret;
> > +
> > +unlock:
> > +    qemu_co_mutex_unlock(&wps->colock);
> > +    return ret;
> > +}
> > +
> > +static int coroutine_fn GRAPH_RDLOCK
> > +qcow2_co_zone_append(BlockDriverState *bs, int64_t *offset, QEMUIOVector *qiov,
> > +                     BdrvRequestFlags flags)
> > +{
> > +    assert(flags == 0);
> > +    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
> > +    uint32_t index;
> > +    int ret;
> > +    int64_t zone_size_mask = bs->bl.zone_size - 1;
> > +    int64_t iov_len = 0;
> > +    int64_t len = 0;
> > +
> > +    if (*offset >= capacity) {
> > +        error_report("*offset %" PRId64 " is equal to or greater than the"
> > +                     "device capacity %" PRId64 "", *offset, capacity);
> > +        return -EINVAL;
> > +    }
> > +
> > +    /* offset + len should not pass the end of that zone starting from offset */
> > +    if (*offset & zone_size_mask) {
> > +        error_report("sector offset %" PRId64 " is not aligned to zone size "
> > +                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
> > +        return -EINVAL;
> > +    }
> > +
> > +    int64_t wg = bs->bl.write_granularity;
> > +    int64_t wg_mask = wg - 1;
> > +    for (int i = 0; i < qiov->niov; i++) {
> > +        iov_len = qiov->iov[i].iov_len;
> > +        if (iov_len & wg_mask) {
> > +            error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
> > +                         "block size %" PRId64 "", i, iov_len, wg);
> > +            return -EINVAL;
> > +        }
> > +    }
> > +    len = qiov->size;
> > +    index = *offset / bs->bl.zone_size;
> > +
> > +    if ((len >> BDRV_SECTOR_BITS) > bs->bl.max_append_sectors) {
> > +        return -ENOTSUP;
> > +    }
> > +
> > +    qemu_co_mutex_lock(&bs->wps->colock);
> > +    uint64_t wp_i = bs->wps->wp[index];
> > +    ret = qcow2_co_pwritev_part(bs, wp_i, len, qiov, 0, 0);
>
> Is it correct to use wp_i directly without masking bits (e.g. bit 63 for
> conventional zones)?
>
> > +    if (ret == 0) {
> > +        *offset = wp_i;
> > +    } else {
> > +        error_report("qcow2: zap failed");
> > +    }
> > +
> > +    qemu_co_mutex_unlock(&bs->wps->colock);
> > +    return ret;
> > +}
> > +
> >  static int coroutine_fn GRAPH_RDLOCK
> >  qcow2_co_copy_range_from(BlockDriverState *bs,
> >                           BdrvChild *src, int64_t src_offset,
> > @@ -6398,6 +7181,10 @@ BlockDriver bdrv_qcow2 = {
> >      .bdrv_co_pwritev_compressed_part    = qcow2_co_pwritev_compressed_part,
> >      .bdrv_make_empty                    = qcow2_make_empty,
> >
> > +    .bdrv_co_zone_report    = qcow2_co_zone_report,
> > +    .bdrv_co_zone_mgmt    = qcow2_co_zone_mgmt,
> > +    .bdrv_co_zone_append    = qcow2_co_zone_append,
> > +
> >      .bdrv_snapshot_create               = qcow2_snapshot_create,
> >      .bdrv_snapshot_goto                 = qcow2_snapshot_goto,
> >      .bdrv_snapshot_delete               = qcow2_snapshot_delete,
> > diff --git a/block/trace-events b/block/trace-events
> > index 8e789e1f12..e35222e079 100644
> > --- a/block/trace-events
> > +++ b/block/trace-events
> > @@ -82,6 +82,8 @@ qcow2_writev_data(void *co, uint64_t offset) "co %p offset 0x%" PRIx64
> >  qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int64_t bytes) "co %p offset 0x%" PRIx64 " bytes %" PRId64
> >  qcow2_pwrite_zeroes(void *co, int64_t offset, int64_t bytes) "co %p offset 0x%" PRIx64 " bytes %" PRId64
> >  qcow2_skip_cow(void *co, uint64_t offset, int nb_clusters) "co %p offset 0x%" PRIx64 " nb_clusters %d"
> > +qcow2_wp_tracking(int index, uint64_t wp) "wps[%d]: 0x%" PRIx64
> > +qcow2_imp_open_zones(uint8_t op, int nrz) "nr_imp_open_zones after op 0x%x: %d"
> >
> >  # qcow2-cluster.c
> >  qcow2_alloc_clusters_offset(void *co, uint64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d"
> > diff --git a/include/qemu/queue.h b/include/qemu/queue.h
> > index e029e7bf66..3f0a48740e 100644
> > --- a/include/qemu/queue.h
> > +++ b/include/qemu/queue.h
> > @@ -179,6 +179,7 @@ struct {                                                                \
> >  #define QLIST_EMPTY(head)                ((head)->lh_first == NULL)
> >  #define QLIST_FIRST(head)                ((head)->lh_first)
> >  #define QLIST_NEXT(elm, field)           ((elm)->field.le_next)
> > +#define QLIST_LAST(head, field)          (*(head)->lh_first->field.le_prev)
>
> This assumes QLIST is a circular linked list, but it is not circular:
>
>   #define QLIST_INSERT_HEAD(head, elm, field) do {                        \
>           if (((elm)->field.le_next = (head)->lh_first) != NULL)          \
>                   (head)->lh_first->field.le_prev = &(elm)->field.le_next;\
>           (head)->lh_first = (elm);                                       \
>           (elm)->field.le_prev = &(head)->lh_first;                       \
>           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>   } while (/*CONSTCOND*/0)
>
> head->lh_first->field.le_prev points to head's list head lh_first field,
> not to the last node.
>
> Take a look at QTAILQ or QSIMPLEQ if you need a LAST() API.

Ok, I see. Thanks!

Sam
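
For reference, a minimal sketch of the LRU list on top of QTAILQ, whose
QTAILQ_LAST() is well defined (illustrative only, not part of this
patch):

    typedef struct Qcow2ZoneListEntry {
        QTAILQ_ENTRY(Qcow2ZoneListEntry) imp_open_zone_entry;
    } Qcow2ZoneListEntry;

    QTAILQ_HEAD(, Qcow2ZoneListEntry) imp_open_zones =
        QTAILQ_HEAD_INITIALIZER(imp_open_zones);

    /* zone_entry: the zone just written; the MRU zone goes to the head */
    QTAILQ_INSERT_HEAD(&imp_open_zones, zone_entry, imp_open_zone_entry);

    /* ... so the LRU victim to implicitly close is simply the tail */
    Qcow2ZoneListEntry *victim = QTAILQ_LAST(&imp_open_zones);
    QTAILQ_REMOVE(&imp_open_zones, victim, imp_open_zone_entry);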
Damien Le Moal Sept. 23, 2024, 1:22 p.m. UTC | #3
On 2024/09/23 13:06, Sam Li wrote:

[...]

>>> @@ -2837,6 +3180,19 @@ qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
>>>          qiov_offset += cur_bytes;
>>>          trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
>>>      }
>>> +
>>> +    if (bs->bl.zoned == BLK_Z_HM) {
>>> +        index = start_offset / zone_size;
>>> +        wp = &bs->wps->wp[index];
>>> +        if (!QCOW2_ZT_IS_CONV(*wp)) {
>>> +            /* Advance the write pointer when the write completes */
>>
>> Updating the write pointer after I/O does not prevent other write
>> requests from beginning at the same offset as this request. Multiple
>> write request coroutines can run concurrently and only the first one
>> should succeed. The others should fail if they are using the same
>> offset.
>>
>> The comment above says "Real drives change states before it can write to
>> the zone" and I think it's appropriate to update the write pointer
>> before performing the write too. The qcow2 zone emulation code is
>> different from the file-posix.c passthrough code. We are responsible for
>> maintaining zoned metadata state and cannot wait for the result of the
>> I/O to tell us what happened.

Yes, correct. The wp MUST be updated when issuing the IO, with the assumption
that the write IO will succeed (errors are rare !).

> The problem of updating the write pointer before IO completion is the
> failure case.  It can't be predicted in advance if an IO fails or not.
> When write I/O fails, the wp should not be updated.

Correct, if an IO fails, the wp should not be updated. However, that is not
difficult to deal with:
1) under the zone lock, advance the wp position when issuing the write IO
2) When the write IO completes with success, nothing else needs to be done.
3) When *any* write IO completes with error you need to:
	- Lock the zone
	- Do a report zone for the target zone of the failed write to get the current
wp location
	- Update bs->wps->wp[index] using that current wp location
	- Unlock the zone

With that, one may get a few errors if multiple async writes are being issued,
but that behavior is consistent with the same happening with a real drive. So no
issue. And since the report zones gets you the current wp location, the user can
restart writing from that location once it has dealt with all the previous write
failures.
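
For the qcow2 emulation, step 3 could translate into something like the
sketch below (a hypothetical helper, not part of this patch; it assumes
the persisted zonedmeta entry still holds the pre-failure value, since
qcow2_write_wp_at() is only called after a successful write):

    static int coroutine_fn GRAPH_RDLOCK
    qcow2_resync_wp(BlockDriverState *bs, uint32_t index)
    {
        BDRVQcow2State *s = bs->opaque;
        uint64_t wp;
        int ret;

        qemu_co_mutex_lock(&bs->wps->colock);
        ret = bdrv_pread(bs->file,
                         s->zoned_header.zonedmeta_offset +
                         sizeof(uint64_t) * index,
                         sizeof(uint64_t), &wp, 0);
        if (ret >= 0) {
            /* roll the in-memory wp back to the on-disk value */
            bs->wps->wp[index] = wp;
        }
        qemu_co_mutex_unlock(&bs->wps->colock);
        return ret;
    }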

> The alternative way is to hold the wps lock as is also required for wp
> accessing. Therefore only one of multiple concurrent write requests
> will succeed.

That is a very simple solution that avoids the above error recovery, but that
would be very bad for performance (especially for a pure sequential write
workload as we would limit IOs to queue depth 1). So if we can avoid this simple
approach, that would be a lot better.
Sam Li Sept. 23, 2024, 1:40 p.m. UTC | #4
Hi Damien,

Damien Le Moal <dlemoal@kernel.org> wrote on Mon, Sep 23, 2024 at 15:22:
>
> On 2024/09/23 13:06, Sam Li wrote:
>
> [...]
>
> >>> @@ -2837,6 +3180,19 @@ qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
> >>>          qiov_offset += cur_bytes;
> >>>          trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
> >>>      }
> >>> +
> >>> +    if (bs->bl.zoned == BLK_Z_HM) {
> >>> +        index = start_offset / zone_size;
> >>> +        wp = &bs->wps->wp[index];
> >>> +        if (!QCOW2_ZT_IS_CONV(*wp)) {
> >>> +            /* Advance the write pointer when the write completes */
> >>
> >> Updating the write pointer after I/O does not prevent other write
> >> requests from beginning at the same offset as this request. Multiple
> >> write request coroutines can run concurrently and only the first one
> >> should succeed. The others should fail if they are using the same
> >> offset.
> >>
> >> The comment above says "Real drives change states before it can write to
> >> the zone" and I think it's appropriate to update the write pointer
> >> before performing the write too. The qcow2 zone emulation code is
> >> different from the file-posix.c passthrough code. We are responsible for
> >> maintaining zoned metadata state and cannot wait for the result of the
> >> I/O to tell us what happened.
>
> Yes, correct. The wp MUST be updated when issuing the IO, with the assumption
> that the write IO will succeed (errors are rare !).
>
> > The problem of updating the write pointer before IO completion is the
> > failure case.  It can't be predicted in advance if an IO fails or not.
> > When write I/O fails, the wp should not be updated.
>
> Correct, if an IO fails, the wp should not be updated. However, that is not
> difficult to deal with:
> 1) under the zone lock, advance the wp position when issuing the write IO
> 2) When the write IO completes with success, nothing else needs to be done.
> 3) When *any* write IO completes with error you need to:
>         - Lock the zone
>         - Do a report zone for the target zone of the failed write to get the current
> wp location
>         - Update bs->wps->wp[index] using that current wp location
>         - Unlock the zone
>
> With that, one may get a few errors if multiple async writes are being issued,
> but that behavior is consistent with the same happening with a real drive. So no
> issue. And since the report zones gets you the current wp location, the user can
> restart writing from that location once it has dealt with all the previous write
> failures.

I see. To allow the concurrent writes, the lock will only be used on
the failure path while processing append writes.

>
> > The alternative way is to hold the wps lock as is also required for wp
> > accessing. Therefore only one of multiple concurrent write requests
> > will succeed.
>
> That is a very simple solution that avoids the above error recovery, but that
> would be very bad for performance (especially for a pure sequential write
> workload as we would limit IOs to queue depth 1). So if we can avoid this simple
> approach, that would be a lot better.

Yeah, I'll drop this approach. That said, it reminds me of how the
file-posix driver emulates zone_append. It holds the lock whenever
accessing wps. Does that limit IOs to QD 1 too? If so, it can be
improved.
-- one zone_append starts
>> wp_lock()
>>> IO processing
>>>> wp_update
>>>>> wp_unlock()
-- ends

https://github.com/qemu/qemu/blob/master/block/file-posix.c#L2492

Sam

>
>
> --
> Damien Le Moal
> Western Digital Research
Damien Le Moal Sept. 23, 2024, 1:48 p.m. UTC | #5
On 2024/09/23 15:40, Sam Li wrote:
> Hi Damien,
> 
> Damien Le Moal <dlemoal@kernel.org> wrote on Mon, Sep 23, 2024 at 15:22:
>>
>> On 2024/09/23 13:06, Sam Li wrote:
>>
>> [...]
>>
>>>>> @@ -2837,6 +3180,19 @@ qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
>>>>>          qiov_offset += cur_bytes;
>>>>>          trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
>>>>>      }
>>>>> +
>>>>> +    if (bs->bl.zoned == BLK_Z_HM) {
>>>>> +        index = start_offset / zone_size;
>>>>> +        wp = &bs->wps->wp[index];
>>>>> +        if (!QCOW2_ZT_IS_CONV(*wp)) {
>>>>> +            /* Advance the write pointer when the write completes */
>>>>
>>>> Updating the write pointer after I/O does not prevent other write
>>>> requests from beginning at the same offset as this request. Multiple
>>>> write request coroutines can run concurrently and only the first one
>>>> should succeed. The others should fail if they are using the same
>>>> offset.
>>>>
>>>> The comment above says "Real drives change states before it can write to
>>>> the zone" and I think it's appropriate to update the write pointer
>>>> before performing the write too. The qcow2 zone emulation code is
>>>> different from the file-posix.c passthrough code. We are responsible for
>>>> maintaining zoned metadata state and cannot wait for the result of the
>>>> I/O to tell us what happened.
>>
>> Yes, correct. The wp MUST be updated when issuing the IO, with the assumption
>> that the write IO will succeed (errors are rare !).
>>
>>> The problem of updating the write pointer before IO completion is the
>>> failure case.  It can't be predicted in advance if an IO fails or not.
>>> When write I/O fails, the wp should not be updated.
>>
>> Correct, if an IO fails, the wp should not be updated. However, that is not
>> difficult to deal with:
>> 1) under the zone lock, advance the wp position when issuing the write IO
>> 2) When the write IO completes with success, nothing else needs to be done.
>> 3) When *any* write IO completes with error you need to:
>>         - Lock the zone
>>         - Do a report zone for the target zone of the failed write to get the current
>> wp location
>>         - Update bs->wps->wp[index] using that current wp location
>>         - Unlock the zone
>>
>> With that, one may get a few errors if multiple async writes are being issued,
>> but that behavior is consistent with the same happening with a real drive. So no
>> issue. And since the report zones gets you the current wp location, the user can
>> restart writing from that location once it has dealt with all the previous write
>> failures.
> 
> I see. To allow the concurrent writes, the lock will only be used on
> the failure path while processing append writes.
> 
>>
>>> The alternative way is to hold the wps lock as is also required for wp
>>> accessing. Therefore only one of multiple concurrent write requests
>>> will succeed.
>>
>> That is a very simple solution that avoids the above error recovery, but that
>> would be very bad for performance (especially for a pure sequential write
>> workload as we would limit IOs to queue depth 1). So if we can avoid this simple
>> approach, that would be a lot better.
> 
> Yeah, I'll drop this approach. That said, it reminds me of how the
> file-posix driver emulates zone_append. It holds the lock whenever
> accessing wps. Does that limit IOs to QD 1 too? If so, it can be
> improved.
> -- one zone_append starts
>>> wp_lock()
>>>> IO processing
>>>>> wp_update
>>>>>> wp_unlock()
> -- ends

Yes, this limits the IO queue depth to 1, so not great. file-posix can use the
exact same error recovery mechanism as I explained. The write IO flow would thus
become:

On submission:

1) wp_lock()
2) Check write alignment with wp (or change zone append into regular
3) Issue write as an asynchronous IO
4) wp_update
5) wp_unlock()

And on completion, all you need is:
1) If no error, return success
2) wp_lock()
3) Do a report zone and use the reported wp value as the current wp
4) wp_unlock()
5) return IO error

With this simple scheme, when there are no IO errors, things are fast and there
is no "big lock" limiting the number of writes that can be issued. Only write
error recovery ends up locking everything during the report zones, but that is
fine. Errors are rare :)
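
In code, that flow might look roughly like this (a sketch with
hypothetical helper names, not the current file-posix code):

    /* submission path */
    qemu_co_mutex_lock(&wps->colock);
    if (is_zone_append) {
        offset = wps->wp[index];         /* append lands at the wp */
    } else if (offset != wps->wp[index]) {
        qemu_co_mutex_unlock(&wps->colock);
        return -EINVAL;                  /* unaligned regular write */
    }
    submit_write_async(s, offset, bytes, qiov);  /* assumed helper */
    wps->wp[index] += bytes;             /* advance optimistically */
    qemu_co_mutex_unlock(&wps->colock);

    /* completion path, error case only */
    if (ret < 0) {
        qemu_co_mutex_lock(&wps->colock);
        /* assumed helper: re-read the real wp via report zones */
        update_wp_from_report_zones(s, index);
        qemu_co_mutex_unlock(&wps->colock);
    }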
Sam Li Sept. 23, 2024, 1:57 p.m. UTC | #6
On Mon, Sep 23, 2024 at 15:48 Damien Le Moal <dlemoal@kernel.org> wrote:

> On 2024/09/23 15:40, Sam Li wrote:
> > Hi Damien,
> >
> Damien Le Moal <dlemoal@kernel.org> wrote on Mon, Sep 23, 2024 at 15:22:
> >>
> >> On 2024/09/23 13:06, Sam Li wrote:
> >>
> >> [...]
> >>
> >>>>> @@ -2837,6 +3180,19 @@ qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
> >>>>>          qiov_offset += cur_bytes;
> >>>>>          trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
> >>>>>      }
> >>>>> +
> >>>>> +    if (bs->bl.zoned == BLK_Z_HM) {
> >>>>> +        index = start_offset / zone_size;
> >>>>> +        wp = &bs->wps->wp[index];
> >>>>> +        if (!QCOW2_ZT_IS_CONV(*wp)) {
> >>>>> +            /* Advance the write pointer when the write completes */
> >>>>
> >>>> Updating the write pointer after I/O does not prevent other write
> >>>> requests from beginning at the same offset as this request. Multiple
> >>>> write request coroutines can run concurrently and only the first one
> >>>> should succeed. The others should fail if they are using the same
> >>>> offset.
> >>>>
> >>>> The comment above says "Real drives change states before it can
> >>>> write to the zone" and I think it's appropriate to update the
> >>>> write pointer before performing the write too. The qcow2 zone
> >>>> emulation code is different from the file-posix.c passthrough
> >>>> code. We are responsible for maintaining zoned metadata state and
> >>>> cannot wait for the result of the I/O to tell us what happened.
> >>
> >> Yes, correct. The wp MUST be updated when issuing the IO, with the
> >> assumption that the write IO will succeed (errors are rare !).
> >>
> >>> The problem of updating the write pointer before IO completion is the
> >>> failure case.  It can't be predicted in advance if an IO fails or not.
> >>> When write I/O fails, the wp should not be updated.
> >>
> >> Correct, if an IO fails, the wp should not be updated. However,
> >> that is not difficult to deal with:
> >> 1) under the zone lock, advance the wp position when issuing the
> >> write IO
> >> 2) When the write IO completes with success, nothing else needs to
> >> be done.
> >> 3) When *any* write IO completes with error you need to:
> >>         - Lock the zone
> >>         - Do a report zone for the target zone of the failed write
> >> to get the current wp location
> >>         - Update bs->wps->wp[index] using that current wp location
> >>         - Unlock the zone
> >>
> >> With that, one may get a few errors if multiple async writes are
> >> being issued, but that behavior is consistent with the same
> >> happening with a real drive. So no issue. And since the report zones
> >> gets you the current wp location, the user can restart writing from
> >> that location once it has dealt with all the previous write
> >> failures.
> >
> > I see. To allow the concurrent writes, the lock will only be used on
> > the failure path while processing append writes.
> >
> >>
> >>> The alternative way is to hold the wps lock as is also required for wp
> >>> accessing. Therefore only one of multiple concurrent write requests
> >>> will succeed.
> >>
> >> That is a very simple solution that avoids the above error
> >> recovery, but that would be very bad for performance (especially
> >> for a pure sequential write workload as we would limit IOs to queue
> >> depth 1). So if we can avoid this simple approach, that would be a
> >> lot better.
> >
> > Yeah, I'll drop this approach. That said, it reminds me of how the
> > file-posix driver emulates zone_append. It holds the lock whenever
> > accessing wps. Does that limit IOs to QD 1 too? If so, it can be
> > improved.
> > -- one zone_append starts
> >>> wp_lock()
> >>>> IO processing
> >>>>> wp_update
> >>>>>> wp_unlock()
> > -- ends
>
> Yes, this limits the IO queue depth to 1, so not great. file-posix can
> use the exact same error recovery mechanism as I explained. The write
> IO flow would thus become:
>
> On submission:
>
> 1) wp_lock()
> 2) Check write alignment with wp (or change zone append into regular
> write)
> 3) Issue write as an asynchronous IO
> 4) wp_update
> 5) wp_unlock()
>
> And on completion, all you need is:
> 1) If no error, return success
> 2) wp_lock()
> 3) Do a report zone and use the reported wp value as the current wp
> 4) wp_unlock()
> 5) return IO error
>
> With this simple scheme, when there are no IO errors, things are fast
> and there is no "big lock" limiting the number of writes that can be
> issued. Only write error recovery ends up locking everything during
> the report zones, but that is fine. Errors are rare :)


Neat! I’ll send a separate patch for that.

Additionally, I was thinking about changing the zone size/capacity
fields to 64 bit in the block layer and the qcow2 config, as Stefan
once suggested; that will be in the next version of this patch series.
A 32-bit byte-granularity field caps the zone size at 4 GiB (2^32
bytes), and that limit may eventually be hit, so better to change it
beforehand.

Sam


>
>
> --
> Damien Le Moal
> Western Digital Research
>
diff mbox series

Patch

diff --git a/block/qcow2.c b/block/qcow2.c
index b987f1e751..db28585b82 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -195,6 +195,274 @@  qcow2_extract_crypto_opts(QemuOpts *opts, const char *fmt, Error **errp)
     return cryptoopts_qdict;
 }
 
+#define QCOW2_ZT_IS_CONV(wp)    (wp & 1ULL << 59)
+
+/*
+ * To emulate a real zoned device, closed, empty and full states are
+ * preserved after a power cycle. The open states are in-memory and will
+ * be lost after closing the device. Read-only and offline states are
+ * device-internal events, which are not considered for simplicity.
+ */
+static inline BlockZoneState qcow2_get_zone_state(BlockDriverState *bs,
+                                                  uint32_t index)
+{
+    BDRVQcow2State *s = bs->opaque;
+    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
+    uint64_t zone_wp = bs->wps->wp[index];
+    uint64_t zone_start;
+
+    if (QCOW2_ZT_IS_CONV(zone_wp)) {
+        return BLK_ZS_NOT_WP;
+    }
+
+    if (QLIST_IS_INSERTED(zone_entry, exp_open_zone_entry)) {
+        return BLK_ZS_EOPEN;
+    }
+    if (QLIST_IS_INSERTED(zone_entry, imp_open_zone_entry)) {
+        return BLK_ZS_IOPEN;
+    }
+
+    zone_start = index * bs->bl.zone_size;
+    if (zone_wp == zone_start) {
+        return BLK_ZS_EMPTY;
+    }
+    if (zone_wp >= zone_start + bs->bl.zone_capacity) {
+        return BLK_ZS_FULL;
+    }
+    if (zone_wp > zone_start) {
+        if (!QLIST_IS_INSERTED(zone_entry, closed_zone_entry)) {
+            /*
+             * The number of closed zones is not always updated in time when
+             * the device is closed. However, it only matters when doing
+             * zone report. Refresh the count and list of closed zones to
+             * provide correct zone states for zone report.
+             */
+            QLIST_INSERT_HEAD(&s->closed_zones, zone_entry, closed_zone_entry);
+            s->nr_zones_closed++;
+        }
+        return BLK_ZS_CLOSED;
+    }
+    return BLK_ZS_NOT_WP;
+}
+
+static void qcow2_rm_exp_open_zone(BDRVQcow2State *s,
+                                   uint32_t index)
+{
+    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
+
+    QLIST_REMOVE(zone_entry, exp_open_zone_entry);
+    s->nr_zones_exp_open--;
+}
+
+static void qcow2_rm_imp_open_zone(BDRVQcow2State *s,
+                                   int32_t index)
+{
+    Qcow2ZoneListEntry *zone_entry;
+    if (index < 0) {
+        /* Apply LRU when the index is not specified. */
+        zone_entry = QLIST_LAST(&s->imp_open_zones, imp_open_zone_entry);
+    } else {
+        zone_entry = &s->zone_list_entries[index];
+    }
+
+    QLIST_REMOVE(zone_entry, imp_open_zone_entry);
+    s->nr_zones_imp_open--;
+}
+
+static void qcow2_rm_open_zone(BDRVQcow2State *s,
+                               uint32_t index)
+{
+    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
+
+    if (QLIST_IS_INSERTED(zone_entry, exp_open_zone_entry)) {
+        qcow2_rm_exp_open_zone(s, index);
+    } else if (QLIST_IS_INSERTED(zone_entry, imp_open_zone_entry)) {
+        qcow2_rm_imp_open_zone(s, index);
+    }
+}
+
+static void qcow2_rm_closed_zone(BDRVQcow2State *s,
+                                 uint32_t index)
+{
+    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
+
+    QLIST_REMOVE(zone_entry, closed_zone_entry);
+    s->nr_zones_closed--;
+}
+
+static void qcow2_do_imp_open_zone(BDRVQcow2State *s,
+                                   uint32_t index,
+                                   BlockZoneState zs)
+{
+    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
+
+    switch (zs) {
+    case BLK_ZS_EMPTY:
+        break;
+    case BLK_ZS_CLOSED:
+        s->nr_zones_closed--;
+        break;
+    case BLK_ZS_IOPEN:
+        /*
+         * The LRU policy: update the zone that is most recently
+         * used to the head of the zone list
+         */
+        if (zone_entry == QLIST_FIRST(&s->imp_open_zones)) {
+            return;
+        }
+        QLIST_REMOVE(zone_entry, imp_open_zone_entry);
+        s->nr_zones_imp_open--;
+        break;
+    default:
+        return;
+    }
+
+    QLIST_INSERT_HEAD(&s->imp_open_zones, zone_entry, imp_open_zone_entry);
+    s->nr_zones_imp_open++;
+}
+
+static void qcow2_do_exp_open_zone(BDRVQcow2State *s,
+                                   uint32_t index)
+{
+    Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
+
+    QLIST_INSERT_HEAD(&s->exp_open_zones, zone_entry, exp_open_zone_entry);
+    s->nr_zones_exp_open++;
+}
+
+/*
+ * The list of zones is managed using an LRU policy: the last
+ * zone of the list is always the one that was least recently used
+ * for writing and is chosen as the zone to close to be able to
+ * implicitly open another zone.
+ *
+ * We can only close the open zones. The index is not specified
+ * when it is less than 0.
+ */
+static void qcow2_do_close_zone(BlockDriverState *bs,
+                                int32_t index,
+                                BlockZoneState zs)
+{
+    BDRVQcow2State *s = bs->opaque;
+    Qcow2ZoneListEntry *zone_entry;
+
+    if (index < 0 && zs == BLK_ZS_IOPEN) {
+        /* before removal of the last implicitly open zone */
+        zone_entry = QLIST_LAST(&s->imp_open_zones, imp_open_zone_entry);
+        qcow2_rm_imp_open_zone(s, index);
+        goto close_zone;
+    }
+
+    if (index >= 0 && zs == BLK_ZS_EOPEN) {
+        zone_entry = &s->zone_list_entries[index];
+        qcow2_rm_exp_open_zone(s, index);
+        /*
+         * The zone state changes when the zone is removed from the list of
+         * open zones (explicitly open -> empty)
+         */
+        qcow2_get_zone_state(bs, index);
+    }
+    return;
+
+close_zone:
+    QLIST_INSERT_HEAD(&s->closed_zones, zone_entry, closed_zone_entry);
+    s->nr_zones_closed++;
+}
+
+/*
+ * Write the new wp value to the dedicated location of the image file.
+ */
+static int coroutine_fn GRAPH_RDLOCK
+qcow2_write_wp_at(BlockDriverState *bs, uint64_t *wp,
+                             int32_t index) {
+    BDRVQcow2State *s = bs->opaque;
+    uint64_t wpv = *wp;
+    int ret;
+
+    ret = bdrv_pwrite(bs->file, s->zoned_header.zonedmeta_offset
+        + sizeof(uint64_t) * index, sizeof(uint64_t), wp, 0);
+    if (ret < 0) {
+        goto exit;
+    }
+    trace_qcow2_wp_tracking(index, *wp >> BDRV_SECTOR_BITS);
+    return ret;
+
+exit:
+    *wp = wpv;
+    error_report("Failed to write metadata with file");
+    return ret;
+}
+
+static bool qcow2_can_activate_zone(BlockDriverState *bs)
+{
+    BDRVQcow2State *s = bs->opaque;
+
+    /* When the max active zone is zero, there is no limit on active zones */
+    if (!s->zoned_header.max_active_zones) {
+        return true;
+    }
+
+    /* Active zones are zones that are open or closed */
+    return s->nr_zones_exp_open + s->nr_zones_imp_open + s->nr_zones_closed
+        < s->zoned_header.max_active_zones;
+}
+
+/*
+ * This function manages open zones under active zones limit. It checks
+ * if a zone can transition to open state while maintaining max open and
+ * active zone limits.
+ */
+static bool qcow2_can_open_zone(BlockDriverState *bs)
+{
+    BDRVQcow2State *s = bs->opaque;
+
+    /* When the max open zone is zero, there is no limit on open zones */
+    if (!s->zoned_header.max_open_zones) {
+        return true;
+    }
+
+    /*
+     * The open zones are zones with the states of explicitly and
+     * implicitly open.
+     */
+    if (s->nr_zones_imp_open + s->nr_zones_exp_open <
+        s->zoned_header.max_open_zones) {
+        return true;
+    }
+
+    /*
+     * Zones are managed one at a time. Thus, the number of implicitly open
+     * zone can never be over the open zone limit. When the active zone limit
+     * is not reached, close only one implicitly open zone.
+     */
+    if (qcow2_can_activate_zone(bs)) {
+        qcow2_do_close_zone(bs, -1, BLK_ZS_IOPEN);
+        trace_qcow2_imp_open_zones(0x23, s->nr_zones_imp_open);
+        return true;
+    }
+    return false;
+}
+
+static inline int coroutine_fn GRAPH_RDLOCK
+qcow2_refresh_zonedmeta(BlockDriverState *bs)
+{
+    int ret;
+    BDRVQcow2State *s = bs->opaque;
+    uint64_t wps_size = s->zoned_header.zonedmeta_size;
+    g_autofree uint64_t *temp;
+
+    temp = g_new(uint64_t, wps_size);
+    ret = bdrv_pread(bs->file, s->zoned_header.zonedmeta_offset,
+                     wps_size, temp, 0);
+    if (ret < 0) {
+        error_report("Can not read metadata");
+        return ret;
+    }
+
+    memcpy(bs->wps->wp, temp, wps_size);
+    return 0;
+}
+
 /*
  * Passing by the zoned device configurations by a zoned_header struct, check
  * if the zone device options are under constraints. Return false when some
@@ -521,7 +789,23 @@  qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
                 be32_to_cpu(zoned_ext.max_active_zones);
             zoned_ext.max_append_bytes =
                 be32_to_cpu(zoned_ext.max_append_bytes);
+            zoned_ext.zonedmeta_offset =
+                be64_to_cpu(zoned_ext.zonedmeta_offset);
+            zoned_ext.zonedmeta_size = be64_to_cpu(zoned_ext.zonedmeta_size);
             s->zoned_header = zoned_ext;
+            bs->wps = g_malloc(sizeof(BlockZoneWps)
+                + s->zoned_header.zonedmeta_size);
+            ret = qcow2_refresh_zonedmeta(bs);
+            if (ret < 0) {
+                return ret;
+            }
+
+            s->zone_list_entries = g_new0(Qcow2ZoneListEntry,
+                                          zoned_ext.nr_zones);
+            QLIST_INIT(&s->exp_open_zones);
+            QLIST_INIT(&s->imp_open_zones);
+            QLIST_INIT(&s->closed_zones);
+            qemu_co_mutex_init(&bs->wps->colock);
 
             /* refuse to open broken images */
             if (zoned_ext.nr_zones != DIV_ROUND_UP(bs->total_sectors *
@@ -2789,9 +3073,68 @@  qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
     uint64_t host_offset;
     QCowL2Meta *l2meta = NULL;
     AioTaskPool *aio = NULL;
+    int64_t start_offset, start_bytes;
+    BlockZoneState zs;
+    int64_t end_zone, end_offset;
+    uint64_t *wp;
+    int64_t zone_size = bs->bl.zone_size;
+    int64_t zone_capacity = bs->bl.zone_capacity;
+    int index;
 
     trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);
 
+    start_offset = offset;
+    start_bytes = bytes;
+    end_offset = start_offset + start_bytes;
+    if (bs->bl.zoned == BLK_Z_HM) {
+        index = start_offset / zone_size;
+        wp = &bs->wps->wp[index];
+        if (!QCOW2_ZT_IS_CONV(*wp)) {
+            /* The write offset must be equal to the zone write pointer */
+            if (offset != *wp) {
+                return -EINVAL;
+            }
+
+            /* Only allow writes when there are zone resources left */
+            zs = qcow2_get_zone_state(bs, index);
+            if (zs == BLK_ZS_CLOSED || zs == BLK_ZS_EMPTY) {
+                if (!qcow2_can_open_zone(bs)) {
+                    return -EINVAL;
+                }
+            }
+
+            /*
+             * Align up (start_offset, zone_size), the start offset is not
+             * necessarily power of two.
+             */
+            end_zone = index * zone_size + zone_capacity;
+            /* Write cannot exceed the zone capacity. */
+            if (end_offset > end_zone) {
+                return -EINVAL;
+            }
+
+            /*
+             * Real drives change states before it can write to the zone. If
+             * the write fails, the zone state may have changed.
+             *
+             * The zone state transitions to implicit open when the original
+             * state is empty or closed. When the wp reaches the end, the
+             * open states (explicit open, implicit open) become full.
+             */
+            zs = qcow2_get_zone_state(bs, index);
+            if (!(end_offset & (zone_capacity - 1))) {
+                /* Being aligned to zone capacity implies full state */
+                qcow2_rm_open_zone(s, index);
+                trace_qcow2_imp_open_zones(0x24,
+                                           s->nr_zones_imp_open);
+            } else {
+                qcow2_do_imp_open_zone(s, index, zs);
+                trace_qcow2_imp_open_zones(0x24,
+                                           s->nr_zones_imp_open);
+            }
+        }
+    }
+
     while (bytes != 0 && aio_task_pool_status(aio) == 0) {
 
         l2meta = NULL;
@@ -2837,6 +3180,19 @@  qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
         qiov_offset += cur_bytes;
         trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
     }
+
+    if (bs->bl.zoned == BLK_Z_HM) {
+        index = start_offset / zone_size;
+        wp = &bs->wps->wp[index];
+        if (!QCOW2_ZT_IS_CONV(*wp)) {
+            /* Advance the write pointer when the write completes */
+            *wp = end_offset;
+            ret = qcow2_write_wp_at(bs, wp, index);
+            if (ret < 0) {
+                goto fail_nometa;
+            }
+        }
+    }
     ret = 0;
 
     qemu_co_mutex_lock(&s->lock);
@@ -2895,6 +3251,25 @@  static int GRAPH_RDLOCK qcow2_inactivate(BlockDriverState *bs)
     return result;
 }
 
+static void qcow2_do_close_all_zone(BDRVQcow2State *s)
+{
+    Qcow2ZoneListEntry *zone_entry, *next;
+
+    QLIST_FOREACH_SAFE(zone_entry, &s->imp_open_zones, imp_open_zone_entry,
+                       next) {
+        QLIST_REMOVE(zone_entry, imp_open_zone_entry);
+        s->nr_zones_imp_open--;
+    }
+
+    QLIST_FOREACH_SAFE(zone_entry, &s->exp_open_zones, exp_open_zone_entry,
+                       next) {
+        QLIST_REMOVE(zone_entry, exp_open_zone_entry);
+        s->nr_zones_exp_open--;
+    }
+
+    assert(s->nr_zones_imp_open + s->nr_zones_exp_open == 0);
+}
+
 static void coroutine_mixed_fn GRAPH_RDLOCK
 qcow2_do_close(BlockDriverState *bs, bool close_data_file)
 {
@@ -2934,6 +3309,8 @@  qcow2_do_close(BlockDriverState *bs, bool close_data_file)
 
     qcow2_refcount_close(bs);
     qcow2_free_snapshots(bs);
+    qcow2_do_close_all_zone(s);
+    g_free(bs->wps);
 }
 
 static void GRAPH_UNLOCKED qcow2_close(BlockDriverState *bs)
@@ -3248,7 +3625,10 @@  int qcow2_update_header(BlockDriverState *bs)
             .max_active_zones   =
                 cpu_to_be32(s->zoned_header.max_active_zones),
             .max_append_bytes =
-                cpu_to_be32(s->zoned_header.max_append_bytes)
+                cpu_to_be32(s->zoned_header.max_append_bytes),
+            .zonedmeta_offset   =
+                cpu_to_be64(s->zoned_header.zonedmeta_offset),
+            .zonedmeta_size     = cpu_to_be64(s->zoned_header.zonedmeta_size),
         };
         ret = header_ext_add(buf, QCOW2_EXT_MAGIC_ZONED_FORMAT,
                              &zoned_header, sizeof(zoned_header),
@@ -3656,7 +4036,8 @@  qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
     int version;
     int refcount_order;
     uint64_t *refcount_table;
-    int ret;
+    uint64_t zoned_meta_size, zoned_clusterlen;
+    int ret, offset, i;
     uint8_t compression_type = QCOW2_COMPRESSION_TYPE_ZLIB;
 
     assert(create_options->driver == BLOCKDEV_DRIVER_QCOW2);
@@ -3997,6 +4378,46 @@  qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
             ret = -EINVAL;
             goto unlock;
         }
+
+        uint32_t nrz = s->zoned_header.nr_zones;
+        zoned_meta_size =  sizeof(uint64_t) * nrz;
+        g_autofree uint64_t *meta = NULL;
+        meta = g_new0(uint64_t, nrz);
+
+        for (i = 0; i < s->zoned_header.conventional_zones; ++i) {
+            meta[i] = i * s->zoned_header.zone_size;
+            meta[i] |= 1ULL << 59;
+        }
+
+        for (; i < nrz; ++i) {
+            meta[i] = i * s->zoned_header.zone_size;
+        }
+
+        offset = qcow2_alloc_clusters(blk_bs(blk), zoned_meta_size);
+        if (offset < 0) {
+            error_setg_errno(errp, -offset, "Could not allocate clusters "
+                                            "for zoned metadata size");
+            goto unlock;
+        }
+        s->zoned_header.zonedmeta_offset = offset;
+        s->zoned_header.zonedmeta_size = zoned_meta_size;
+
+        zoned_clusterlen = size_to_clusters(s, zoned_meta_size)
+                * s->cluster_size;
+        assert(qcow2_pre_write_overlap_check(bs, 0, offset,
+                                             zoned_clusterlen,false) == 0);
+        ret = bdrv_pwrite_zeroes(blk_bs(blk)->file, offset,
+                                 zoned_clusterlen, 0);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Could not zero fill zoned metadata");
+            goto unlock;
+        }
+        ret = bdrv_pwrite(blk_bs(blk)->file, offset, zoned_meta_size, meta, 0);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Could not write zoned metadata "
+                                         "to disk");
+            goto unlock;
+        }
     } else {
         s->zoned_header.zoned = BLK_Z_NONE;
     }
@@ -4339,6 +4760,368 @@  qcow2_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
     return ret;
 }
 
+static int coroutine_fn
+qcow2_co_zone_report(BlockDriverState *bs, int64_t offset,
+                     unsigned int *nr_zones, BlockZoneDescriptor *zones)
+{
+    BDRVQcow2State *s = bs->opaque;
+    uint64_t zone_size = s->zoned_header.zone_size;
+    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
+    int64_t size = bs->bl.nr_zones * zone_size;
+    unsigned int nrz;
+    int i = 0;
+    int si;
+
+    if (offset >= capacity) {
+        error_report("offset %" PRId64 " is equal to or greater than the "
+                     "device capacity %" PRId64 "", offset, capacity);
+        return -EINVAL;
+    }
+
+    nrz = ((*nr_zones) < bs->bl.nr_zones) ? (*nr_zones) : bs->bl.nr_zones;
+    si = offset / zone_size; /* Zone size cannot be 0 for zoned device */
+    qemu_co_mutex_lock(&bs->wps->colock);
+    for (; i < nrz; ++i) {
+        if (i + si >= bs->bl.nr_zones) {
+            break;
+        }
+
+        zones[i].start = (si + i) * zone_size;
+
+        /* The last zone can be smaller than the zone size */
+        if ((si + i + 1) == bs->bl.nr_zones && size > capacity) {
+            uint32_t l = zone_size - (size - capacity);
+            zones[i].length = l;
+            zones[i].cap = l;
+        } else {
+            zones[i].length = zone_size;
+            zones[i].cap = zone_size;
+        }
+
+        uint64_t wp = bs->wps->wp[si + i];
+        if (QCOW2_ZT_IS_CONV(wp)) {
+            zones[i].type = BLK_ZT_CONV;
+            zones[i].state = BLK_ZS_NOT_WP;
+            /* Clear the zone type bit */
+            wp &= ~(1ULL << 59);
+        } else {
+            zones[i].type = BLK_ZT_SWR;
+            zones[i].state = qcow2_get_zone_state(bs, si + i);
+        }
+        zones[i].wp = wp;
+    }
+    qemu_co_mutex_unlock(&bs->wps->colock);
+    *nr_zones = i;
+    return 0;
+}
+
+static int coroutine_fn GRAPH_RDLOCK
+qcow2_open_zone(BlockDriverState *bs, uint32_t index) {
+    BDRVQcow2State *s = bs->opaque;
+    int ret;
+
+    qemu_co_mutex_lock(&bs->wps->colock);
+    BlockZoneState zs = qcow2_get_zone_state(bs, index);
+    trace_qcow2_imp_open_zones(BLK_ZO_OPEN, s->nr_zones_imp_open);
+
+    switch(zs) {
+    case BLK_ZS_EMPTY:
+        if (!qcow2_can_activate_zone(bs)) {
+            ret = -EBUSY;
+            goto unlock;
+        }
+        break;
+    case BLK_ZS_IOPEN:
+        qcow2_rm_imp_open_zone(s, index);
+        break;
+    case BLK_ZS_EOPEN:
+        return 0;
+    case BLK_ZS_CLOSED:
+        if (!qcow2_can_open_zone(bs)) {
+            ret = -EINVAL;
+            goto unlock;
+        }
+        s->nr_zones_closed--;
+        break;
+    case BLK_ZS_FULL:
+        break;
+    default:
+        ret = -EINVAL;
+        goto unlock;
+    }
+
+    qcow2_do_exp_open_zone(s, index);
+    ret = 0;
+
+unlock:
+    qemu_co_mutex_unlock(&bs->wps->colock);
+    return ret;
+}
+
+static int qcow2_close_zone(BlockDriverState *bs, uint32_t index) {
+    int ret;
+
+    qemu_co_mutex_lock(&bs->wps->colock);
+    BlockZoneState zs = qcow2_get_zone_state(bs, index);
+
+    switch(zs) {
+    case BLK_ZS_EMPTY:
+        break;
+    case BLK_ZS_IOPEN:
+        break;
+    case BLK_ZS_EOPEN:
+        break;
+    case BLK_ZS_CLOSED:
+        /* Closing a closed zone is not an error */
+        ret = 0;
+        goto unlock;
+    case BLK_ZS_FULL:
+        break;
+    default:
+        ret = -EINVAL;
+        goto unlock;
+    }
+    qcow2_do_close_zone(bs, index, zs);
+    ret = 0;
+
+unlock:
+    qemu_co_mutex_unlock(&bs->wps->colock);
+    return ret;
+}
+
+static int coroutine_fn GRAPH_RDLOCK
+qcow2_finish_zone(BlockDriverState *bs, uint32_t index) {
+    BDRVQcow2State *s = bs->opaque;
+    int ret;
+
+    qemu_co_mutex_lock(&bs->wps->colock);
+    uint64_t *wp = &bs->wps->wp[index];
+    BlockZoneState zs = qcow2_get_zone_state(bs, index);
+
+    switch(zs) {
+    case BLK_ZS_EMPTY:
+        if (!qcow2_can_activate_zone(bs)) {
+            ret = -EBUSY;
+            goto unlock;
+        }
+        break;
+    case BLK_ZS_IOPEN:
+        qcow2_rm_imp_open_zone(s, index);
+        trace_qcow2_imp_open_zones(BLK_ZO_FINISH, s->nr_zones_imp_open);
+        break;
+    case BLK_ZS_EOPEN:
+        qcow2_rm_exp_open_zone(s, index);
+        break;
+    case BLK_ZS_CLOSED:
+        if (!qcow2_can_open_zone(bs)) {
+            ret = -EINVAL;
+            goto unlock;
+        }
+        qcow2_rm_closed_zone(s, index);
+        break;
+    case BLK_ZS_FULL:
+        ret = 0;
+        goto unlock;
+    default:
+        ret = -EINVAL;
+        goto unlock;
+    }
+
+    *wp = ((uint64_t)index + 1) * s->zoned_header.zone_size;
+    ret = qcow2_write_wp_at(bs, wp, index);
+
+unlock:
+    qemu_co_mutex_unlock(&bs->wps->colock);
+    return ret;
+}
+
+static int coroutine_fn GRAPH_RDLOCK
+qcow2_reset_zone(BlockDriverState *bs, uint32_t index,
+                            int64_t len) {
+    BDRVQcow2State *s = bs->opaque;
+    int nrz = bs->bl.nr_zones;
+    int zone_size = bs->bl.zone_size;
+    int n, ret = 0;
+
+    qemu_co_mutex_lock(&bs->wps->colock);
+    uint64_t *wp = &bs->wps->wp[index];
+    if (len == bs->total_sectors << BDRV_SECTOR_BITS) {
+        n = nrz;
+        index = 0;
+    } else {
+        n = len / zone_size;
+    }
+
+    for (int i = 0; i < n; ++i) {
+        uint64_t *wp_i = (uint64_t *)(wp + i);
+        uint64_t wpi_v = *wp_i;
+        if (QCOW2_ZT_IS_CONV(wpi_v)) {
+            continue;
+        }
+
+        BlockZoneState zs = qcow2_get_zone_state(bs, index + i);
+        switch (zs) {
+        case BLK_ZS_EMPTY:
+            break;
+        case BLK_ZS_IOPEN:
+            qcow2_rm_imp_open_zone(s, index + i);
+            trace_qcow2_imp_open_zones(BLK_ZO_RESET, s->nr_zones_imp_open);
+            break;
+        case BLK_ZS_EOPEN:
+            qcow2_rm_exp_open_zone(s, index + i);
+            break;
+        case BLK_ZS_CLOSED:
+            qcow2_rm_closed_zone(s, index + i);
+            break;
+        case BLK_ZS_FULL:
+            break;
+        default:
+            ret = -EINVAL;
+            goto unlock;
+        }
+
+        if (zs == BLK_ZS_EMPTY) {
+            continue;
+        }
+
+        *wp_i = ((uint64_t)index + i) * zone_size;
+        ret = qcow2_write_wp_at(bs, wp_i, index + i);
+        if (ret < 0) {
+            goto unlock;
+        }
+        /* clear data */
+        ret = qcow2_co_pwrite_zeroes(bs, *wp_i, zone_size, 0);
+        if (ret < 0) {
+            error_report("Failed to reset zone at 0x%" PRIx64 "", *wp_i);
+        }
+    }
+
+unlock:
+    qemu_co_mutex_unlock(&bs->wps->colock);
+    return ret;
+}
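(A concrete walk-through of the reset math, with made-up numbers: for
zone_size = 256 MiB, a reset at offset = 512 MiB with len = 512 MiB yields
index = 2 and n = 2, so zones 2 and 3 get their write pointers rewound to
0x20000000 and 0x30000000 and their clusters zeroed, while a len equal to
the device capacity rewinds every sequential zone and the QCOW2_ZT_IS_CONV
check skips the conventional ones.)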
+
+static int coroutine_fn GRAPH_RDLOCK
+qcow2_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+                                           int64_t offset, int64_t len)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int ret = 0;
+    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
+    int64_t zone_size = s->zoned_header.zone_size;
+    int64_t zone_size_mask = zone_size - 1;
+    uint32_t index = offset / zone_size;
+    BlockZoneWps *wps = bs->wps;
+
+    if (offset >= capacity) {
+        error_report("offset %" PRId64 " is equal to or greater than the "
+                     "device capacity %" PRId64, offset, capacity);
+        return -EINVAL;
+    }
+
+    if (offset & zone_size_mask) {
+        error_report("sector offset %" PRId64 " is not aligned to zone size"
+                     " %" PRId64 "", offset / 512, zone_size / 512);
+        return -EINVAL;
+    }
+
+    if (((offset + len) < capacity && len & zone_size_mask) ||
+        offset + len > capacity) {
+        error_report("number of sectors %" PRId64 " is not aligned to zone"
+                     " size %" PRId64 " or exceeds the device capacity",
+                     len / 512, zone_size / 512);
+        return -EINVAL;
+    }
+
+    qemu_co_mutex_lock(&wps->colock);
+    uint64_t wpv = wps->wp[index];
+    /*
+     * The conventional-zone check is skipped for whole-device requests
+     * (len == capacity), as used by zone reset; per-zone management
+     * operations on conventional zones are rejected.
+     */
+    if (QCOW2_ZT_IS_CONV(wpv) && len != capacity) {
+        error_report("zone mgmt operations are not allowed for "
+                     "conventional zones");
+        ret = -EIO;
+        goto unlock;
+    }
+    qemu_co_mutex_unlock(&wps->colock);
+
+    switch (op) {
+    case BLK_ZO_OPEN:
+        ret = qcow2_open_zone(bs, index);
+        break;
+    case BLK_ZO_CLOSE:
+        ret = qcow2_close_zone(bs, index);
+        break;
+    case BLK_ZO_FINISH:
+        ret = qcow2_finish_zone(bs, index);
+        break;
+    case BLK_ZO_RESET:
+        ret = qcow2_reset_zone(bs, index, len);
+        break;
+    default:
+        error_report("Unsupported zone op: 0x%x", op);
+        ret = -ENOTSUP;
+        break;
+    }
+    return ret;
+
+unlock:
+    qemu_co_mutex_unlock(&wps->colock);
+    return ret;
+}
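(For testing, assuming the zone commands that qemu-io gained with the zoned
block layer work (zone_open, zone_close, zone_finish, zone_reset), the new
callback can be exercised along these lines; the image name and zone
geometry here are made up:

    $ qemu-io --image-opts \
          -c "zone_open 0x10000000 0x10000000" \
          -c "zone_finish 0x10000000 0x10000000" \
          -c "zone_reset 0x10000000 0x10000000" \
          driver=qcow2,file.filename=zoned.qcow2
)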
+
+static int coroutine_fn GRAPH_RDLOCK
+qcow2_co_zone_append(BlockDriverState *bs, int64_t *offset, QEMUIOVector *qiov,
+                     BdrvRequestFlags flags)
+{
+    assert(flags == 0);
+    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
+    uint32_t index;
+    int ret;
+    int64_t zone_size_mask = bs->bl.zone_size - 1;
+    int64_t iov_len = 0;
+    int64_t len = 0;
+
+    if (*offset >= capacity) {
+        error_report("offset %" PRId64 " is equal to or greater than the "
+                     "device capacity %" PRId64, *offset, capacity);
+        return -EINVAL;
+    }
+
+    /* The target zone is addressed by its starting offset */
+    if (*offset & zone_size_mask) {
+        error_report("sector offset %" PRId64 " is not aligned to zone size "
+                     "%" PRId32, *offset / 512, bs->bl.zone_size / 512);
+        return -EINVAL;
+    }
+
+    int64_t wg = bs->bl.write_granularity;
+    int64_t wg_mask = wg - 1;
+    for (int i = 0; i < qiov->niov; i++) {
+        iov_len = qiov->iov[i].iov_len;
+        if (iov_len & wg_mask) {
+            error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
+                         "block size %" PRId64 "", i, iov_len, wg);
+            return -EINVAL;
+        }
+    }
+    len = qiov->size;
+    index = *offset / bs->bl.zone_size;
+
+    if ((len >> BDRV_SECTOR_BITS) > bs->bl.max_append_sectors) {
+        return -ENOTSUP;
+    }
+
+    qemu_co_mutex_lock(&bs->wps->colock);
+    uint64_t wp_i = bs->wps->wp[index];
+    ret = qcow2_co_pwritev_part(bs, wp_i, len, qiov, 0, 0);
+    if (ret == 0) {
+        /* Return the position at which the data was actually written */
+        *offset = wp_i;
+    } else {
+        error_report("qcow2: zone append failed");
+    }
+
+    qemu_co_mutex_unlock(&bs->wps->colock);
+    return ret;
+}
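(Caller-side, the contract is in/out through *offset: pass the zone's start,
get back the pre-append write pointer, i.e. where the data landed. A hedged
sketch using the BlockBackend wrapper from the existing zoned API; coroutine
context, an initialized BlockBackend *blk and a write-granularity-aligned
buffer buf are assumed:

    /* Sketch: append one 4 KiB block to the zone starting at byte 0. */
    int64_t offset = 0;
    QEMUIOVector qiov;
    int ret;

    qemu_iovec_init(&qiov, 1);
    qemu_iovec_add(&qiov, buf, 4096);
    ret = blk_co_zone_append(blk, &offset, &qiov, 0);
    if (ret == 0) {
        /* offset now holds where the 4 KiB actually starts in the zone */
    }
    qemu_iovec_destroy(&qiov);
)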
+
 static int coroutine_fn GRAPH_RDLOCK
 qcow2_co_copy_range_from(BlockDriverState *bs,
                          BdrvChild *src, int64_t src_offset,
@@ -6398,6 +7181,10 @@  BlockDriver bdrv_qcow2 = {
     .bdrv_co_pwritev_compressed_part    = qcow2_co_pwritev_compressed_part,
     .bdrv_make_empty                    = qcow2_make_empty,
 
+    .bdrv_co_zone_report                = qcow2_co_zone_report,
+    .bdrv_co_zone_mgmt                  = qcow2_co_zone_mgmt,
+    .bdrv_co_zone_append                = qcow2_co_zone_append,
+
     .bdrv_snapshot_create               = qcow2_snapshot_create,
     .bdrv_snapshot_goto                 = qcow2_snapshot_goto,
     .bdrv_snapshot_delete               = qcow2_snapshot_delete,
diff --git a/block/trace-events b/block/trace-events
index 8e789e1f12..e35222e079 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -82,6 +82,8 @@  qcow2_writev_data(void *co, uint64_t offset) "co %p offset 0x%" PRIx64
 qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int64_t bytes) "co %p offset 0x%" PRIx64 " bytes %" PRId64
 qcow2_pwrite_zeroes(void *co, int64_t offset, int64_t bytes) "co %p offset 0x%" PRIx64 " bytes %" PRId64
 qcow2_skip_cow(void *co, uint64_t offset, int nb_clusters) "co %p offset 0x%" PRIx64 " nb_clusters %d"
+qcow2_wp_tracking(int index, uint64_t wp) "wps[%d]: 0x%" PRIx64
+qcow2_imp_open_zones(uint8_t op, int nrz) "nr_imp_open_zones after op 0x%x: %d"
 
 # qcow2-cluster.c
 qcow2_alloc_clusters_offset(void *co, uint64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d"
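(The two new trace events can be enabled at run time with the usual trace
options; e.g., with qemu-io and a trace backend built in, something like:

    $ qemu-io --trace "qcow2_wp_tracking" --trace "qcow2_imp_open_zones" ...
)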
diff --git a/include/qemu/queue.h b/include/qemu/queue.h
index e029e7bf66..3f0a48740e 100644
--- a/include/qemu/queue.h
+++ b/include/qemu/queue.h
@@ -179,6 +179,7 @@  struct {                                                                \
 #define QLIST_EMPTY(head)                ((head)->lh_first == NULL)
 #define QLIST_FIRST(head)                ((head)->lh_first)
 #define QLIST_NEXT(elm, field)           ((elm)->field.le_next)
+/*
+ * Return the last element of a list, or NULL if the list is empty.
+ * QLIST's le_prev points at the previous element's le_next slot rather
+ * than at the element itself, so the tail must be found by walking.
+ */
+#define QLIST_LAST(head, field) ({                                      \
+        typeof((head)->lh_first) last_ = (head)->lh_first;              \
+        while (last_ && last_->field.le_next) {                         \
+            last_ = last_->field.le_next;                               \
+        }                                                               \
+        last_;                                                          \
+})
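(Given the LRU policy the commit message describes for the active zone
lists, QLIST_LAST() is presumably the eviction lookup: new entries go in at
the head, so the tail is the least recently used. A sketch with hypothetical
list and field names; the real ones are defined earlier in this series:

    /* Hypothetical: evict the least-recently-used implicitly open zone. */
    Qcow2ZoneListEntry *victim = QLIST_LAST(&s->imp_open_zones,
                                            imp_open_zone_entry);
    if (victim) {
        qcow2_rm_imp_open_zone(s, victim - s->zone_list_entries);
    }
)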
 
 
 /*