diff mbox series

[RFC,v2,3/3] dm zoned: add regular device info to metadata

Message ID 20200324110255.8385-4-bob.liu@oracle.com (mailing list archive)
State New, archived
Headers show
Series dm zoned: extend the way of exposing zoned block device | expand

Commit Message

Bob Liu March 24, 2020, 11:02 a.m. UTC
This patch implements metadata support for a regular device by:
 - Emulating zone information for the regular device.
 - Storing metadata at the beginning of the regular device.

     | --- zoned device --- | -- regular device ||
     ^                      ^
     |                      |Metadata
zone 0

Signed-off-by: Bob Liu <bob.liu@oracle.com>
---
 drivers/md/dm-zoned-metadata.c | 135 +++++++++++++++++++++++++++++++----------
 drivers/md/dm-zoned-target.c   |   6 +-
 drivers/md/dm-zoned.h          |   3 +-
 3 files changed, 108 insertions(+), 36 deletions(-)

Comments

Damien Le Moal March 25, 2020, 6:29 a.m. UTC | #1
On 2020/03/24 20:04, Bob Liu wrote:
> This patch implemented metadata support for regular device by:
>  - Emulated zone information for regular device.
>  - Store metadata at the beginning of regular device.
> 
>      | --- zoned device --- | -- regular device ||
>      ^                      ^
>      |                      |Metadata
> zone 0
> 
> Signed-off-by: Bob Liu <bob.liu@oracle.com>
> ---
>  drivers/md/dm-zoned-metadata.c | 135 +++++++++++++++++++++++++++++++----------
>  drivers/md/dm-zoned-target.c   |   6 +-
>  drivers/md/dm-zoned.h          |   3 +-
>  3 files changed, 108 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
> index e0e8be0..a96158a 100644
> --- a/drivers/md/dm-zoned-metadata.c
> +++ b/drivers/md/dm-zoned-metadata.c
> @@ -131,6 +131,7 @@ struct dmz_sb {
>   */
>  struct dmz_metadata {
>  	struct dmz_dev		*zoned_dev;
> +	struct dmz_dev		*regu_dmz_dev;
>  
>  	sector_t		zone_bitmap_size;
>  	unsigned int		zone_nr_bitmap_blocks;
> @@ -187,6 +188,15 @@ struct dmz_metadata {
>  /*
>   * Various accessors
>   */
> +static inline struct dmz_dev *zmd_mdev(struct dmz_metadata *zmd)
> +{
> +	/* Metadata always stores in regular device if there is. */
> +	if (zmd->regu_dmz_dev)
> +		return zmd->regu_dmz_dev;
> +	else
> +		return zmd->zoned_dev;

OK. I think we will be better off using an array of pointers to struct dmz_dev
in dmz_target, i.e., a field "struct dmz_dev	*dev[2]". Doing so, we can be sure
to always have the device holding metadata in entry 0, which will always be
true for the single drive case too.
With this, you will not need all this dance with "which device has metadata".
It will always be dmz->dev[0].

> +}
> +
>  unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone)
>  {
>  	return ((unsigned int)(zone - zmd->zones));
> @@ -194,12 +204,33 @@ unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone)
>  
>  sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone)
>  {
> -	return (sector_t)dmz_id(zmd, zone) << zmd->zoned_dev->zone_nr_sectors_shift;

With the array of dev trick, most of the changes below are simplified or go away.

> +	int dmz_real_id;
> +
> +	dmz_real_id = dmz_id(zmd, zone);
> +	if (dmz_real_id >= zmd->zoned_dev->nr_zones) {
> +		/* Regular dev. */
> +		dmz_real_id -= zmd->zoned_dev->nr_zones;
> +		WARN_ON(!zmd->regu_dmz_dev);
> +
> +		return (sector_t)dmz_real_id << zmd->zoned_dev->zone_nr_sectors_shift;
> +	}
> +	return (sector_t)dmz_real_id << zmd->zoned_dev->zone_nr_sectors_shift;
>  }
>  
>  sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone)
>  {
> -	return (sector_t)dmz_id(zmd, zone) << zmd->zoned_dev->zone_nr_blocks_shift;
> +	int dmz_real_id;
> +
> +	dmz_real_id = dmz_id(zmd, zone);
> +	if (dmz_real_id >= zmd->zoned_dev->nr_zones) {
> +		/* Regular dev. */
> +		dmz_real_id -= zmd->zoned_dev->nr_zones;
> +		WARN_ON(!zmd->regu_dmz_dev);
> +
> +		return (sector_t)dmz_real_id << zmd->zoned_dev->zone_nr_blocks_shift;
> +	}
> +
> +	return (sector_t)dmz_real_id << zmd->zoned_dev->zone_nr_blocks_shift;
>  }
>  
>  unsigned int dmz_nr_chunks(struct dmz_metadata *zmd)
> @@ -403,8 +434,10 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
>  	struct dmz_mblock *mblk, *m;
>  	sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
>  	struct bio *bio;
> +	struct dmz_dev *mdev;
>  
> -	if (dmz_bdev_is_dying(zmd->zoned_dev))
> +	mdev = zmd_mdev(zmd);
> +	if (dmz_bdev_is_dying(mdev))
>  		return ERR_PTR(-EIO);
>  
>  	/* Get a new block and a BIO to read it */
> @@ -440,7 +473,7 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
>  
>  	/* Submit read BIO */
>  	bio->bi_iter.bi_sector = dmz_blk2sect(block);
> -	bio_set_dev(bio, zmd->zoned_dev->bdev);
> +	bio_set_dev(bio, mdev->bdev);
>  	bio->bi_private = mblk;
>  	bio->bi_end_io = dmz_mblock_bio_end_io;
>  	bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO);
> @@ -555,7 +588,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
>  		       TASK_UNINTERRUPTIBLE);
>  	if (test_bit(DMZ_META_ERROR, &mblk->state)) {
>  		dmz_release_mblock(zmd, mblk);
> -		dmz_check_bdev(zmd->zoned_dev);
> +		dmz_check_bdev(zmd_mdev(zmd));
>  		return ERR_PTR(-EIO);
>  	}
>  
> @@ -581,8 +614,10 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
>  {
>  	sector_t block = zmd->sb[set].block + mblk->no;
>  	struct bio *bio;
> +	struct dmz_dev *mdev;
>  
> -	if (dmz_bdev_is_dying(zmd->zoned_dev))
> +	mdev = zmd_mdev(zmd);
> +	if (dmz_bdev_is_dying(mdev))
>  		return -EIO;
>  
>  	bio = bio_alloc(GFP_NOIO, 1);
> @@ -594,7 +629,7 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
>  	set_bit(DMZ_META_WRITING, &mblk->state);
>  
>  	bio->bi_iter.bi_sector = dmz_blk2sect(block);
> -	bio_set_dev(bio, zmd->zoned_dev->bdev);
> +	bio_set_dev(bio, mdev->bdev);
>  	bio->bi_private = mblk;
>  	bio->bi_end_io = dmz_mblock_bio_end_io;
>  	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
> @@ -612,8 +647,10 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
>  {
>  	struct bio *bio;
>  	int ret;
> +	struct dmz_dev *mdev;
>  
> -	if (dmz_bdev_is_dying(zmd->zoned_dev))
> +	mdev = zmd_mdev(zmd);
> +	if (dmz_bdev_is_dying(mdev))
>  		return -EIO;
>  
>  	bio = bio_alloc(GFP_NOIO, 1);
> @@ -621,14 +658,14 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
>  		return -ENOMEM;
>  
>  	bio->bi_iter.bi_sector = dmz_blk2sect(block);
> -	bio_set_dev(bio, zmd->zoned_dev->bdev);
> +	bio_set_dev(bio, mdev->bdev);
>  	bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO);
>  	bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0);
>  	ret = submit_bio_wait(bio);
>  	bio_put(bio);
>  
>  	if (ret)
> -		dmz_check_bdev(zmd->zoned_dev);
> +		dmz_check_bdev(mdev);
>  	return ret;
>  }
>  
> @@ -661,7 +698,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
>  
>  	ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page);
>  	if (ret == 0)
> -		ret = blkdev_issue_flush(zmd->zoned_dev->bdev, GFP_NOIO, NULL);
> +		ret = blkdev_issue_flush(zmd_mdev(zmd)->bdev, GFP_NOIO, NULL);
>  
>  	return ret;
>  }
> @@ -695,15 +732,20 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
>  			       TASK_UNINTERRUPTIBLE);
>  		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
>  			clear_bit(DMZ_META_ERROR, &mblk->state);
> -			dmz_check_bdev(zmd->zoned_dev);
> +			dmz_check_bdev(zmd_mdev(zmd));
>  			ret = -EIO;
>  		}
>  		nr_mblks_submitted--;
>  	}
>  
>  	/* Flush drive cache (this will also sync data) */
> -	if (ret == 0)
> -		ret = blkdev_issue_flush(zmd->zoned_dev->bdev, GFP_NOIO, NULL);
> +	if (ret == 0) {
> +		/* Flush metadata device */
> +		ret = blkdev_issue_flush(zmd_mdev(zmd)->bdev, GFP_NOIO, NULL);
> +		if ((ret == 0) && zmd->regu_dmz_dev)
> +			/* Flush data device. */
> +			ret = blkdev_issue_flush(zmd->zoned_dev->bdev, GFP_NOIO, NULL);
> +	}
>  
>  	return ret;
>  }
> @@ -760,7 +802,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
>  	 */
>  	dmz_lock_flush(zmd);
>  
> -	if (dmz_bdev_is_dying(zmd->zoned_dev)) {
> +	if (dmz_bdev_is_dying(zmd_mdev(zmd))) {
>  		ret = -EIO;
>  		goto out;
>  	}
> @@ -772,7 +814,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
>  
>  	/* If there are no dirty metadata blocks, just flush the device cache */
>  	if (list_empty(&write_list)) {
> -		ret = blkdev_issue_flush(zmd->zoned_dev->bdev, GFP_NOIO, NULL);
> +		ret = blkdev_issue_flush(zmd_mdev(zmd)->bdev, GFP_NOIO, NULL);
>  		goto err;
>  	}
>  
> @@ -821,7 +863,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
>  		list_splice(&write_list, &zmd->mblk_dirty_list);
>  		spin_unlock(&zmd->mblk_lock);
>  	}
> -	if (!dmz_check_bdev(zmd->zoned_dev))
> +	if (!dmz_check_bdev(zmd_mdev(zmd)))
>  		ret = -EIO;
>  	goto out;
>  }
> @@ -832,10 +874,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
>  static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb)
>  {
>  	unsigned int nr_meta_zones, nr_data_zones;
> -	struct dmz_dev *dev = zmd->zoned_dev;
> +	struct dmz_dev *dev;
>  	u32 crc, stored_crc;
>  	u64 gen;
>  
> +	dev = zmd_mdev(zmd);
>  	gen = le64_to_cpu(sb->gen);
>  	stored_crc = le32_to_cpu(sb->crc);
>  	sb->crc = 0;
> @@ -1131,8 +1174,11 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data)
>  		zmd->nr_useable_zones++;
>  		if (dmz_is_rnd(zone)) {
>  			zmd->nr_rnd_zones++;
> -			if (!zmd->sb_zone) {
> -				/* Super block zone */
> +			if (!zmd->sb_zone && !zmd->regu_dmz_dev) {
> +				/*
> +				 * Super block zone goes to regular
> +				 * device by default.
> +				 */
>  				zmd->sb_zone = zone;
>  			}
>  		}
> @@ -1157,7 +1203,8 @@ static void dmz_drop_zones(struct dmz_metadata *zmd)
>  static int dmz_init_zones(struct dmz_metadata *zmd)
>  {
>  	struct dmz_dev *dev = zmd->zoned_dev;
> -	int ret;
> +	int ret, i;
> +	unsigned int total_nr_zones;
>  
>  	/* Init */
>  	zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3;
> @@ -1167,7 +1214,10 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
>  					DMZ_BLOCK_SIZE_BITS);
>  
>  	/* Allocate zone array */
> -	zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
> +	total_nr_zones = dev->nr_zones;
> +	if (zmd->regu_dmz_dev)
> +		total_nr_zones += zmd->regu_dmz_dev->nr_zones;
> +	zmd->zones = kcalloc(total_nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
>  	if (!zmd->zones)
>  		return -ENOMEM;
>  
> @@ -1186,6 +1236,25 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
>  		return ret;
>  	}
>  
> +	if (zmd->regu_dmz_dev) {
> +		/* Emulate zone information for regular device zone. */
> +		for (i = 0; i < zmd->regu_dmz_dev->nr_zones; i++) {
> +			struct dm_zone *zone = &zmd->zones[i + dev->nr_zones];
> +
> +			INIT_LIST_HEAD(&zone->link);
> +			atomic_set(&zone->refcount, 0);
> +			zone->chunk = DMZ_MAP_UNMAPPED;
> +
> +			set_bit(DMZ_RND, &zone->flags);
> +			zmd->nr_rnd_zones++;
> +			zmd->nr_useable_zones++;
> +			zone->wp_block = 0;
> +			if (!zmd->sb_zone)
> +				/* Super block zone */
> +				zmd->sb_zone = zone;
> +		}
> +	}
> +
>  	return 0;
>  }
>  
> @@ -1313,13 +1382,13 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone);
>   */
>  static int dmz_load_mapping(struct dmz_metadata *zmd)
>  {
> -	struct dmz_dev *dev = zmd->zoned_dev;
>  	struct dm_zone *dzone, *bzone;
>  	struct dmz_mblock *dmap_mblk = NULL;
>  	struct dmz_map *dmap;
>  	unsigned int i = 0, e = 0, chunk = 0;
>  	unsigned int dzone_id;
>  	unsigned int bzone_id;
> +	struct dmz_dev *dev = zmd_mdev(zmd);
>  
>  	/* Metadata block array for the chunk mapping table */
>  	zmd->map_mblk = kcalloc(zmd->nr_map_blocks,
> @@ -1345,7 +1414,7 @@ static int dmz_load_mapping(struct dmz_metadata *zmd)
>  		if (dzone_id == DMZ_MAP_UNMAPPED)
>  			goto next;
>  
> -		if (dzone_id >= dev->nr_zones) {
> +		if (dzone_id >= dev->target->nr_zones) {
>  			dmz_dev_err(dev, "Chunk %u mapping: invalid data zone ID %u",
>  				    chunk, dzone_id);
>  			return -EIO;
> @@ -1366,7 +1435,7 @@ static int dmz_load_mapping(struct dmz_metadata *zmd)
>  		if (bzone_id == DMZ_MAP_UNMAPPED)
>  			goto next;
>  
> -		if (bzone_id >= dev->nr_zones) {
> +		if (bzone_id >= dev->target->nr_zones) {
>  			dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone ID %u",
>  				    chunk, bzone_id);
>  			return -EIO;
> @@ -1398,7 +1467,7 @@ static int dmz_load_mapping(struct dmz_metadata *zmd)
>  	 * fully initialized. All remaining zones are unmapped data
>  	 * zones. Finish initializing those here.
>  	 */
> -	for (i = 0; i < dev->nr_zones; i++) {
> +	for (i = 0; i < dev->target->nr_zones; i++) {
>  		dzone = dmz_get(zmd, i);
>  		if (dmz_is_meta(dzone))
>  			continue;
> @@ -1632,7 +1701,7 @@ struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chu
>  		/* Allocate a random zone */
>  		dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
>  		if (!dzone) {
> -			if (dmz_bdev_is_dying(zmd->zoned_dev)) {
> +			if (dmz_bdev_is_dying(zmd_mdev(zmd))) {
>  				dzone = ERR_PTR(-EIO);
>  				goto out;
>  			}
> @@ -1733,7 +1802,7 @@ struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
>  	/* Allocate a random zone */
>  	bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
>  	if (!bzone) {
> -		if (dmz_bdev_is_dying(zmd->zoned_dev)) {
> +		if (dmz_bdev_is_dying(zmd_mdev(zmd))) {
>  			bzone = ERR_PTR(-EIO);
>  			goto out;
>  		}
> @@ -2360,7 +2429,8 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
>  /*
>   * Initialize the zoned metadata.
>   */
> -int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
> +int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_dev *regu_dmz_dev,
> +		struct dmz_metadata **metadata)
>  {
>  	struct dmz_metadata *zmd;
>  	unsigned int i, zid;
> @@ -2372,6 +2442,7 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
>  		return -ENOMEM;
>  
>  	zmd->zoned_dev = dev;
> +	zmd->regu_dmz_dev = regu_dmz_dev;
>  	zmd->mblk_rbtree = RB_ROOT;
>  	init_rwsem(&zmd->mblk_sem);
>  	mutex_init(&zmd->mblk_flush_lock);
> @@ -2440,9 +2511,9 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
>  		     bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
>  		     "aware" : "managed");
>  	dmz_dev_info(dev, "  %llu 512-byte logical sectors",
> -		     (u64)dev->capacity);
> +		     (u64)dev->capacity  + (u64)regu_dmz_dev->capacity);
>  	dmz_dev_info(dev, "  %u zones of %llu 512-byte logical sectors",
> -		     dev->nr_zones, (u64)dev->zone_nr_sectors);
> +		     dev->nr_zones + regu_dmz_dev->nr_zones, (u64)dev->zone_nr_sectors);
>  	dmz_dev_info(dev, "  %u metadata zones",
>  		     zmd->nr_meta_zones * 2);
>  	dmz_dev_info(dev, "  %u data zones for %u chunks",
> @@ -2488,7 +2559,7 @@ void dmz_dtr_metadata(struct dmz_metadata *zmd)
>   */
>  int dmz_resume_metadata(struct dmz_metadata *zmd)
>  {
> -	struct dmz_dev *dev = zmd->zoned_dev;
> +	struct dmz_dev *dev = zmd_mdev(zmd);
>  	struct dm_zone *zone;
>  	sector_t wp_block;
>  	unsigned int i;
> diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
> index cae4bfe..41dbb9d 100644
> --- a/drivers/md/dm-zoned-target.c
> +++ b/drivers/md/dm-zoned-target.c
> @@ -803,7 +803,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  
>  	/* Initialize metadata */
>  	dev = dmz->zoned_dev;
> -	ret = dmz_ctr_metadata(dev, &dmz->metadata);
> +	ret = dmz_ctr_metadata(dev, dmz->regu_dmz_dev, &dmz->metadata);
>  	if (ret) {
>  		ti->error = "Metadata initialization failed";
>  		goto err_dev;
> @@ -852,8 +852,8 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  	}
>  	mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
>  
> -	/* Initialize reclaim */
> -	ret = dmz_ctr_reclaim(dev, dmz->metadata, &dmz->reclaim);
> +	/* Initialize reclaim, only reclaim from regular device. */
> +	ret = dmz_ctr_reclaim(dmz->regu_dmz_dev, dmz->metadata, &dmz->reclaim);
>  	if (ret) {
>  		ti->error = "Zone reclaim initialization failed";
>  		goto err_fwq;
> diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
> index a3535bc..7aa1a30 100644
> --- a/drivers/md/dm-zoned.h
> +++ b/drivers/md/dm-zoned.h
> @@ -206,7 +206,8 @@ struct dmz_reclaim;
>  /*
>   * Functions defined in dm-zoned-metadata.c
>   */
> -int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **zmd);
> +int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_dev *regu_dmz_dev,
> +		struct dmz_metadata **zmd);
>  void dmz_dtr_metadata(struct dmz_metadata *zmd);
>  int dmz_resume_metadata(struct dmz_metadata *zmd);
>  
>
Hannes Reinecke March 25, 2020, 6:47 a.m. UTC | #2
On 3/25/20 7:29 AM, Damien Le Moal wrote:
> On 2020/03/24 20:04, Bob Liu wrote:
>> This patch implemented metadata support for regular device by:
>>   - Emulated zone information for regular device.
>>   - Store metadata at the beginning of regular device.
>>
>>       | --- zoned device --- | -- regular device ||
>>       ^                      ^
>>       |                      |Metadata
>> zone 0
>>
>> Signed-off-by: Bob Liu <bob.liu@oracle.com>
>> ---
>>   drivers/md/dm-zoned-metadata.c | 135 +++++++++++++++++++++++++++++++----------
>>   drivers/md/dm-zoned-target.c   |   6 +-
>>   drivers/md/dm-zoned.h          |   3 +-
>>   3 files changed, 108 insertions(+), 36 deletions(-)
>>
Having thought about it some more, I think we cannot continue with this 
'simple' approach.
The immediate problem is that we lie about the disk size; clearly the
metadata cannot be used for regular data, yet we expose a target device 
with the full size of the underlying device.
Making me wonder if anybody ever tested a disk-full scenario...
The other problem is that with two devices we need to be able to stitch 
them together in an automated fashion, eg via a systemd service or udev 
rule.
But for this we need to be able to identify the devices, which means
both need to carry metadata, and both need to have a unique identifier
within the metadata, which the current metadata doesn't allow.

Hence my plan is to implement a v2 metadata, carrying UUIDs for the dmz 
set _and_ the component device. With that we can update blkid to create 
links etc so that the devices can be identified in the system.
Additionally I would be updating dmzadm to write the new metadata.

And I will add a new command 'start' to dmzadm which will then create 
the device-mapper device _with the correct size_. It also has the 
benefit that we can create the device-mapper target with the UUID 
specified in the metadata, so the persistent device links will be 
created automatically.

Bob, can you send me your improvements to dmzadm so that I can include 
them in my changes?

Cheers,

Hannes
Bob Liu March 25, 2020, 7:29 a.m. UTC | #3
On 3/25/20 2:47 PM, Hannes Reinecke wrote:
> On 3/25/20 7:29 AM, Damien Le Moal wrote:
>> On 2020/03/24 20:04, Bob Liu wrote:
>>> This patch implemented metadata support for regular device by:
>>>   - Emulated zone information for regular device.
>>>   - Store metadata at the beginning of regular device.
>>>
>>>       | --- zoned device --- | -- regular device ||
>>>       ^                      ^
>>>       |                      |Metadata
>>> zone 0
>>>
>>> Signed-off-by: Bob Liu <bob.liu@oracle.com>
>>> ---
>>>   drivers/md/dm-zoned-metadata.c | 135 +++++++++++++++++++++++++++++++----------
>>>   drivers/md/dm-zoned-target.c   |   6 +-
>>>   drivers/md/dm-zoned.h          |   3 +-
>>>   3 files changed, 108 insertions(+), 36 deletions(-)
>>>
> Having thought about it some more, I think we cannot continue with this 'simple' approach.
> The immediate problem is that we lie about the disk size; clearly the
> metadata cannot be used for regular data, yet we expose a target device with the full size of the underlying device.

The exposed size is "regular dev size + zoned dev size - metadata size - reserved seq zone size".
I didn't see why there is a lie?

> Making me wonder if anybody ever tested a disk-full scenario...
> The other problem is that with two devices we need to be able to stitch them together in an automated fashion, eg via a systemd service or udev rule.
> But for this we need to be able to identify the devices, which means both need to carry metadata, and both need to have unique identifier within the metadata. Which the current metadata doesn't allow to.
> 
> Hence my plan is to implement a v2 metadata, carrying UUIDs for the dmz set _and_ the component device. With that we can update blkid to create links etc so that the devices can be identified in the system.
> Additionally I would be updating dmzadm to write the new metadata.
> 
> And I will add a new command 'start' to dmzadm which will then create the device-mapper device _with the correct size_. It also has the benefit that we can create the device-mapper target with the UUID specified in the metadata, so the persistent device links will be created automatically.
> 
> Bob, can you send me your improvements to dmzadm so that I can include them in my changes?
> 

Attached, but it's one big patch; I haven't split it into smaller ones yet.
dmz_check/repair don't work at the current stage either.
Damien Le Moal March 25, 2020, 8:02 a.m. UTC | #4
On 2020/03/25 15:47, Hannes Reinecke wrote:
> On 3/25/20 7:29 AM, Damien Le Moal wrote:
>> On 2020/03/24 20:04, Bob Liu wrote:
>>> This patch implemented metadata support for regular device by:
>>>   - Emulated zone information for regular device.
>>>   - Store metadata at the beginning of regular device.
>>>
>>>       | --- zoned device --- | -- regular device ||
>>>       ^                      ^
>>>       |                      |Metadata
>>> zone 0
>>>
>>> Signed-off-by: Bob Liu <bob.liu@oracle.com>
>>> ---
>>>   drivers/md/dm-zoned-metadata.c | 135 +++++++++++++++++++++++++++++++----------
>>>   drivers/md/dm-zoned-target.c   |   6 +-
>>>   drivers/md/dm-zoned.h          |   3 +-
>>>   3 files changed, 108 insertions(+), 36 deletions(-)
>>>
> Having thought about it some more, I think we cannot continue with this 
> 'simple' approach.
> The immediate problem is that we lie about the disk size; clearly the
> metadata cannot be used for regular data, yet we expose a target device 
> with the full size of the underlying device.
> Making me wonder if anybody ever tested a disk-full scenario...

Current dm-zoned does not do that... What is exposed as target capacity is
number of chunks * zone size, with the number of chunks being number of zones
minus metadata zones minus number of zones reserved for reclaim. And I did test
disk full scenario (when performance goes to the trash bin because reclaim
struggles...)

> The other problem is that with two devices we need to be able to stitch 
> them together in an automated fashion, eg via a systemd service or udev 
> rule.

Yes, and that has been on my to-do list forever for the current dm-zoned...

> But for this we need to be able to identify the devices, which means 
> both need to carry metadata, and both need to have unique identifier 
> within the metadata. Which the current metadata doesn't allow to.
> 
> Hence my plan is to implement a v2 metadata, carrying UUIDs for the dmz 
> set _and_ the component device. With that we can update blkid to create 
> links etc so that the devices can be identified in the system.
> Additionally I would be updating dmzadm to write the new metadata.

Yep. I think that is needed. And the metadata for the disk that does not store
the mapping tables and bitmaps can be read-only at run time, that is a super
block only holding identification/UUID.

> And I will add a new command 'start' to dmzadm which will then create 
> the device-mapper device _with the correct size_. It also has the 
> benefit that we can create the device-mapper target with the UUID 
> specified in the metadata, so the persistent device links will be 
> created automatically.

The size now should be correct with single device current setup.

> 
> Bob, can you send me your improvements to dmzadm so that I can include 
> them in my changes?
> 
> Cheers,
> 
> Hannes
>
Hannes Reinecke March 25, 2020, 8:45 a.m. UTC | #5
On 3/25/20 8:29 AM, Bob Liu wrote:
> On 3/25/20 2:47 PM, Hannes Reinecke wrote:
>> On 3/25/20 7:29 AM, Damien Le Moal wrote:
>>> On 2020/03/24 20:04, Bob Liu wrote:
>>>> This patch implemented metadata support for regular device by:
>>>>    - Emulated zone information for regular device.
>>>>    - Store metadata at the beginning of regular device.
>>>>
>>>>        | --- zoned device --- | -- regular device ||
>>>>        ^                      ^
>>>>        |                      |Metadata
>>>> zone 0
>>>>
>>>> Signed-off-by: Bob Liu <bob.liu@oracle.com>
>>>> ---
>>>>    drivers/md/dm-zoned-metadata.c | 135 +++++++++++++++++++++++++++++++----------
>>>>    drivers/md/dm-zoned-target.c   |   6 +-
>>>>    drivers/md/dm-zoned.h          |   3 +-
>>>>    3 files changed, 108 insertions(+), 36 deletions(-)
>>>>
>> Having thought about it some more, I think we cannot continue with this 'simple' approach.
>> The immediate problem is that we lie about the disk size; clearly the
>> metadata cannot be used for regular data, yet we expose a target device with the full size of the underlying device.
> 
> The exposed size is "regular dev size + zoned dev size - metadata size - reserved seq zone size".
> I didn't see why there is a lie?
> 
The lie is in generating the device-mapper line for setting up the 
target device.
Format is

0 <size> zoned <zoned-dev> <cache-dev>

and <size> is the capacity of the resulting device-mapper device.
So we should have adapted this to exclude the metadata size and the 
reserved seq zone size (even with the original implementation); 
'blksize' is certainly wrong here.

>> Making me wonder if anybody ever tested a disk-full scenario...
>> The other problem is that with two devices we need to be able to stitch them together
>> in an automated fashion, eg via a systemd service or udev rule.
>> But for this we need to be able to identify the devices, which means both need to carry
>> metadata, and both need to have unique identifier within the metadata. Which the current
>> metadata doesn't allow to.
>>
>> Hence my plan is to implement a v2 metadata, carrying UUIDs for the dmz set _and_ the
>> component device. With that we can update blkid to create links etc so that the devices
>> can be identified in the system.
>> Additionally I would be updating dmzadm to write the new metadata.
>>
>> And I will add a new command 'start' to dmzadm which will then create the device-mapper
>> device _with the correct size_. It also has the benefit that we can create the device-mapper
>> target with the UUID specified in the metadata, so the persistent device links will be
>> created automatically.
>>
>> Bob, can you send me your improvements to dmzadm so that I can include them in my changes?
>>
> 
> Attached, but it's a big patch I haven't split them to smaller one.
> The dmz_check/repair can't work neither in current stage.
> 
Yeah, of course. Plan is to start with V2 metadata handling first anyway
(which adds UUIDs), then add the 'start' functionality, and only then
implement cache device handling.

Thanks for the patch.

Cheers,

Hannes
Hannes Reinecke March 25, 2020, 8:52 a.m. UTC | #6
On 3/25/20 9:02 AM, Damien Le Moal wrote:
> On 2020/03/25 15:47, Hannes Reinecke wrote:
>> On 3/25/20 7:29 AM, Damien Le Moal wrote:
>>> On 2020/03/24 20:04, Bob Liu wrote:
>>>> This patch implemented metadata support for regular device by:
>>>>    - Emulated zone information for regular device.
>>>>    - Store metadata at the beginning of regular device.
>>>>
>>>>        | --- zoned device --- | -- regular device ||
>>>>        ^                      ^
>>>>        |                      |Metadata
>>>> zone 0
>>>>
>>>> Signed-off-by: Bob Liu <bob.liu@oracle.com>
>>>> ---
>>>>    drivers/md/dm-zoned-metadata.c | 135 +++++++++++++++++++++++++++++++----------
>>>>    drivers/md/dm-zoned-target.c   |   6 +-
>>>>    drivers/md/dm-zoned.h          |   3 +-
>>>>    3 files changed, 108 insertions(+), 36 deletions(-)
>>>>
>> Having thought about it some more, I think we cannot continue with this
>> 'simple' approach.
>> The immediate problem is that we lie about the disk size; clearly the
>> metadata cannot be used for regular data, yet we expose a target device
>> with the full size of the underlying device.
>> Making me wonder if anybody ever tested a disk-full scenario...
> 
> Current dm-zoned does not do that... What is exposed as target capacity is
> number of chunks * zone size, with the number of chunks being number of zones
> minus metadata zones minus number of zones reserved for reclaim. And I did test
> disk full scenario (when performance goes to the trash bin because reclaim
> struggles...)
> 
Thing is, the second number for the dmsetup target line is _supposed_ to 
be the target size.
Which clearly is wrong here.
I must admit I'm not sure what device-mapper will do with a target 
definition which is larger than the resulting target device ...
Mike should know, but it's definitely awkward.

Cheers,

Hannes
Damien Le Moal March 25, 2020, 9:10 a.m. UTC | #7
On 2020/03/25 17:52, Hannes Reinecke wrote:
> On 3/25/20 9:02 AM, Damien Le Moal wrote:
>> On 2020/03/25 15:47, Hannes Reinecke wrote:
>>> On 3/25/20 7:29 AM, Damien Le Moal wrote:
>>>> On 2020/03/24 20:04, Bob Liu wrote:
>>>>> This patch implemented metadata support for regular device by:
>>>>>    - Emulated zone information for regular device.
>>>>>    - Store metadata at the beginning of regular device.
>>>>>
>>>>>        | --- zoned device --- | -- regular device ||
>>>>>        ^                      ^
>>>>>        |                      |Metadata
>>>>> zone 0
>>>>>
>>>>> Signed-off-by: Bob Liu <bob.liu@oracle.com>
>>>>> ---
>>>>>    drivers/md/dm-zoned-metadata.c | 135 +++++++++++++++++++++++++++++++----------
>>>>>    drivers/md/dm-zoned-target.c   |   6 +-
>>>>>    drivers/md/dm-zoned.h          |   3 +-
>>>>>    3 files changed, 108 insertions(+), 36 deletions(-)
>>>>>
>>> Having thought about it some more, I think we cannot continue with this
>>> 'simple' approach.
>>> The immediate problem is that we lie about the disk size; clearly the
>>> metadata cannot be used for regular data, yet we expose a target device
>>> with the full size of the underlying device.
>>> Making me wonder if anybody ever tested a disk-full scenario...
>>
>> Current dm-zoned does not do that... What is exposed as target capacity is
>> number of chunks * zone size, with the number of chunks being number of zones
>> minus metadata zones minus number of zones reserved for reclaim. And I did test
>> disk full scenario (when performance goes to the trash bin because reclaim
>> struggles...)
>>
> Thing is, the second number for the dmsetup target line is _supposed_ to 
> be the target size.
> Which clearly is wrong here.
> I must admit I'm not sure what device-mapper will do with a target 
> definition which is larger than the resulting target device ...
> Mike should know, but it's definitely awkward.

Ahh. OK. Never thought of it like this, especially considering the fact that the
table entry is checked to see if the entire drive is given. So instead of the
target size, I was in fact using the size parameter of dmsetup as the size to
use on the backend, which for dm-zoned must be the device capacity...

Not sure if we can fix that now? Especially considering that the number of
reserved seq zones for reclaim is not constant but a dmzadm format option. So
the average user would have to know exactly the useable size to dmsetup the
target. Awkward too, or rather, not super easy to use. I wonder how dm-thin or
other targets with metadata handle this? Do they format themselves
automatically on dmsetup using the size specified?

> 
> Cheers,
> 
> Hannes
>
Hannes Reinecke March 25, 2020, 10 a.m. UTC | #8
On 3/25/20 10:10 AM, Damien Le Moal wrote:
> On 2020/03/25 17:52, Hannes Reinecke wrote:
>> On 3/25/20 9:02 AM, Damien Le Moal wrote:
>>> On 2020/03/25 15:47, Hannes Reinecke wrote:
>>>> On 3/25/20 7:29 AM, Damien Le Moal wrote:
>>>>> On 2020/03/24 20:04, Bob Liu wrote:
>>>>>> This patch implemented metadata support for regular device by:
>>>>>>     - Emulated zone information for regular device.
>>>>>>     - Store metadata at the beginning of regular device.
>>>>>>
>>>>>>         | --- zoned device --- | -- regular device ||
>>>>>>         ^                      ^
>>>>>>         |                      |Metadata
>>>>>> zone 0
>>>>>>
>>>>>> Signed-off-by: Bob Liu <bob.liu@oracle.com>
>>>>>> ---
>>>>>>     drivers/md/dm-zoned-metadata.c | 135 +++++++++++++++++++++++++++++++----------
>>>>>>     drivers/md/dm-zoned-target.c   |   6 +-
>>>>>>     drivers/md/dm-zoned.h          |   3 +-
>>>>>>     3 files changed, 108 insertions(+), 36 deletions(-)
>>>>>>
>>>> Having thought about it some more, I think we cannot continue with this
>>>> 'simple' approach.
>>>> The immediate problem is that we lie about the disk size; clearly the
>>>> metadata cannot be used for regular data, yet we expose a target device
>>>> with the full size of the underlying device.
>>>> Making me wonder if anybody ever tested a disk-full scenario...
>>>
>>> Current dm-zoned does not do that... What is exposed as target capacity is
>>> number of chunks * zone size, with the number of chunks being number of zones
>>> minus metadata zones minus number of zones reserved for reclaim. And I did test
>>> disk full scenario (when performance goes to the trash bin because reclaim
>>> struggles...)
>>>
>> Thing is, the second number for the dmsetup target line is _supposed_ to
>> be the target size.
>> Which clearly is wrong here.
>> I must admit I'm not sure what device-mapper will do with a target
>> definition which is larger than the resulting target device ...
>> Mike should know, but it's definitely awkward.
> 
> Ahh. OK. I never thought of it like this, especially considering the fact that the
> table entry is checked to see if the entire drive is given. So instead of the
> target size, I was in fact using the size parameter of dmsetup as the size to
> use on the backend, which for dm-zoned must be the device capacity...
> 
> Not sure if we can fix that now? Especially considering that the number of
> reserved seq zones for reclaim is not constant but a dmzadm format option. So
> the average user would have to know exactly the useable size to dmsetup the
> target. Awkward too, or rather, not super easy to use. I wonder how dm-thin or
> other targets with metadata handle this? Do they format themselves
> automatically on dmsetup using the size specified?
> 
Which is _precisely_ why I want to have the 'start' option to dmzadm.
That can read the metadata, validate it, and then generate the correct 
invocation for device-mapper.
_And_ we get a device-uuid to boot, as this can only be set from the ioctl.

Cheers,

Hannes
Damien Le Moal March 25, 2020, 10:02 a.m. UTC | #9
On 2020/03/25 19:00, Hannes Reinecke wrote:
> On 3/25/20 10:10 AM, Damien Le Moal wrote:
>> On 2020/03/25 17:52, Hannes Reinecke wrote:
>>> On 3/25/20 9:02 AM, Damien Le Moal wrote:
>>>> On 2020/03/25 15:47, Hannes Reinecke wrote:
>>>>> On 3/25/20 7:29 AM, Damien Le Moal wrote:
>>>>>> On 2020/03/24 20:04, Bob Liu wrote:
>>>>>>> This patch implemented metadata support for regular device by:
>>>>>>>     - Emulated zone information for regular device.
>>>>>>>     - Store metadata at the beginning of regular device.
>>>>>>>
>>>>>>>         | --- zoned device --- | -- regular device ||
>>>>>>>         ^                      ^
>>>>>>>         |                      |Metadata
>>>>>>> zone 0
>>>>>>>
>>>>>>> Signed-off-by: Bob Liu <bob.liu@oracle.com>
>>>>>>> ---
>>>>>>>     drivers/md/dm-zoned-metadata.c | 135 +++++++++++++++++++++++++++++++----------
>>>>>>>     drivers/md/dm-zoned-target.c   |   6 +-
>>>>>>>     drivers/md/dm-zoned.h          |   3 +-
>>>>>>>     3 files changed, 108 insertions(+), 36 deletions(-)
>>>>>>>
>>>>> Having thought about it some more, I think we cannot continue with this
>>>>> 'simple' approach.
>>>>> The immediate problem is that we lie about the disk size; clearly the
>>>>> metadata cannot be used for regular data, yet we expose a target device
>>>>> with the full size of the underlying device.
>>>>> Making me wonder if anybody ever tested a disk-full scenario...
>>>>
>>>> Current dm-zoned does not do that... What is exposed as target capacity is
>>>> number of chunks * zone size, with the number of chunks being number of zones
>>>> minus metadata zones minus number of zones reserved for reclaim. And I did test
>>>> disk full scenario (when performance goes to the trash bin because reclaim
>>>> struggles...)
>>>>
>>> Thing is, the second number for the dmsetup target line is _supposed_ to
>>> be the target size.
>>> Which clearly is wrong here.
>>> I must admit I'm not sure what device-mapper will do with a target
>>> definition which is larger than the resulting target device ...
>>> Mike should know, but it's definitely awkward.
>>
>> Ahh. OK. I never thought of it like this, especially considering the fact that the
>> table entry is checked to see if the entire drive is given. So instead of the
>> target size, I was in fact using the size parameter of dmsetup as the size to
>> use on the backend, which for dm-zoned must be the device capacity...
>>
>> Not sure if we can fix that now? Especially considering that the number of
>> reserved seq zones for reclaim is not constant but a dmzadm format option. So
>> the average user would have to know exactly the useable size to dmsetup the
>> target. Awkward too, or rather, not super easy to use. I wonder how dm-thin or
>> other targets with metadata handle this? Do they format themselves
>> automatically on dmsetup using the size specified?
>>
> Which is _precisely_ why I want to have the 'start' option to dmzadm.
> That can read the metadata, validate it, and then generate the correct 
> invocation for device-mapper.
> _And_ we get a device-uuid to boot, as this can only be set from the ioctl.

OK. Got it. Done like this, it will also be easy to support the v1 metadata.

> 
> Cheers,
> 
> Hannes
>
diff mbox series

Patch

diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index e0e8be0..a96158a 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -131,6 +131,7 @@  struct dmz_sb {
  */
 struct dmz_metadata {
 	struct dmz_dev		*zoned_dev;
+	struct dmz_dev		*regu_dmz_dev;
 
 	sector_t		zone_bitmap_size;
 	unsigned int		zone_nr_bitmap_blocks;
@@ -187,6 +188,15 @@  struct dmz_metadata {
 /*
  * Various accessors
  */
+static inline struct dmz_dev *zmd_mdev(struct dmz_metadata *zmd)
+{
+	/* Metadata is always stored on the regular device if there is one. */
+	if (zmd->regu_dmz_dev)
+		return zmd->regu_dmz_dev;
+	else
+		return zmd->zoned_dev;
+}
+
 unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone)
 {
 	return ((unsigned int)(zone - zmd->zones));
@@ -194,12 +204,33 @@  unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone)
 
 sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone)
 {
-	return (sector_t)dmz_id(zmd, zone) << zmd->zoned_dev->zone_nr_sectors_shift;
+	int dmz_real_id;
+
+	dmz_real_id = dmz_id(zmd, zone);
+	if (dmz_real_id >= zmd->zoned_dev->nr_zones) {
+		/* Regular dev. */
+		dmz_real_id -= zmd->zoned_dev->nr_zones;
+		WARN_ON(!zmd->regu_dmz_dev);
+
+		return (sector_t)dmz_real_id << zmd->zoned_dev->zone_nr_sectors_shift;
+	}
+	return (sector_t)dmz_real_id << zmd->zoned_dev->zone_nr_sectors_shift;
 }
 
 sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone)
 {
-	return (sector_t)dmz_id(zmd, zone) << zmd->zoned_dev->zone_nr_blocks_shift;
+	int dmz_real_id;
+
+	dmz_real_id = dmz_id(zmd, zone);
+	if (dmz_real_id >= zmd->zoned_dev->nr_zones) {
+		/* Regular dev. */
+		dmz_real_id -= zmd->zoned_dev->nr_zones;
+		WARN_ON(!zmd->regu_dmz_dev);
+
+		return (sector_t)dmz_real_id << zmd->zoned_dev->zone_nr_blocks_shift;
+	}
+
+	return (sector_t)dmz_real_id << zmd->zoned_dev->zone_nr_blocks_shift;
 }
 
 unsigned int dmz_nr_chunks(struct dmz_metadata *zmd)
@@ -403,8 +434,10 @@  static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
 	struct dmz_mblock *mblk, *m;
 	sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
 	struct bio *bio;
+	struct dmz_dev *mdev;
 
-	if (dmz_bdev_is_dying(zmd->zoned_dev))
+	mdev = zmd_mdev(zmd);
+	if (dmz_bdev_is_dying(mdev))
 		return ERR_PTR(-EIO);
 
 	/* Get a new block and a BIO to read it */
@@ -440,7 +473,7 @@  static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
 
 	/* Submit read BIO */
 	bio->bi_iter.bi_sector = dmz_blk2sect(block);
-	bio_set_dev(bio, zmd->zoned_dev->bdev);
+	bio_set_dev(bio, mdev->bdev);
 	bio->bi_private = mblk;
 	bio->bi_end_io = dmz_mblock_bio_end_io;
 	bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO);
@@ -555,7 +588,7 @@  static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
 		       TASK_UNINTERRUPTIBLE);
 	if (test_bit(DMZ_META_ERROR, &mblk->state)) {
 		dmz_release_mblock(zmd, mblk);
-		dmz_check_bdev(zmd->zoned_dev);
+		dmz_check_bdev(zmd_mdev(zmd));
 		return ERR_PTR(-EIO);
 	}
 
@@ -581,8 +614,10 @@  static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
 {
 	sector_t block = zmd->sb[set].block + mblk->no;
 	struct bio *bio;
+	struct dmz_dev *mdev;
 
-	if (dmz_bdev_is_dying(zmd->zoned_dev))
+	mdev = zmd_mdev(zmd);
+	if (dmz_bdev_is_dying(mdev))
 		return -EIO;
 
 	bio = bio_alloc(GFP_NOIO, 1);
@@ -594,7 +629,7 @@  static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
 	set_bit(DMZ_META_WRITING, &mblk->state);
 
 	bio->bi_iter.bi_sector = dmz_blk2sect(block);
-	bio_set_dev(bio, zmd->zoned_dev->bdev);
+	bio_set_dev(bio, mdev->bdev);
 	bio->bi_private = mblk;
 	bio->bi_end_io = dmz_mblock_bio_end_io;
 	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
@@ -612,8 +647,10 @@  static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
 {
 	struct bio *bio;
 	int ret;
+	struct dmz_dev *mdev;
 
-	if (dmz_bdev_is_dying(zmd->zoned_dev))
+	mdev = zmd_mdev(zmd);
+	if (dmz_bdev_is_dying(mdev))
 		return -EIO;
 
 	bio = bio_alloc(GFP_NOIO, 1);
@@ -621,14 +658,14 @@  static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
 		return -ENOMEM;
 
 	bio->bi_iter.bi_sector = dmz_blk2sect(block);
-	bio_set_dev(bio, zmd->zoned_dev->bdev);
+	bio_set_dev(bio, mdev->bdev);
 	bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO);
 	bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0);
 	ret = submit_bio_wait(bio);
 	bio_put(bio);
 
 	if (ret)
-		dmz_check_bdev(zmd->zoned_dev);
+		dmz_check_bdev(mdev);
 	return ret;
 }
 
@@ -661,7 +698,7 @@  static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
 
 	ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page);
 	if (ret == 0)
-		ret = blkdev_issue_flush(zmd->zoned_dev->bdev, GFP_NOIO, NULL);
+		ret = blkdev_issue_flush(zmd_mdev(zmd)->bdev, GFP_NOIO, NULL);
 
 	return ret;
 }
@@ -695,15 +732,20 @@  static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
 			       TASK_UNINTERRUPTIBLE);
 		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
 			clear_bit(DMZ_META_ERROR, &mblk->state);
-			dmz_check_bdev(zmd->zoned_dev);
+			dmz_check_bdev(zmd_mdev(zmd));
 			ret = -EIO;
 		}
 		nr_mblks_submitted--;
 	}
 
 	/* Flush drive cache (this will also sync data) */
-	if (ret == 0)
-		ret = blkdev_issue_flush(zmd->zoned_dev->bdev, GFP_NOIO, NULL);
+	if (ret == 0) {
+		/* Flush metadata device */
+		ret = blkdev_issue_flush(zmd_mdev(zmd)->bdev, GFP_NOIO, NULL);
+		if ((ret == 0) && zmd->regu_dmz_dev)
+			/* Flush data device. */
+			ret = blkdev_issue_flush(zmd->zoned_dev->bdev, GFP_NOIO, NULL);
+	}
 
 	return ret;
 }
@@ -760,7 +802,7 @@  int dmz_flush_metadata(struct dmz_metadata *zmd)
 	 */
 	dmz_lock_flush(zmd);
 
-	if (dmz_bdev_is_dying(zmd->zoned_dev)) {
+	if (dmz_bdev_is_dying(zmd_mdev(zmd))) {
 		ret = -EIO;
 		goto out;
 	}
@@ -772,7 +814,7 @@  int dmz_flush_metadata(struct dmz_metadata *zmd)
 
 	/* If there are no dirty metadata blocks, just flush the device cache */
 	if (list_empty(&write_list)) {
-		ret = blkdev_issue_flush(zmd->zoned_dev->bdev, GFP_NOIO, NULL);
+		ret = blkdev_issue_flush(zmd_mdev(zmd)->bdev, GFP_NOIO, NULL);
 		goto err;
 	}
 
@@ -821,7 +863,7 @@  int dmz_flush_metadata(struct dmz_metadata *zmd)
 		list_splice(&write_list, &zmd->mblk_dirty_list);
 		spin_unlock(&zmd->mblk_lock);
 	}
-	if (!dmz_check_bdev(zmd->zoned_dev))
+	if (!dmz_check_bdev(zmd_mdev(zmd)))
 		ret = -EIO;
 	goto out;
 }
@@ -832,10 +874,11 @@  int dmz_flush_metadata(struct dmz_metadata *zmd)
 static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb)
 {
 	unsigned int nr_meta_zones, nr_data_zones;
-	struct dmz_dev *dev = zmd->zoned_dev;
+	struct dmz_dev *dev;
 	u32 crc, stored_crc;
 	u64 gen;
 
+	dev = zmd_mdev(zmd);
 	gen = le64_to_cpu(sb->gen);
 	stored_crc = le32_to_cpu(sb->crc);
 	sb->crc = 0;
@@ -1131,8 +1174,11 @@  static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data)
 		zmd->nr_useable_zones++;
 		if (dmz_is_rnd(zone)) {
 			zmd->nr_rnd_zones++;
-			if (!zmd->sb_zone) {
-				/* Super block zone */
+			if (!zmd->sb_zone && !zmd->regu_dmz_dev) {
+				/*
+				 * Super block zone goes to regular
+				 * device by default.
+				 */
 				zmd->sb_zone = zone;
 			}
 		}
@@ -1157,7 +1203,8 @@  static void dmz_drop_zones(struct dmz_metadata *zmd)
 static int dmz_init_zones(struct dmz_metadata *zmd)
 {
 	struct dmz_dev *dev = zmd->zoned_dev;
-	int ret;
+	int ret, i;
+	unsigned int total_nr_zones;
 
 	/* Init */
 	zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3;
@@ -1167,7 +1214,10 @@  static int dmz_init_zones(struct dmz_metadata *zmd)
 					DMZ_BLOCK_SIZE_BITS);
 
 	/* Allocate zone array */
-	zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
+	total_nr_zones = dev->nr_zones;
+	if (zmd->regu_dmz_dev)
+		total_nr_zones += zmd->regu_dmz_dev->nr_zones;
+	zmd->zones = kcalloc(total_nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
 	if (!zmd->zones)
 		return -ENOMEM;
 
@@ -1186,6 +1236,25 @@  static int dmz_init_zones(struct dmz_metadata *zmd)
 		return ret;
 	}
 
+	if (zmd->regu_dmz_dev) {
+		/* Emulate zone information for regular device zone. */
+		for (i = 0; i < zmd->regu_dmz_dev->nr_zones; i++) {
+			struct dm_zone *zone = &zmd->zones[i + dev->nr_zones];
+
+			INIT_LIST_HEAD(&zone->link);
+			atomic_set(&zone->refcount, 0);
+			zone->chunk = DMZ_MAP_UNMAPPED;
+
+			set_bit(DMZ_RND, &zone->flags);
+			zmd->nr_rnd_zones++;
+			zmd->nr_useable_zones++;
+			zone->wp_block = 0;
+			if (!zmd->sb_zone)
+				/* Super block zone */
+				zmd->sb_zone = zone;
+		}
+	}
+
 	return 0;
 }
 
@@ -1313,13 +1382,13 @@  static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone);
  */
 static int dmz_load_mapping(struct dmz_metadata *zmd)
 {
-	struct dmz_dev *dev = zmd->zoned_dev;
 	struct dm_zone *dzone, *bzone;
 	struct dmz_mblock *dmap_mblk = NULL;
 	struct dmz_map *dmap;
 	unsigned int i = 0, e = 0, chunk = 0;
 	unsigned int dzone_id;
 	unsigned int bzone_id;
+	struct dmz_dev *dev = zmd_mdev(zmd);
 
 	/* Metadata block array for the chunk mapping table */
 	zmd->map_mblk = kcalloc(zmd->nr_map_blocks,
@@ -1345,7 +1414,7 @@  static int dmz_load_mapping(struct dmz_metadata *zmd)
 		if (dzone_id == DMZ_MAP_UNMAPPED)
 			goto next;
 
-		if (dzone_id >= dev->nr_zones) {
+		if (dzone_id >= dev->target->nr_zones) {
 			dmz_dev_err(dev, "Chunk %u mapping: invalid data zone ID %u",
 				    chunk, dzone_id);
 			return -EIO;
@@ -1366,7 +1435,7 @@  static int dmz_load_mapping(struct dmz_metadata *zmd)
 		if (bzone_id == DMZ_MAP_UNMAPPED)
 			goto next;
 
-		if (bzone_id >= dev->nr_zones) {
+		if (bzone_id >= dev->target->nr_zones) {
 			dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone ID %u",
 				    chunk, bzone_id);
 			return -EIO;
@@ -1398,7 +1467,7 @@  static int dmz_load_mapping(struct dmz_metadata *zmd)
 	 * fully initialized. All remaining zones are unmapped data
 	 * zones. Finish initializing those here.
 	 */
-	for (i = 0; i < dev->nr_zones; i++) {
+	for (i = 0; i < dev->target->nr_zones; i++) {
 		dzone = dmz_get(zmd, i);
 		if (dmz_is_meta(dzone))
 			continue;
@@ -1632,7 +1701,7 @@  struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chu
 		/* Allocate a random zone */
 		dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
 		if (!dzone) {
-			if (dmz_bdev_is_dying(zmd->zoned_dev)) {
+			if (dmz_bdev_is_dying(zmd_mdev(zmd))) {
 				dzone = ERR_PTR(-EIO);
 				goto out;
 			}
@@ -1733,7 +1802,7 @@  struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
 	/* Allocate a random zone */
 	bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
 	if (!bzone) {
-		if (dmz_bdev_is_dying(zmd->zoned_dev)) {
+		if (dmz_bdev_is_dying(zmd_mdev(zmd))) {
 			bzone = ERR_PTR(-EIO);
 			goto out;
 		}
@@ -2360,7 +2429,8 @@  static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
 /*
  * Initialize the zoned metadata.
  */
-int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
+int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_dev *regu_dmz_dev,
+		struct dmz_metadata **metadata)
 {
 	struct dmz_metadata *zmd;
 	unsigned int i, zid;
@@ -2372,6 +2442,7 @@  int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
 		return -ENOMEM;
 
 	zmd->zoned_dev = dev;
+	zmd->regu_dmz_dev = regu_dmz_dev;
 	zmd->mblk_rbtree = RB_ROOT;
 	init_rwsem(&zmd->mblk_sem);
 	mutex_init(&zmd->mblk_flush_lock);
@@ -2440,9 +2511,9 @@  int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
 		     bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
 		     "aware" : "managed");
 	dmz_dev_info(dev, "  %llu 512-byte logical sectors",
-		     (u64)dev->capacity);
+		     (u64)dev->capacity  + (u64)regu_dmz_dev->capacity);
 	dmz_dev_info(dev, "  %u zones of %llu 512-byte logical sectors",
-		     dev->nr_zones, (u64)dev->zone_nr_sectors);
+		     dev->nr_zones + regu_dmz_dev->nr_zones, (u64)dev->zone_nr_sectors);
 	dmz_dev_info(dev, "  %u metadata zones",
 		     zmd->nr_meta_zones * 2);
 	dmz_dev_info(dev, "  %u data zones for %u chunks",
@@ -2488,7 +2559,7 @@  void dmz_dtr_metadata(struct dmz_metadata *zmd)
  */
 int dmz_resume_metadata(struct dmz_metadata *zmd)
 {
-	struct dmz_dev *dev = zmd->zoned_dev;
+	struct dmz_dev *dev = zmd_mdev(zmd);
 	struct dm_zone *zone;
 	sector_t wp_block;
 	unsigned int i;
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index cae4bfe..41dbb9d 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -803,7 +803,7 @@  static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	/* Initialize metadata */
 	dev = dmz->zoned_dev;
-	ret = dmz_ctr_metadata(dev, &dmz->metadata);
+	ret = dmz_ctr_metadata(dev, dmz->regu_dmz_dev, &dmz->metadata);
 	if (ret) {
 		ti->error = "Metadata initialization failed";
 		goto err_dev;
@@ -852,8 +852,8 @@  static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 	mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
 
-	/* Initialize reclaim */
-	ret = dmz_ctr_reclaim(dev, dmz->metadata, &dmz->reclaim);
+	/* Initialize reclaim, only reclaim from regular device. */
+	ret = dmz_ctr_reclaim(dmz->regu_dmz_dev, dmz->metadata, &dmz->reclaim);
 	if (ret) {
 		ti->error = "Zone reclaim initialization failed";
 		goto err_fwq;
diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
index a3535bc..7aa1a30 100644
--- a/drivers/md/dm-zoned.h
+++ b/drivers/md/dm-zoned.h
@@ -206,7 +206,8 @@  struct dmz_reclaim;
 /*
  * Functions defined in dm-zoned-metadata.c
  */
-int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **zmd);
+int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_dev *regu_dmz_dev,
+		struct dmz_metadata **zmd);
 void dmz_dtr_metadata(struct dmz_metadata *zmd);
 int dmz_resume_metadata(struct dmz_metadata *zmd);