[v3,06/10] scsi: sd_zbc: emulate ZONE_APPEND commands

Message ID	20200327165012.34443-7-johannes.thumshirn@wdc.com (mailing list archive)
State	Superseded
Headers	show Return-Path: <SRS0=+SnX=5M=vger.kernel.org=linux-scsi-owner@kernel.org> IronPort-SDR: zjbTxuY+SkXtjZq2ovHKodRf6VTZyzpdejuWW+Ca6QD+mT5Iv//5EcHA/upf3NntInP5hbP8Fr 5/AMlOErrRdwGZOzosqjjiwE0rkCnHsctk24Up6HKNaAQyvXxJe0soIGurI/sJyiElzlet2ORk WtDofchTKOseH1MRUcHXgAD/rA3vYPggeq7t2/rQ8awzbqbZ1eIETD0TFEocBOl20ZCTedoilV eq0W+M0Zd29MpQ77+Jn/34hhZe2ve3gcVIuQeGdU0T+34Zga04I9pqphMTBC2OuCXWhkD2uYuI dQY= IronPort-SDR: 93DFMNvViBoVqoB2GQA17XbPk6dxbXeaiP5DGVsVElIyWU4aDCP3BxG3B2zzBsg4wTRri+C1+m +2ZTH11cT7yJ5+xf2tkmPFSxlUvRdRo4lEpDLUIWMETljMu7ySqGSKxwW+/n23TmDweb0HztJn ms+/MUGWJvxemODvmNW3H2kod+Kvq2Y0L/KHrmszKA8oOnGsYpTkNmmYn4/SrAxhG6bVNy/PXH RPNO5kIwxgMNw9HS8CjjQZeN+Xapm+QinAYBuzxJYzBFX45HRI1nazeeF4txPG5YMFiJCL4eOx ccHbnweCWqpU5D9ccgTfjtR5 IronPort-SDR: DmjFkXgJbaHi+4iUXg4DvnX3X7T7cRYgDo/gGyTYS9q917uwehgSlTG7aOnJPc5GS9axlrJ1n0 +W2tlY5ErstEO6Rdp/KzszGnsMYYj50mMvaIXmKJyocZOIsz5R3KGdh97zOuuQgaUuUSOuTKTh jWQTUbktHnG5KiDAOwhbUrx3T0Ow4otb1WfFMHVkEI2Q/vm3pw8CYCVcy5mXZdSgVfDpe8Yg37 bzMlc82bkBx/A2SvJIidisuolPHO+QOtbKopb09P6RWkRF16WkiSl+K1z3oKfVa5WrrUwvQwRq /g8= WDCIronportException: Internal From: Johannes Thumshirn <johannes.thumshirn@wdc.com> To: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@infradead.org>, linux-block <linux-block@vger.kernel.org>, Damien Le Moal <Damien.LeMoal@wdc.com>, Keith Busch <kbusch@kernel.org>, "linux-scsi @ vger . kernel . org" <linux-scsi@vger.kernel.org>, "Martin K . Petersen" <martin.petersen@oracle.com>, "linux-fsdevel @ vger . kernel . org" <linux-fsdevel@vger.kernel.org>, Johannes Thumshirn <johannes.thumshirn@wdc.com> Subject: [PATCH v3 06/10] scsi: sd_zbc: emulate ZONE_APPEND commands Date: Sat, 28 Mar 2020 01:50:08 +0900 Message-Id: <20200327165012.34443-7-johannes.thumshirn@wdc.com> In-Reply-To: <20200327165012.34443-1-johannes.thumshirn@wdc.com> References: <20200327165012.34443-1-johannes.thumshirn@wdc.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Sender: linux-scsi-owner@vger.kernel.org Precedence: bulk
Series	Introduce Zone Append for writing to zoned block devices \| expand [v3,00/10] Introduce Zone Append for writing to zoned block devices [v3,01/10] block: provide fallbacks for blk_queue_zone_is_seq and blk_queue_zone_no [v3,02/10] block: Introduce REQ_OP_ZONE_APPEND [v3,03/10] block: introduce blk_req_zone_write_trylock [v3,04/10] block: Introduce zone write pointer offset caching [v3,05/10] scsi: sd_zbc: factor out sanity checks for zoned commands [v3,06/10] scsi: sd_zbc: emulate ZONE_APPEND commands [v3,07/10] null_blk: Cleanup zoned device initialization [v3,08/10] null_blk: Support REQ_OP_ZONE_APPEND [v3,09/10] block: export bio_release_pages and bio_iov_iter_get_pages [v3,10/10] zonefs: use REQ_OP_ZONE_APPEND for sync DIO

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 707f47c0ec98..18584bf01e11 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1215,6 +1215,12 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) else protect = 0; + if (req_op(rq) == REQ_OP_ZONE_APPEND) { + ret = sd_zbc_prepare_zone_append(cmd, &lba, nr_blocks); + if (ret) + return ret; + } + if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) { ret = sd_setup_rw32_cmnd(cmd, write, lba, nr_blocks, protect | fua); @@ -1287,6 +1293,7 @@ static blk_status_t sd_init_command(struct scsi_cmnd *cmd) return sd_setup_flush_cmnd(cmd); case REQ_OP_READ: case REQ_OP_WRITE: + case REQ_OP_ZONE_APPEND: return sd_setup_read_write_cmnd(cmd); case REQ_OP_ZONE_RESET: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER, @@ -2055,7 +2062,7 @@ static int sd_done(struct scsi_cmnd *SCpnt) out: if (sd_is_zoned(sdkp)) - sd_zbc_complete(SCpnt, good_bytes, &sshdr); + good_bytes = sd_zbc_complete(SCpnt, good_bytes, &sshdr); SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt, "sd_done: completed %d of %d bytes\n", @@ -3370,6 +3377,8 @@ static int sd_probe(struct device *dev) sdkp->first_scan = 1; sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS; + sd_zbc_init_disk(sdkp); + sd_revalidate_disk(gd); gd->flags = GENHD_FL_EXT_DEVT; @@ -3663,19 +3672,26 @@ static int __init init_sd(void) if (!sd_page_pool) { printk(KERN_ERR "sd: can't init discard page pool\n"); err = -ENOMEM; - goto err_out_ppool; + goto err_out_cdb_pool; } + err = sd_zbc_init(); + if (err) + goto err_out_ppool; + err = scsi_register_driver(&sd_template.gendrv); if (err) - goto err_out_driver; + goto err_out_zbc; return 0; -err_out_driver: - mempool_destroy(sd_page_pool); +err_out_zbc: + sd_zbc_exit(); err_out_ppool: + mempool_destroy(sd_page_pool); + +err_out_cdb_pool: mempool_destroy(sd_cdb_pool); err_out_cache: @@ -3705,6 +3721,8 @@ static void __exit exit_sd(void) mempool_destroy(sd_page_pool); kmem_cache_destroy(sd_cdb_cache); + sd_zbc_exit(); + class_unregister(&sd_disk_class); for (i = 0; i < SD_MAJORS; i++) { diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 50fff0bf8c8e..34641be1d434 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -79,6 +79,7 @@ struct scsi_disk { u32 zones_optimal_open; u32 zones_optimal_nonseq; u32 zones_max_open; + spinlock_t zone_wp_ofst_lock; #endif atomic_t openers; sector_t capacity; /* size in logical blocks */ @@ -207,17 +208,33 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp) #ifdef CONFIG_BLK_DEV_ZONED +int __init sd_zbc_init(void); +void sd_zbc_exit(void); + +void sd_zbc_init_disk(struct scsi_disk *sdkp); extern int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer); extern void sd_zbc_print_zones(struct scsi_disk *sdkp); blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, unsigned char op, bool all); -extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, - struct scsi_sense_hdr *sshdr); +unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, + struct scsi_sense_hdr *sshdr); int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); +blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, + unsigned int nr_blocks); + #else /* CONFIG_BLK_DEV_ZONED */ +static inline int sd_zbc_init(void) +{ + return 0; +} + +static inline void sd_zbc_exit(void) {} + +static inline void sd_zbc_init_disk(struct scsi_disk *sdkp) {} + static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) { @@ -233,9 +250,18 @@ static inline blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, return BLK_STS_TARGET; } -static inline void sd_zbc_complete(struct scsi_cmnd *cmd, - unsigned int good_bytes, - struct scsi_sense_hdr *sshdr) {} +static inline unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, + unsigned int good_bytes, struct scsi_sense_hdr *sshdr) +{ + return 0; +} + +static inline blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, + sector_t *lba, + unsigned int nr_blocks) +{ + return BLK_STS_TARGET; +} #define sd_zbc_report_zones NULL diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index ee156fbf3780..17bdc50d29f3 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -19,6 +19,11 @@ #include "sd.h" +static struct kmem_cache *sd_zbc_zone_work_cache; +static mempool_t *sd_zbc_zone_work_pool; + +#define SD_ZBC_ZONE_WORK_MEMPOOL_SIZE 8 + static int sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf, unsigned int idx, report_zones_cb cb, void *data) { @@ -229,6 +234,152 @@ static blk_status_t sd_zbc_cmnd_checks(struct scsi_cmnd *cmd) return BLK_STS_OK; } +#define SD_ZBC_INVALID_WP_OFST ~(0u) +#define SD_ZBC_UPDATING_WP_OFST (SD_ZBC_INVALID_WP_OFST - 1) + +struct sd_zbc_zone_work { + struct work_struct work; + struct scsi_disk *sdkp; + unsigned int zno; + char buf[SD_BUF_SIZE]; +}; + +static int sd_zbc_update_wp_ofst_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct sd_zbc_zone_work *zwork = data; + struct scsi_disk *sdkp = zwork->sdkp; + struct request_queue *q = sdkp->disk->queue; + int ret; + + spin_lock_bh(&sdkp->zone_wp_ofst_lock); + ret = blk_get_zone_wp_offset(zone, &q->seq_zones_wp_ofst[zwork->zno]); + if (ret) + q->seq_zones_wp_ofst[zwork->zno] = SD_ZBC_INVALID_WP_OFST; + spin_unlock_bh(&sdkp->zone_wp_ofst_lock); + + return ret; +} + +static void sd_zbc_update_wp_ofst_workfn(struct work_struct *work) +{ + struct sd_zbc_zone_work *zwork; + struct scsi_disk *sdkp; + int ret; + + zwork = container_of(work, struct sd_zbc_zone_work, work); + sdkp = zwork->sdkp; + + ret = sd_zbc_do_report_zones(sdkp, zwork->buf, SD_BUF_SIZE, + zwork->zno * sdkp->zone_blocks, true); + if (!ret) + sd_zbc_parse_report(sdkp, zwork->buf + 64, 0, + sd_zbc_update_wp_ofst_cb, zwork); + + mempool_free(zwork, sd_zbc_zone_work_pool); + scsi_device_put(sdkp->device); +} + +static blk_status_t sd_zbc_update_wp_ofst(struct scsi_disk *sdkp, + unsigned int zno) +{ + struct sd_zbc_zone_work *zwork; + + /* + * We are about to schedule work to update a zone write pointer offset, + * which will cause the zone append command to be requeued. So make + * sure that the scsi device does not go away while the work is + * being processed. + */ + if (scsi_device_get(sdkp->device)) + return BLK_STS_IOERR; + + zwork = mempool_alloc(sd_zbc_zone_work_pool, GFP_ATOMIC); + if (!zwork) { + /* Retry later */ + scsi_device_put(sdkp->device); + return BLK_STS_RESOURCE; + } + + memset(zwork, 0, sizeof(struct sd_zbc_zone_work)); + INIT_WORK(&zwork->work, sd_zbc_update_wp_ofst_workfn); + zwork->sdkp = sdkp; + zwork->zno = zno; + + sdkp->disk->queue->seq_zones_wp_ofst[zno] = SD_ZBC_UPDATING_WP_OFST; + + schedule_work(&zwork->work); + + return BLK_STS_RESOURCE; +} + +/** + * sd_zbc_prepare_zone_append() - Prepare an emulated ZONE_APPEND command. + * @cmd: the command to setup + * @lba: the LBA to patch + * @nr_blocks: the number of LBAs to be written + * + * Called from sd_setup_read_write_cmnd() for REQ_OP_ZONE_APPEND. + * @sd_zbc_prepare_zone_append() handles the necessary zone wrote locking and + * patching of the lba for an emulated ZONE_APPEND command. + * + * In case the cached write pointer offset is %SD_ZBC_INVALID_WP_OFST it will + * schedule a REPORT ZONES command and return BLK_STS_IOERR. + */ +blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, + unsigned int nr_blocks) +{ + struct request *rq = cmd->request; + struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + unsigned int wp_ofst, zno = blk_rq_zone_no(rq); + blk_status_t ret; + + ret = sd_zbc_cmnd_checks(cmd); + if (ret != BLK_STS_OK) + return ret; + + if (!blk_rq_zone_is_seq(rq)) + return BLK_STS_IOERR; + + /* Unlock of the write lock will happen in sd_zbc_complete() */ + if (!blk_req_zone_write_trylock(rq)) + return BLK_STS_ZONE_RESOURCE; + + spin_lock_bh(&sdkp->zone_wp_ofst_lock); + + wp_ofst = rq->q->seq_zones_wp_ofst[zno]; + + if (wp_ofst == SD_ZBC_UPDATING_WP_OFST) { + /* Write pointer offset update in progress: ask for a requeue */ + ret = BLK_STS_RESOURCE; + goto err; + } + + if (wp_ofst == SD_ZBC_INVALID_WP_OFST) { + /* Invalid write pointer offset: trigger an update from disk */ + ret = sd_zbc_update_wp_ofst(sdkp, zno); + goto err; + } + + wp_ofst = sectors_to_logical(sdkp->device, wp_ofst); + if (wp_ofst + nr_blocks > sdkp->zone_blocks) { + ret = BLK_STS_IOERR; + goto err; + } + + /* Set the LBA for the write command used to emulate zone append */ + *lba += wp_ofst; + + spin_unlock_bh(&sdkp->zone_wp_ofst_lock); + + return BLK_STS_OK; + +err: + spin_unlock_bh(&sdkp->zone_wp_ofst_lock); + blk_req_zone_write_unlock(rq); + return ret; +} + /** * sd_zbc_setup_zone_mgmt_cmnd - Prepare a zone ZBC_OUT command. The operations * can be RESET WRITE POINTER, OPEN, CLOSE or FINISH. @@ -266,25 +417,75 @@ blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, cmd->transfersize = 0; cmd->allowed = 0; + /* Only zone reset and zone finish need zone write locking */ + if (op != ZO_RESET_WRITE_POINTER && op != ZO_FINISH_ZONE) + return BLK_STS_OK; + + if (all) { + /* We do not write lock all zones for an all zone reset */ + if (op == ZO_RESET_WRITE_POINTER) + return BLK_STS_OK; + + /* Finishing all zones is not supported */ + return BLK_STS_IOERR; + } + + if (!blk_rq_zone_is_seq(rq)) + return BLK_STS_IOERR; + + if (!blk_req_zone_write_trylock(rq)) + return BLK_STS_ZONE_RESOURCE; + return BLK_STS_OK; } +static inline bool sd_zbc_zone_needs_write_unlock(struct request *rq) +{ + /* + * For zone append, the zone was locked in sd_zbc_prepare_zone_append(). + * For zone reset and zone finish, the zone was locked in + * sd_zbc_setup_zone_mgmt_cmnd(). + * For regular writes, the zone is unlocked by the block layer elevator. + */ + return req_op(rq) == REQ_OP_ZONE_APPEND || + req_op(rq) == REQ_OP_ZONE_RESET || + req_op(rq) == REQ_OP_ZONE_FINISH; +} + +static bool sd_zbc_need_zone_wp_update(struct request *rq) +{ + if (req_op(rq) == REQ_OP_WRITE || + req_op(rq) == REQ_OP_WRITE_ZEROES || + req_op(rq) == REQ_OP_WRITE_SAME) + return blk_rq_zone_is_seq(rq); + + if (req_op(rq) == REQ_OP_ZONE_RESET_ALL) + return true; + + return sd_zbc_zone_needs_write_unlock(rq); +} + /** * sd_zbc_complete - ZBC command post processing. * @cmd: Completed command * @good_bytes: Command reply bytes * @sshdr: command sense header * - * Called from sd_done(). Process report zones reply and handle reset zone - * and write commands errors. + * Called from sd_done() to handle zone commands errors and updates to the + * device queue zone write pointer offset cahce. */ -void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, +unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr) { int result = cmd->result; struct request *rq = cmd->request; + struct request_queue *q = rq->q; + struct gendisk *disk = rq->rq_disk; + struct scsi_disk *sdkp = scsi_disk(disk); + enum req_opf op = req_op(rq); + unsigned int zno; - if (op_is_zone_mgmt(req_op(rq)) && + if (op_is_zone_mgmt(op) && result && sshdr->sense_key == ILLEGAL_REQUEST && sshdr->asc == 0x24) { @@ -294,7 +495,69 @@ void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, * so be quiet about the error. */ rq->rq_flags |= RQF_QUIET; + goto unlock_zone; + } + + if (!sd_zbc_need_zone_wp_update(rq)) + goto unlock_zone; + + /* + * If we got an error for a command that needs updating the write + * pointer offset cache, we must mark the zone wp offset entry as + * invalid to force an update from disk the next time a zone append + * command is issued. + */ + zno = blk_rq_zone_no(rq); + spin_lock_bh(&sdkp->zone_wp_ofst_lock); + + if (result && op != REQ_OP_ZONE_RESET_ALL) { + if (op == REQ_OP_ZONE_APPEND) { + /* Force complete completion (no retry) */ + good_bytes = 0; + scsi_set_resid(cmd, blk_rq_bytes(rq)); + } + + /* + * Force an update of the zone write pointer offset on + * the next zone append access. + */ + if (q->seq_zones_wp_ofst[zno] != SD_ZBC_UPDATING_WP_OFST) + q->seq_zones_wp_ofst[zno] = SD_ZBC_INVALID_WP_OFST; + goto unlock_wp_ofst; } + + switch (op) { + case REQ_OP_ZONE_APPEND: + rq->__sector += q->seq_zones_wp_ofst[zno]; + /* fallthrough */ + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE: + if (q->seq_zones_wp_ofst[zno] < sd_zbc_zone_sectors(sdkp)) + q->seq_zones_wp_ofst[zno] += good_bytes >> SECTOR_SHIFT; + break; + case REQ_OP_ZONE_RESET: + q->seq_zones_wp_ofst[zno] = 0; + break; + case REQ_OP_ZONE_FINISH: + q->seq_zones_wp_ofst[zno] = sd_zbc_zone_sectors(sdkp); + break; + case REQ_OP_ZONE_RESET_ALL: + memset(q->seq_zones_wp_ofst, 0, + sdkp->nr_zones * sizeof(unsigned int)); + break; + default: + break; + } + +unlock_wp_ofst: + spin_unlock_bh(&sdkp->zone_wp_ofst_lock); + +unlock_zone: + if (sd_zbc_zone_needs_write_unlock(rq)) + blk_req_zone_write_unlock(rq); + + return good_bytes; } /** @@ -399,6 +662,7 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp, unsigned char *buf, int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) { struct gendisk *disk = sdkp->disk; + struct request_queue *q = disk->queue; unsigned int nr_zones; u32 zone_blocks = 0; int ret; @@ -421,9 +685,12 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) goto err; /* The drive satisfies the kernel restrictions: set it up */ - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, sdkp->disk->queue); - blk_queue_required_elevator_features(sdkp->disk->queue, - ELEVATOR_F_ZBD_SEQ_WRITE); + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); + blk_queue_flag_set(QUEUE_FLAG_ZONE_WP_OFST, q); + blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); + blk_queue_max_zone_append_sectors(q, + min_t(u32, logical_to_sectors(sdkp->device, zone_blocks), + q->limits.max_segments << (PAGE_SHIFT - SECTOR_SHIFT))); nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); /* READ16/WRITE16 is mandatory for ZBC disks */ @@ -475,3 +742,38 @@ void sd_zbc_print_zones(struct scsi_disk *sdkp) sdkp->nr_zones, sdkp->zone_blocks); } + +void sd_zbc_init_disk(struct scsi_disk *sdkp) +{ + if (!sd_is_zoned(sdkp)) + return; + + spin_lock_init(&sdkp->zone_wp_ofst_lock); +} + +int __init sd_zbc_init(void) +{ + sd_zbc_zone_work_cache = + kmem_cache_create("sd_zbc_zone_work", + sizeof(struct sd_zbc_zone_work), + 0, 0, NULL); + if (!sd_zbc_zone_work_cache) + return -ENOMEM; + + sd_zbc_zone_work_pool = + mempool_create_slab_pool(SD_ZBC_ZONE_WORK_MEMPOOL_SIZE, + sd_zbc_zone_work_cache); + if (!sd_zbc_zone_work_pool) { + kmem_cache_destroy(sd_zbc_zone_work_cache); + printk(KERN_ERR "sd_zbc: create zone work pool failed\n"); + return -ENOMEM; + } + + return 0; +} + +void sd_zbc_exit(void) +{ + mempool_destroy(sd_zbc_zone_work_pool); + kmem_cache_destroy(sd_zbc_zone_work_cache); +}

[v3,06/10] scsi: sd_zbc: emulate ZONE_APPEND commands

Commit Message

Comments

Patch