@@ -679,6 +679,48 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
}
EXPORT_SYMBOL(bio_clone_fast);
+static bool bio_try_merge_zone_append_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int off)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+ unsigned long mask = queue_segment_boundary(q);
+ phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset;
+ phys_addr_t addr2 = page_to_phys(page) + off + len - 1;
+ bool same_page = false;
+
+ if ((addr1 | mask) != (addr2 | mask))
+ return false;
+ if (bv->bv_len + len > queue_max_segment_size(q))
+ return false;
+ return __bio_try_merge_page(bio, page, len, off, &same_page);
+}
+
+static int bio_add_append_page(struct bio *bio, struct page *page, unsigned len,
+ size_t offset)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+ unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
+
+ if (WARN_ON_ONCE(!max_append_sectors))
+ return 0;
+
+ if (((bio->bi_iter.bi_size + len) >> 9) > max_append_sectors)
+ return 0;
+
+ if (bio->bi_vcnt > 0) {
+ if (bio_try_merge_zone_append_page(bio, page, len, offset))
+ return len;
+ }
+
+ if (bio->bi_vcnt >= queue_max_segments(q))
+ return 0;
+
+ __bio_add_page(bio, page, len, offset);
+
+ return len;
+}
+
static inline bool page_is_mergeable(const struct bio_vec *bv,
struct page *page, unsigned int len, unsigned int off,
bool *same_page)
@@ -866,6 +908,7 @@ int bio_add_page(struct bio *bio, struct page *page,
if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
if (bio_full(bio, len))
return 0;
+
__bio_add_page(bio, page, len, offset);
}
return len;
@@ -927,6 +970,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
ssize_t size, left;
unsigned len, i;
size_t offset;
+ unsigned op = bio_op(bio);
/*
* Move page array up in the allocated memory for the bio vecs as far as
@@ -944,13 +988,20 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
struct page *page = pages[i];
len = min_t(size_t, PAGE_SIZE - offset, left);
+ if (op == REQ_OP_ZONE_APPEND) {
+ int ret;
- if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
+ ret = bio_add_append_page(bio, page, len, offset);
+ if (ret != len)
+ return -EINVAL;
+ } else if (__bio_try_merge_page(bio, page, len, offset,
+ &same_page)) {
if (same_page)
put_page(page);
} else {
if (WARN_ON_ONCE(bio_full(bio, len)))
return -EINVAL;
+
__bio_add_page(bio, page, len, offset);
}
offset = 0;
@@ -1895,6 +1946,10 @@ struct bio *bio_split(struct bio *bio, int sectors,
BUG_ON(sectors <= 0);
BUG_ON(sectors >= bio_sectors(bio));
+ /* Zone append commands cannot be split */
+ if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
+ return NULL;
+
split = bio_clone_fast(bio, gfp, bs);
if (!split)
return NULL;
@@ -135,6 +135,7 @@ static const char *const blk_op_name[] = {
REQ_OP_NAME(ZONE_OPEN),
REQ_OP_NAME(ZONE_CLOSE),
REQ_OP_NAME(ZONE_FINISH),
+ REQ_OP_NAME(ZONE_APPEND),
REQ_OP_NAME(WRITE_SAME),
REQ_OP_NAME(WRITE_ZEROES),
REQ_OP_NAME(SCSI_IN),
@@ -240,6 +241,17 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
bio_advance(bio, nbytes);
+ if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
+ /*
+ * Partial zone append completions cannot be supported as the
+ * BIO fragments may end up not being written sequentially.
+ */
+ if (bio->bi_iter.bi_size)
+ bio->bi_status = BLK_STS_IOERR;
+ else
+ bio->bi_iter.bi_sector = rq->__sector;
+ }
+
/* don't actually finish bio if it's part of flush sequence */
if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
bio_endio(bio);
@@ -865,6 +877,41 @@ static inline int blk_partition_remap(struct bio *bio)
return ret;
}
+/*
+ * Check write append to a zoned block device.
+ */
+static inline blk_status_t blk_check_zone_append(struct request_queue *q,
+ struct bio *bio)
+{
+ sector_t pos = bio->bi_iter.bi_sector;
+ int nr_sectors = bio_sectors(bio);
+
+ /* Only applicable to zoned block devices */
+ if (!blk_queue_is_zoned(q))
+ return BLK_STS_NOTSUPP;
+
+ /* The bio sector must point to the start of a sequential zone */
+ if (pos & (blk_queue_zone_sectors(q) - 1) ||
+ !blk_queue_zone_is_seq(q, pos))
+ return BLK_STS_IOERR;
+
+ /*
+ * Not allowed to cross zone boundaries. Otherwise, the BIO will be
+ * split and could result in non-contiguous sectors being written in
+ * different zones.
+ */
+ if (blk_queue_zone_no(q, pos) != blk_queue_zone_no(q, pos + nr_sectors))
+ return BLK_STS_IOERR;
+
+ /* Make sure the BIO is small enough and will not get split */
+ if (nr_sectors > q->limits.max_zone_append_sectors)
+ return BLK_STS_IOERR;
+
+ bio->bi_opf |= REQ_NOMERGE;
+
+ return BLK_STS_OK;
+}
+
static noinline_for_stack bool
generic_make_request_checks(struct bio *bio)
{
@@ -937,6 +984,11 @@ generic_make_request_checks(struct bio *bio)
if (!q->limits.max_write_same_sectors)
goto not_supported;
break;
+ case REQ_OP_ZONE_APPEND:
+ status = blk_check_zone_append(q, bio);
+ if (status != BLK_STS_OK)
+ goto end_io;
+ break;
case REQ_OP_ZONE_RESET:
case REQ_OP_ZONE_OPEN:
case REQ_OP_ZONE_CLOSE:
@@ -1178,6 +1178,19 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */
+static void blk_mq_handle_zone_resource(struct request *rq,
+ struct list_head *zone_list)
+{
+ /*
+ * If we end up here it is because we cannot dispatch a request to a
+ * specific zone due to LLD level zone-write locking or other zone
+ * related resource not being available. In this case, set the request
+ * aside in zone_list for retrying it later.
+ */
+ list_add(&rq->queuelist, zone_list);
+ __blk_mq_requeue_request(rq);
+}
+
/*
* Returns true if we did some work AND can potentially do more.
*/
@@ -1189,6 +1202,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
bool no_tag = false;
int errors, queued;
blk_status_t ret = BLK_STS_OK;
+ LIST_HEAD(zone_list);
if (list_empty(list))
return false;
@@ -1257,6 +1271,16 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
list_add(&rq->queuelist, list);
__blk_mq_requeue_request(rq);
break;
+ } else if (ret == BLK_STS_ZONE_RESOURCE) {
+ /*
+ * Move the request to zone_list and keep going through
+ * the dispatch list to find more requests the drive can
+ * accept.
+ */
+ blk_mq_handle_zone_resource(rq, &zone_list);
+ if (list_empty(list))
+ break;
+ continue;
}
if (unlikely(ret != BLK_STS_OK)) {
@@ -1268,6 +1292,9 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
queued++;
} while (!list_empty(list));
+ if (!list_empty(&zone_list))
+ list_splice_tail_init(&zone_list, list);
+
hctx->dispatched[queued_to_index(queued)]++;
/*
@@ -48,6 +48,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->chunk_sectors = 0;
lim->max_write_same_sectors = 0;
lim->max_write_zeroes_sectors = 0;
+ lim->max_zone_append_sectors = 0;
lim->max_discard_sectors = 0;
lim->max_hw_discard_sectors = 0;
lim->discard_granularity = 0;
@@ -83,6 +84,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_dev_sectors = UINT_MAX;
lim->max_write_same_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
+ lim->max_zone_append_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -257,6 +259,25 @@ void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
}
EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
+/**
+ * blk_queue_max_zone_append_sectors - set max sectors for a single zone append
+ * @q: the request queue for the device
+ * @max_zone_append_sectors: maximum number of sectors to write per command
+ **/
+void blk_queue_max_zone_append_sectors(struct request_queue *q,
+ unsigned int max_zone_append_sectors)
+{
+ unsigned int max_sectors;
+
+ max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors);
+ if (max_sectors)
+ max_sectors = min_not_zero(q->limits.chunk_sectors,
+ max_sectors);
+
+ q->limits.max_zone_append_sectors = max_sectors;
+}
+EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors);
+
/**
* blk_queue_max_segments - set max hw segments for a request for this queue
* @q: the request queue for the device
@@ -506,6 +527,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
b->max_write_same_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
b->max_write_zeroes_sectors);
+ t->max_zone_append_sectors = min(t->max_zone_append_sectors,
+ b->max_zone_append_sectors);
t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
@@ -218,6 +218,13 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
(unsigned long long)q->limits.max_write_zeroes_sectors << 9);
}
+static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
+{
+ unsigned long long max_sectors = q->limits.max_zone_append_sectors;
+
+ return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT);
+}
+
static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
{
@@ -639,6 +646,11 @@ static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
.show = queue_write_zeroes_max_show,
};
+static struct queue_sysfs_entry queue_zone_append_max_entry = {
+ .attr = {.name = "zone_append_max_bytes", .mode = 0444 },
+ .show = queue_zone_append_max_show,
+};
+
static struct queue_sysfs_entry queue_nonrot_entry = {
.attr = {.name = "rotational", .mode = 0644 },
.show = queue_show_nonrot,
@@ -749,6 +761,7 @@ static struct attribute *queue_attrs[] = {
&queue_discard_zeroes_data_entry.attr,
&queue_write_same_max_entry.attr,
&queue_write_zeroes_max_entry.attr,
+ &queue_zone_append_max_entry.attr,
&queue_nonrot_entry.attr,
&queue_zoned_entry.attr,
&queue_nr_zones_entry.attr,
@@ -1706,6 +1706,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
case BLK_STS_OK:
break;
case BLK_STS_RESOURCE:
+ case BLK_STS_ZONE_RESOURCE:
if (atomic_read(&sdev->device_busy) ||
scsi_device_blocked(sdev))
ret = BLK_STS_DEV_RESOURCE;
@@ -63,6 +63,18 @@ typedef u8 __bitwise blk_status_t;
*/
#define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13)
+/*
+ * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone
+ * related resources are unavailable, but the driver can guarantee the queue
+ * will be rerun in the future once the resources become available again.
+ *
+ * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references
+ * a zone specific resource and IO to a different zone on the same device could
+ * still be served. Examples of that are zones that are write-locked, but a read
+ * to the same zone could be served.
+ */
+#define BLK_STS_ZONE_RESOURCE ((__force blk_status_t)14)
+
/**
* blk_path_error - returns true if error may be path related
* @error: status the request was completed with
@@ -296,6 +308,8 @@ enum req_opf {
REQ_OP_ZONE_CLOSE = 11,
/* Transition a zone to full */
REQ_OP_ZONE_FINISH = 12,
+ /* write data at the current zone write pointer */
+ REQ_OP_ZONE_APPEND = 13,
/* SCSI passthrough using struct scsi_request */
REQ_OP_SCSI_IN = 32,
@@ -336,6 +336,7 @@ struct queue_limits {
unsigned int max_hw_discard_sectors;
unsigned int max_write_same_sectors;
unsigned int max_write_zeroes_sectors;
+ unsigned int max_zone_append_sectors;
unsigned int discard_granularity;
unsigned int discard_alignment;
@@ -757,6 +758,9 @@ static inline bool rq_mergeable(struct request *rq)
if (req_op(rq) == REQ_OP_WRITE_ZEROES)
return false;
+ if (req_op(rq) == REQ_OP_ZONE_APPEND)
+ return false;
+
if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
return false;
if (rq->rq_flags & RQF_NOMERGE_FLAGS)
@@ -1088,6 +1092,8 @@ extern void blk_queue_max_write_same_sectors(struct request_queue *q,
extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
unsigned int max_write_same_sectors);
extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
+extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
+ unsigned int max_zone_append_sectors);
extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
extern void blk_queue_alignment_offset(struct request_queue *q,
unsigned int alignment);
@@ -1301,6 +1307,11 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q)
return q->limits.max_segment_size;
}
+static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q)
+{
+ return q->limits.max_zone_append_sectors;
+}
+
static inline unsigned queue_logical_block_size(const struct request_queue *q)
{
int retval = 512;