Message ID | 20240520102033.9361-13-nj.shetty@samsung.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | [v20,01/12] block: Introduce queue limits and sysfs for copy-offload support | expand |
On 5/20/24 03:20, Nitesh Shetty wrote: > + if (blk_rq_nr_phys_segments(req) != BLK_COPY_MAX_SEGMENTS) > + return status; Why is this check necessary? > + /* > + * First bio contains information about destination and last bio > + * contains information about source. > + */ Please check this at runtime (WARN_ON_ONCE()?). > + __rq_for_each_bio(bio, req) { > + if (seg == blk_rq_nr_phys_segments(req)) { > + sector_in = bio->bi_iter.bi_sector; > + if (rem != bio->bi_iter.bi_size) > + return status; > + } else { > + sector_out = bio->bi_iter.bi_sector; > + rem = bio->bi_iter.bi_size; > + } > + seg++; > + } _rq_for_each_bio() iterates over the bios in a request. Does a copy offload request always have two bios - one copy destination bio and one copy source bio? If so, is 'seg' a bio counter? Why is that bio counter compared with the number of physical segments in the request? > + trace_nullb_copy_op(req, sector_out << SECTOR_SHIFT, > + sector_in << SECTOR_SHIFT, rem); > + > + spin_lock_irq(&nullb->lock); > + while (rem > 0) { > + chunk = min_t(size_t, nullb->dev->blocksize, rem); > + offset_in = (sector_in & SECTOR_MASK) << SECTOR_SHIFT; > + offset_out = (sector_out & SECTOR_MASK) << SECTOR_SHIFT; > + > + if (null_cache_active(nullb) && !is_fua) > + null_make_cache_space(nullb, PAGE_SIZE); > + > + t_page_in = null_lookup_page(nullb, sector_in, false, > + !null_cache_active(nullb)); > + if (!t_page_in) > + goto err; > + t_page_out = null_insert_page(nullb, sector_out, > + !null_cache_active(nullb) || > + is_fua); > + if (!t_page_out) > + goto err; > + > + in = kmap_local_page(t_page_in->page); > + out = kmap_local_page(t_page_out->page); > + > + memcpy(out + offset_out, in + offset_in, chunk); > + kunmap_local(out); > + kunmap_local(in); > + __set_bit(sector_out & SECTOR_MASK, t_page_out->bitmap); > + > + if (is_fua) > + null_free_sector(nullb, sector_out, true); > + > + rem -= chunk; > + sector_in += chunk >> SECTOR_SHIFT; > + sector_out += chunk >> SECTOR_SHIFT; > + } > + 
> + status = 0; > +err: > + spin_unlock_irq(&nullb->lock); In the worst case, how long does this loop disable interrupts? > +TRACE_EVENT(nullb_copy_op, > + TP_PROTO(struct request *req, > + sector_t dst, sector_t src, size_t len), > + TP_ARGS(req, dst, src, len), > + TP_STRUCT__entry( > + __array(char, disk, DISK_NAME_LEN) > + __field(enum req_op, op) > + __field(sector_t, dst) > + __field(sector_t, src) > + __field(size_t, len) > + ), Isn't __string() preferred over __array() since the former occupies less space in the trace buffer? Thanks, Bart.
On 20/05/24 04:42PM, Bart Van Assche wrote: >On 5/20/24 03:20, Nitesh Shetty wrote: >>+ if (blk_rq_nr_phys_segments(req) != BLK_COPY_MAX_SEGMENTS) >>+ return status; > >Why is this check necessary? > >>+ /* >>+ * First bio contains information about destination and last bio >>+ * contains information about source. >>+ */ > >Please check this at runtime (WARN_ON_ONCE()?). > >>+ __rq_for_each_bio(bio, req) { >>+ if (seg == blk_rq_nr_phys_segments(req)) { >>+ sector_in = bio->bi_iter.bi_sector; >>+ if (rem != bio->bi_iter.bi_size) >>+ return status; >>+ } else { >>+ sector_out = bio->bi_iter.bi_sector; >>+ rem = bio->bi_iter.bi_size; >>+ } >>+ seg++; >>+ } > >_rq_for_each_bio() iterates over the bios in a request. Does a copy >offload request always have two bios - one copy destination bio and >one copy source bio? If so, is 'seg' a bio counter? Why is that bio >counter compared with the number of physical segments in the request? > Yes, your observation is right. We are treating first bio as dst and second as src. If not for that comparison, we might need to store the index in a temporary variable and parse based on index value. 
>>+ trace_nullb_copy_op(req, sector_out << SECTOR_SHIFT, >>+ sector_in << SECTOR_SHIFT, rem); >>+ >>+ spin_lock_irq(&nullb->lock); >>+ while (rem > 0) { >>+ chunk = min_t(size_t, nullb->dev->blocksize, rem); >>+ offset_in = (sector_in & SECTOR_MASK) << SECTOR_SHIFT; >>+ offset_out = (sector_out & SECTOR_MASK) << SECTOR_SHIFT; >>+ >>+ if (null_cache_active(nullb) && !is_fua) >>+ null_make_cache_space(nullb, PAGE_SIZE); >>+ >>+ t_page_in = null_lookup_page(nullb, sector_in, false, >>+ !null_cache_active(nullb)); >>+ if (!t_page_in) >>+ goto err; >>+ t_page_out = null_insert_page(nullb, sector_out, >>+ !null_cache_active(nullb) || >>+ is_fua); >>+ if (!t_page_out) >>+ goto err; >>+ >>+ in = kmap_local_page(t_page_in->page); >>+ out = kmap_local_page(t_page_out->page); >>+ >>+ memcpy(out + offset_out, in + offset_in, chunk); >>+ kunmap_local(out); >>+ kunmap_local(in); >>+ __set_bit(sector_out & SECTOR_MASK, t_page_out->bitmap); >>+ >>+ if (is_fua) >>+ null_free_sector(nullb, sector_out, true); >>+ >>+ rem -= chunk; >>+ sector_in += chunk >> SECTOR_SHIFT; >>+ sector_out += chunk >> SECTOR_SHIFT; >>+ } >>+ >>+ status = 0; >>+err: >>+ spin_unlock_irq(&nullb->lock); > >In the worst case, how long does this loop disable interrupts? > We haven't measured this. But this should be similar to read and write in the present infra, as we followed a similar approach. >>+TRACE_EVENT(nullb_copy_op, >>+ TP_PROTO(struct request *req, >>+ sector_t dst, sector_t src, size_t len), >>+ TP_ARGS(req, dst, src, len), >>+ TP_STRUCT__entry( >>+ __array(char, disk, DISK_NAME_LEN) >>+ __field(enum req_op, op) >>+ __field(sector_t, dst) >>+ __field(sector_t, src) >>+ __field(size_t, len) >>+ ), > >Isn't __string() preferred over __array() since the former occupies less space >in the trace buffer? > Again we followed the present existing implementation, to have a simpler series to review. Thank you, Nitesh Shetty
On 5/21/24 07:46, Nitesh Shetty wrote: > On 20/05/24 04:42PM, Bart Van Assche wrote: >> On 5/20/24 03:20, Nitesh Shetty wrote: >>> + __rq_for_each_bio(bio, req) { >>> + if (seg == blk_rq_nr_phys_segments(req)) { >>> + sector_in = bio->bi_iter.bi_sector; >>> + if (rem != bio->bi_iter.bi_size) >>> + return status; >>> + } else { >>> + sector_out = bio->bi_iter.bi_sector; >>> + rem = bio->bi_iter.bi_size; >>> + } >>> + seg++; >>> + } >> >> _rq_for_each_bio() iterates over the bios in a request. Does a copy >> offload request always have two bios - one copy destination bio and >> one copy source bio? If so, is 'seg' a bio counter? Why is that bio >> counter compared with the number of physical segments in the request? >> > Yes, your observation is right. We are treating first bio as dst and > second as src. If not for that comparision, we might need to store the > index in a temporary variable and parse based on index value. I'm still wondering why 'seg' is compared with blk_rq_nr_phys_segments(req). Thanks, Bart.
On 22/05/24 10:52AM, Bart Van Assche wrote: >On 5/21/24 07:46, Nitesh Shetty wrote: >>On 20/05/24 04:42PM, Bart Van Assche wrote: >>>On 5/20/24 03:20, Nitesh Shetty wrote: >>>>+ __rq_for_each_bio(bio, req) { >>>>+ if (seg == blk_rq_nr_phys_segments(req)) { >>>>+ sector_in = bio->bi_iter.bi_sector; >>>>+ if (rem != bio->bi_iter.bi_size) >>>>+ return status; >>>>+ } else { >>>>+ sector_out = bio->bi_iter.bi_sector; >>>>+ rem = bio->bi_iter.bi_size; >>>>+ } >>>>+ seg++; >>>>+ } >>> >>>_rq_for_each_bio() iterates over the bios in a request. Does a copy >>>offload request always have two bios - one copy destination bio and >>>one copy source bio? If so, is 'seg' a bio counter? Why is that bio >>>counter compared with the number of physical segments in the request? >>> >>Yes, your observation is right. We are treating first bio as dst and >>second as src. If not for that comparision, we might need to store the >>index in a temporary variable and parse based on index value. > >I'm still wondering why 'seg' is compared with blk_rq_nr_phys_segments(req). > In this case blk_rq_nr_phys_segments is used as the counter value (==2), which tells us it's a src IO. But using a macro instead of this comparison will avoid this confusion. We will change this in next version to make it explicit. Thank you, Nitesh Shetty
diff --git a/Documentation/block/null_blk.rst b/Documentation/block/null_blk.rst index 4dd78f24d10a..6153e02fcf13 100644 --- a/Documentation/block/null_blk.rst +++ b/Documentation/block/null_blk.rst @@ -149,3 +149,8 @@ zone_size=[MB]: Default: 256 zone_nr_conv=[nr_conv]: Default: 0 The number of conventional zones to create when block device is zoned. If zone_nr_conv >= nr_zones, it will be reduced to nr_zones - 1. + +copy_max_bytes=[size in bytes]: Default: COPY_MAX_BYTES + A module and configfs parameter which can be used to set hardware/driver + supported maximum copy offload limit. + COPY_MAX_BYTES(=128MB at present) is defined in fs.h diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index b33b9ebfebd2..dcfbd5275414 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -172,6 +172,10 @@ static int g_max_sectors; module_param_named(max_sectors, g_max_sectors, int, 0444); MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)"); +static unsigned long g_copy_max_bytes = BLK_COPY_MAX_BYTES; +module_param_named(copy_max_bytes, g_copy_max_bytes, ulong, 0444); +MODULE_PARM_DESC(copy_max_bytes, "Maximum size of a copy command (in bytes)"); + static unsigned int nr_devices = 1; module_param(nr_devices, uint, 0444); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); @@ -433,6 +437,7 @@ NULLB_DEVICE_ATTR(home_node, uint, NULL); NULLB_DEVICE_ATTR(queue_mode, uint, NULL); NULLB_DEVICE_ATTR(blocksize, uint, NULL); NULLB_DEVICE_ATTR(max_sectors, uint, NULL); +NULLB_DEVICE_ATTR(copy_max_bytes, uint, NULL); NULLB_DEVICE_ATTR(irqmode, uint, NULL); NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); NULLB_DEVICE_ATTR(index, uint, NULL); @@ -577,6 +582,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_queue_mode, &nullb_device_attr_blocksize, &nullb_device_attr_max_sectors, + &nullb_device_attr_copy_max_bytes, &nullb_device_attr_irqmode, 
&nullb_device_attr_hw_queue_depth, &nullb_device_attr_index, @@ -687,7 +693,7 @@ static ssize_t memb_group_features_show(struct config_item *item, char *page) "shared_tags,size,submit_queues,use_per_node_hctx," "virt_boundary,zoned,zone_capacity,zone_max_active," "zone_max_open,zone_nr_conv,zone_offline,zone_readonly," - "zone_size,zone_append_max_sectors\n"); + "zone_size,zone_append_max_sectors,copy_max_bytes\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -753,6 +759,7 @@ static struct nullb_device *null_alloc_dev(void) dev->queue_mode = g_queue_mode; dev->blocksize = g_bs; dev->max_sectors = g_max_sectors; + dev->copy_max_bytes = g_copy_max_bytes; dev->irqmode = g_irqmode; dev->hw_queue_depth = g_hw_queue_depth; dev->blocking = g_blocking; @@ -1221,6 +1228,81 @@ static int null_transfer(struct nullb *nullb, struct page *page, return err; } +static inline int nullb_setup_copy(struct nullb *nullb, struct request *req, + bool is_fua) +{ + sector_t sector_in = 0, sector_out = 0; + loff_t offset_in, offset_out; + void *in, *out; + ssize_t chunk, rem = 0; + struct bio *bio; + struct nullb_page *t_page_in, *t_page_out; + u16 seg = 1; + int status = -EIO; + + if (blk_rq_nr_phys_segments(req) != BLK_COPY_MAX_SEGMENTS) + return status; + + /* + * First bio contains information about destination and last bio + * contains information about source. 
+ */ + __rq_for_each_bio(bio, req) { + if (seg == blk_rq_nr_phys_segments(req)) { + sector_in = bio->bi_iter.bi_sector; + if (rem != bio->bi_iter.bi_size) + return status; + } else { + sector_out = bio->bi_iter.bi_sector; + rem = bio->bi_iter.bi_size; + } + seg++; + } + + trace_nullb_copy_op(req, sector_out << SECTOR_SHIFT, + sector_in << SECTOR_SHIFT, rem); + + spin_lock_irq(&nullb->lock); + while (rem > 0) { + chunk = min_t(size_t, nullb->dev->blocksize, rem); + offset_in = (sector_in & SECTOR_MASK) << SECTOR_SHIFT; + offset_out = (sector_out & SECTOR_MASK) << SECTOR_SHIFT; + + if (null_cache_active(nullb) && !is_fua) + null_make_cache_space(nullb, PAGE_SIZE); + + t_page_in = null_lookup_page(nullb, sector_in, false, + !null_cache_active(nullb)); + if (!t_page_in) + goto err; + t_page_out = null_insert_page(nullb, sector_out, + !null_cache_active(nullb) || + is_fua); + if (!t_page_out) + goto err; + + in = kmap_local_page(t_page_in->page); + out = kmap_local_page(t_page_out->page); + + memcpy(out + offset_out, in + offset_in, chunk); + kunmap_local(out); + kunmap_local(in); + __set_bit(sector_out & SECTOR_MASK, t_page_out->bitmap); + + if (is_fua) + null_free_sector(nullb, sector_out, true); + + rem -= chunk; + sector_in += chunk >> SECTOR_SHIFT; + sector_out += chunk >> SECTOR_SHIFT; + } + + status = 0; +err: + spin_unlock_irq(&nullb->lock); + return status; +} + static blk_status_t null_handle_rq(struct nullb_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); @@ -1230,13 +1312,16 @@ static blk_status_t null_handle_rq(struct nullb_cmd *cmd) sector_t sector = blk_rq_pos(rq); struct req_iterator iter; struct bio_vec bvec; + bool fua = rq->cmd_flags & REQ_FUA; + + if (op_is_copy(req_op(rq))) + return nullb_setup_copy(nullb, rq, fua); spin_lock_irq(&nullb->lock); rq_for_each_segment(bvec, rq, iter) { len = bvec.bv_len; err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, - op_is_write(req_op(rq)), sector, - rq->cmd_flags & REQ_FUA); + 
op_is_write(req_op(rq)), sector, fua); if (err) break; sector += len >> SECTOR_SHIFT; @@ -1721,6 +1806,12 @@ static void null_config_discard(struct nullb *nullb, struct queue_limits *lim) lim->max_hw_discard_sectors = UINT_MAX >> 9; } +static void null_config_copy(struct nullb *nullb, struct queue_limits *lim) +{ + lim->max_copy_hw_sectors = nullb->dev->copy_max_bytes >> SECTOR_SHIFT; + lim->max_copy_sectors = nullb->dev->copy_max_bytes >> SECTOR_SHIFT; +} + static const struct block_device_operations null_ops = { .owner = THIS_MODULE, .report_zones = null_report_zones, @@ -1843,6 +1934,9 @@ static int null_validate_conf(struct nullb_device *dev) return -EINVAL; } + if (dev->queue_mode == NULL_Q_BIO) + dev->copy_max_bytes = 0; + return 0; } @@ -1909,6 +2003,8 @@ static int null_add_dev(struct nullb_device *dev) if (dev->virt_boundary) lim.virt_boundary_mask = PAGE_SIZE - 1; null_config_discard(nullb, &lim); + null_config_copy(nullb, &lim); + if (dev->zoned) { rv = null_init_zoned_dev(dev, &lim); if (rv) diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 3234e6c85eed..c588729c17bd 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -91,6 +91,7 @@ struct nullb_device { unsigned int queue_mode; /* block interface */ unsigned int blocksize; /* block size */ unsigned int max_sectors; /* Max sectors per command */ + unsigned long copy_max_bytes; /* Max copy offload length in bytes */ unsigned int irqmode; /* IRQ completion handler */ unsigned int hw_queue_depth; /* queue depth */ unsigned int index; /* index of the disk, only valid with a disk */ diff --git a/drivers/block/null_blk/trace.h b/drivers/block/null_blk/trace.h index f9eadac6b22f..cda1a2249978 100644 --- a/drivers/block/null_blk/trace.h +++ b/drivers/block/null_blk/trace.h @@ -76,6 +76,29 @@ TRACE_EVENT(nullb_report_zones, ); #endif /* CONFIG_BLK_DEV_ZONED */ +TRACE_EVENT(nullb_copy_op, + TP_PROTO(struct request *req, + sector_t dst, 
sector_t src, size_t len), + TP_ARGS(req, dst, src, len), + TP_STRUCT__entry( + __array(char, disk, DISK_NAME_LEN) + __field(enum req_op, op) + __field(sector_t, dst) + __field(sector_t, src) + __field(size_t, len) + ), + TP_fast_assign( + __entry->op = req_op(req); + __assign_disk_name(__entry->disk, req->q->disk); + __entry->dst = dst; + __entry->src = src; + __entry->len = len; + ), + TP_printk("%s req=%-15s: dst=%llu, src=%llu, len=%lu", + __print_disk_name(__entry->disk), + blk_op_str(__entry->op), + __entry->dst, __entry->src, __entry->len) +); #endif /* _TRACE_NULLB_H */ #undef TRACE_INCLUDE_PATH