Message ID | 20230628124546.1056698-2-chengming.zhou@linux.dev (mailing list archive)
---|---
State | New, archived
Series | blk-mq: fix start_time_ns and alloc_time_ns for pre-allocated rq
On Wed, Jun 28, 2023 at 08:45:44PM +0800, chengming.zhou@linux.dev wrote:
> After these cleanup, __blk_mq_alloc_requests() is the only entry to
> alloc and init rq.

I find the code a little hard to follow now, due to the optional
setting of the ctx.  We also introduce really odd behavior here
if the caller for a hctx-specific allocation doesn't have free
tags, as we'll now run into the normal retry path.

Is this really needed for your timestamp changes?  If not I'd prefer
to skip it.
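(For context, the "normal retry path" referred to here is the tag-allocation failure branch of __blk_mq_alloc_requests(). A rough sketch of the relevant lines, not a complete function:)

```
	/*
	 * Sketch of the tag-allocation failure branch in
	 * __blk_mq_alloc_requests(): callers that pass BLK_MQ_REQ_NOWAIT
	 * (blk_mq_alloc_request_hctx() always does) bail out here instead
	 * of sleeping and retrying.
	 */
	tag = blk_mq_get_tag(data);
	if (tag == BLK_MQ_NO_TAG) {
		if (data->flags & BLK_MQ_REQ_NOWAIT)
			return NULL;
		/* otherwise sleep briefly and retry the allocation */
		msleep(3);
		goto retry;
	}
```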
On 2023/6/29 13:28, Christoph Hellwig wrote:
> On Wed, Jun 28, 2023 at 08:45:44PM +0800, chengming.zhou@linux.dev wrote:
>> After these cleanup, __blk_mq_alloc_requests() is the only entry to
>> alloc and init rq.
>
> I find the code a little hard to follow now, due to the optional
> setting of the ctx.  We also introduce really odd behavior here
> if the caller for a hctx-specific allocation doesn't have free
> tags, as we'll now run into the normal retry path.
>
> Is this really needed for your timestamp changes?  If not I'd prefer
> to skip it.
>

Thanks for your review!

Since the hctx-specific allocation path always has the BLK_MQ_REQ_NOWAIT
flag, it won't retry.

But I agree, this makes the general __blk_mq_alloc_requests() more complex.

The reason is that blk_mq_rq_ctx_init() does some data->rq_flags
initialization:

```
if (data->flags & BLK_MQ_REQ_PM)
	data->rq_flags |= RQF_PM;
if (blk_queue_io_stat(q))
	data->rq_flags |= RQF_IO_STAT;
rq->rq_flags = data->rq_flags;
```

Because we need this data->rq_flags to tell whether we need start_time_ns,
we have to move this initialization into the callers of blk_mq_rq_ctx_init().

We basically have two callers: the first is the general
__blk_mq_alloc_requests(), and the second is the special
blk_mq_alloc_request_hctx(). So I changed the second caller to reuse the
first, __blk_mq_alloc_requests().

Or should we put this data->rq_flags initialization in
blk_mq_alloc_request_hctx() too?

Thanks.
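(For comparison, the alternative floated at the end of this mail — initializing data->rq_flags directly in blk_mq_alloc_request_hctx() instead of reusing __blk_mq_alloc_requests() — would look roughly like the sketch below. This is only an illustration of the idea, not a tested patch.)

```
	/*
	 * Sketch only: mirror the rq_flags setup of __blk_mq_alloc_requests()
	 * in blk_mq_alloc_request_hctx(), so that blk_mq_rq_ctx_init()
	 * already sees the final data->rq_flags when it decides whether
	 * start_time_ns is needed.
	 */
	if (q->elevator)
		data.rq_flags |= RQF_SCHED_TAGS;
	else
		blk_mq_tag_busy(data.hctx);

	if (flags & BLK_MQ_REQ_RESERVED)
		data.rq_flags |= RQF_RESV;
	if (flags & BLK_MQ_REQ_PM)
		data.rq_flags |= RQF_PM;
	if (blk_queue_io_stat(q))
		data.rq_flags |= RQF_IO_STAT;

	/* ... then allocate the tag and call blk_mq_rq_ctx_init() as before */
```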
On Thu, Jun 29, 2023 at 03:40:03PM +0800, Chengming Zhou wrote:
> Thanks for your review!
>
> Since hctx-specific allocation path always has BLK_MQ_REQ_NOWAIT flag,
> it won't retry.
>
> But I agree, this makes the general __blk_mq_alloc_requests() more complex.

And also very confusing as it pretends to share some code, while almost
nothing of __blk_mq_alloc_requests is actually used.

> The reason is blk_mq_rq_ctx_init() has some data->rq_flags initialization:
>
> ```
> if (data->flags & BLK_MQ_REQ_PM)
> 	data->rq_flags |= RQF_PM;
> if (blk_queue_io_stat(q))
> 	data->rq_flags |= RQF_IO_STAT;
> rq->rq_flags = data->rq_flags;
> ```
>
> Because we need this data->rq_flags to tell if we need start_time_ns,
> we need to put these initialization in the callers of blk_mq_rq_ctx_init().

Why can't we just always initialize the time stamps after
blk_mq_rq_ctx_init?  Something like this (untested) variant of your
patch 2 from the latest iteration:

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5504719b970d59..55bf1009f3e32a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -328,8 +328,26 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 }
 EXPORT_SYMBOL(blk_rq_init);
 
+/* Set alloc and start time when pre-allocated rq is actually used */
+static inline void blk_mq_rq_time_init(struct request *rq, bool set_alloc_time)
+{
+	if (blk_mq_need_time_stamp(rq)) {
+		u64 now = ktime_get_ns();
+
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
+		/*
+		 * The alloc time is only used by iocost for now,
+		 * only possible when blk_mq_need_time_stamp().
+		 */
+		if (set_alloc_time)
+			rq->alloc_time_ns = now;
+#endif
+		rq->start_time_ns = now;
+	}
+}
+
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
-		struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns)
+		struct blk_mq_tags *tags, unsigned int tag)
 {
 	struct blk_mq_ctx *ctx = data->ctx;
 	struct blk_mq_hw_ctx *hctx = data->hctx;
@@ -356,14 +374,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	}
 	rq->timeout = 0;
 
-	if (blk_mq_need_time_stamp(rq))
-		rq->start_time_ns = ktime_get_ns();
-	else
-		rq->start_time_ns = 0;
 	rq->part = NULL;
-#ifdef CONFIG_BLK_RQ_ALLOC_TIME
-	rq->alloc_time_ns = alloc_time_ns;
-#endif
 	rq->io_start_time_ns = 0;
 	rq->stats_sectors = 0;
 	rq->nr_phys_segments = 0;
@@ -393,8 +404,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 }
 
 static inline struct request *
-__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
-		u64 alloc_time_ns)
+__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
 {
 	unsigned int tag, tag_offset;
 	struct blk_mq_tags *tags;
@@ -413,7 +423,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
 		tag = tag_offset + i;
 		prefetch(tags->static_rqs[tag]);
 		tag_mask &= ~(1UL << i);
-		rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns);
+		rq = blk_mq_rq_ctx_init(data, tags, tag);
 		rq_list_add(data->cached_rq, rq);
 		nr++;
 	}
@@ -427,12 +437,13 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
 static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 {
 	struct request_queue *q = data->q;
+	bool set_alloc_time = blk_queue_rq_alloc_time(q);
 	u64 alloc_time_ns = 0;
 	struct request *rq;
 	unsigned int tag;
 
 	/* alloc_time includes depth and tag waits */
-	if (blk_queue_rq_alloc_time(q))
+	if (set_alloc_time)
 		alloc_time_ns = ktime_get_ns();
 
 	if (data->cmd_flags & REQ_NOWAIT)
@@ -474,9 +485,11 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 	 * Try batched alloc if we want more than 1 tag.
 	 */
 	if (data->nr_tags > 1) {
-		rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns);
-		if (rq)
+		rq = __blk_mq_alloc_requests_batch(data);
+		if (rq) {
+			blk_mq_rq_time_init(rq, true);
 			return rq;
+		}
 		data->nr_tags = 1;
 	}
 
@@ -499,8 +512,10 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 		goto retry;
 	}
 
-	return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag,
-					alloc_time_ns);
+	rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag);
+	if (rq)
+		blk_mq_rq_time_init(rq, set_alloc_time);
+	return rq;
 }
 
 static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
@@ -555,6 +570,7 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
 			return NULL;
 
 		plug->cached_rq = rq_list_next(rq);
+		blk_mq_rq_time_init(rq, blk_queue_rq_alloc_time(rq->q));
 	}
 
 	rq->cmd_flags = opf;
@@ -656,8 +672,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	tag = blk_mq_get_tag(&data);
 	if (tag == BLK_MQ_NO_TAG)
 		goto out_queue_exit;
-	rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag,
-					alloc_time_ns);
+	rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
+	blk_mq_rq_time_init(rq, blk_queue_rq_alloc_time(rq->q));
 	rq->__data_len = 0;
 	rq->__sector = (sector_t) -1;
 	rq->bio = rq->biotail = NULL;
@@ -2896,6 +2912,7 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
 		plug->cached_rq = rq_list_next(rq);
 		rq_qos_throttle(q, *bio);
 
+		blk_mq_rq_time_init(rq, blk_queue_rq_alloc_time(rq->q));
 		rq->cmd_flags = (*bio)->bi_opf;
 		INIT_LIST_HEAD(&rq->queuelist);
 		return rq;
On 2023/7/10 15:36, Christoph Hellwig wrote:
> On Thu, Jun 29, 2023 at 03:40:03PM +0800, Chengming Zhou wrote:
>> Thanks for your review!
>>
>> Since hctx-specific allocation path always has BLK_MQ_REQ_NOWAIT flag,
>> it won't retry.
>>
>> But I agree, this makes the general __blk_mq_alloc_requests() more complex.
>
> And also very confusing as it pretends to share some code, while almost
> nothing of __blk_mq_alloc_requests is actually used.

You are right. I will not mess with reusing __blk_mq_alloc_requests()
in the next version.

>
>> The reason is blk_mq_rq_ctx_init() has some data->rq_flags initialization:
>>
>> ```
>> if (data->flags & BLK_MQ_REQ_PM)
>> 	data->rq_flags |= RQF_PM;
>> if (blk_queue_io_stat(q))
>> 	data->rq_flags |= RQF_IO_STAT;
>> rq->rq_flags = data->rq_flags;
>> ```
>>
>> Because we need this data->rq_flags to tell if we need start_time_ns,
>> we need to put these initialization in the callers of blk_mq_rq_ctx_init().
>
> Why can't we just always initialize the time stamps after
> blk_mq_rq_ctx_init?  Something like this (untested) variant of your
> patch 2 from the latest iteration:

I get what you mean: always initialize the two time stamps after
blk_mq_rq_ctx_init(), so we know whether the time stamps are needed in
blk_mq_rq_time_init().

It seems better and clearer indeed, I will try to change it as you suggest.

> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 5504719b970d59..55bf1009f3e32a 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -328,8 +328,26 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
>  }
>  EXPORT_SYMBOL(blk_rq_init);
>  
> +/* Set alloc and start time when pre-allocated rq is actually used */
> +static inline void blk_mq_rq_time_init(struct request *rq, bool set_alloc_time)

We need to pass "u64 alloc_time_ns" here, which by definition includes
the depth and tag wait time:

1. for a non-batched request that needs alloc_time_ns: the passed
   alloc_time_ns != 0

2. for a batched request that needs alloc_time_ns: the passed
   alloc_time_ns == 0, so it will be set to start_time_ns

I have just updated the patch:
https://lore.kernel.org/all/20230710105516.2053478-1-chengming.zhou@linux.dev/

Thanks!
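(For illustration, the alloc_time_ns-based variant described above could look roughly like the sketch below. This is an approximation of the idea only; see the linked update for the actual patch.)

```
/*
 * Sketch only: pass alloc_time_ns (which includes depth/tag wait time)
 * instead of a bool.  Non-batched requests pass a non-zero value taken
 * before the tag allocation; batched/pre-allocated requests pass 0 and
 * fall back to the moment they are actually used.
 */
static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
{
	if (blk_mq_need_time_stamp(rq))
		rq->start_time_ns = ktime_get_ns();
	else
		rq->start_time_ns = 0;

#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	if (blk_queue_rq_alloc_time(rq->q))
		rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns;
	else
		rq->alloc_time_ns = 0;
#endif
}
```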
diff --git a/block/blk-mq.c b/block/blk-mq.c
index decb6ab2d508..c50ef953759f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -349,11 +349,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	rq->mq_ctx = ctx;
 	rq->mq_hctx = hctx;
 	rq->cmd_flags = data->cmd_flags;
-
-	if (data->flags & BLK_MQ_REQ_PM)
-		data->rq_flags |= RQF_PM;
-	if (blk_queue_io_stat(q))
-		data->rq_flags |= RQF_IO_STAT;
 	rq->rq_flags = data->rq_flags;
 
 	if (data->rq_flags & RQF_SCHED_TAGS) {
@@ -447,6 +442,15 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 	if (data->cmd_flags & REQ_NOWAIT)
 		data->flags |= BLK_MQ_REQ_NOWAIT;
 
+	if (data->flags & BLK_MQ_REQ_RESERVED)
+		data->rq_flags |= RQF_RESV;
+
+	if (data->flags & BLK_MQ_REQ_PM)
+		data->rq_flags |= RQF_PM;
+
+	if (blk_queue_io_stat(q))
+		data->rq_flags |= RQF_IO_STAT;
+
 	if (q->elevator) {
 		/*
 		 * All requests use scheduler tags when an I/O scheduler is
@@ -471,14 +475,15 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 	}
 
 retry:
-	data->ctx = blk_mq_get_ctx(q);
-	data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
+	/* See blk_mq_alloc_request_hctx() for details */
+	if (!data->ctx) {
+		data->ctx = blk_mq_get_ctx(q);
+		data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
+	}
+
 	if (!(data->rq_flags & RQF_SCHED_TAGS))
 		blk_mq_tag_busy(data->hctx);
 
-	if (data->flags & BLK_MQ_REQ_RESERVED)
-		data->rq_flags |= RQF_RESV;
-
 	/*
 	 * Try batched alloc if we want more than 1 tag.
 	 */
@@ -505,6 +510,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 		 * is going away.
 		 */
 		msleep(3);
+		data->ctx = NULL;
 		goto retry;
 	}
 
@@ -613,16 +619,10 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 		.cmd_flags	= opf,
 		.nr_tags	= 1,
 	};
-	u64 alloc_time_ns = 0;
 	struct request *rq;
 	unsigned int cpu;
-	unsigned int tag;
 	int ret;
 
-	/* alloc_time includes depth and tag waits */
-	if (blk_queue_rq_alloc_time(q))
-		alloc_time_ns = ktime_get_ns();
-
 	/*
 	 * If the tag allocator sleeps we could get an allocation for a
 	 * different hardware context.  No need to complicate the low level
@@ -653,20 +653,10 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 		goto out_queue_exit;
 	data.ctx = __blk_mq_get_ctx(q, cpu);
 
-	if (q->elevator)
-		data.rq_flags |= RQF_SCHED_TAGS;
-	else
-		blk_mq_tag_busy(data.hctx);
-
-	if (flags & BLK_MQ_REQ_RESERVED)
-		data.rq_flags |= RQF_RESV;
-
 	ret = -EWOULDBLOCK;
-	tag = blk_mq_get_tag(&data);
-	if (tag == BLK_MQ_NO_TAG)
+	rq = __blk_mq_alloc_requests(&data);
+	if (!rq)
 		goto out_queue_exit;
-	rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag,
-					alloc_time_ns);
 	rq->__data_len = 0;
 	rq->__sector = (sector_t) -1;
 	rq->bio = rq->biotail = NULL;