Message ID | 20230627120854.971475-3-chengming.zhou@linux.dev (mailing list archive) |
---|---
State | New, archived |
Series | blk-mq: optimize the size of struct request
On Tue, Jun 27, 2023 at 08:08:52PM +0800, chengming.zhou@linux.dev wrote:
> From: Chengming Zhou <zhouchengming@bytedance.com>
>
> The flush state machine uses a double linked list to link all inflight
> flush_data requests, to avoid issuing separate post-flushes for those
> flush_data requests which shared a PREFLUSH.
>
> So we can't reuse rq->queuelist; this is why we need rq->flush.list.
>
> In preparation for the next patch, which reuses rq->queuelist for the
> flush state machine, change the double linked list to a u64 counter
> that counts all inflight flush_data requests.
>
> This is OK since we only need to know whether there is any inflight
> flush_data request, so a u64 counter is enough. The only problem I can
> think of is that the u64 counter may overflow, which should be unlikely
> to happen.

It won't overflow: q->nr_requests is 'unsigned long', which should have
been limited to a more reasonable value, such as 2 * BLK_MQ_MAX_DEPTH,
so a u16 should be big enough in theory.

>
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> ---
>  block/blk-flush.c | 9 +++++----
>  block/blk.h       | 5 ++---
>  2 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/block/blk-flush.c b/block/blk-flush.c
> index dba392cf22be..bb7adfc2a5da 100644
> --- a/block/blk-flush.c
> +++ b/block/blk-flush.c
> @@ -187,7 +187,8 @@ static void blk_flush_complete_seq(struct request *rq,
>  		break;
>
>  	case REQ_FSEQ_DATA:
> -		list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
> +		list_del_init(&rq->flush.list);
> +		fq->flush_data_in_flight++;
>  		spin_lock(&q->requeue_lock);
>  		list_add_tail(&rq->queuelist, &q->flush_list);
>  		spin_unlock(&q->requeue_lock);
> @@ -299,7 +300,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
>  		return;
>
>  	/* C2 and C3 */
> -	if (!list_empty(&fq->flush_data_in_flight) &&
> +	if (fq->flush_data_in_flight &&
>  	    time_before(jiffies,
>  			fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
>  		return;
> @@ -374,6 +375,7 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
>  	 * the comment in flush_end_io().
>  	 */
>  	spin_lock_irqsave(&fq->mq_flush_lock, flags);
> +	fq->flush_data_in_flight--;
>  	blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
>  	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
>
> @@ -445,7 +447,7 @@ bool blk_insert_flush(struct request *rq)
>  		blk_rq_init_flush(rq);
>  		rq->flush.seq |= REQ_FSEQ_POSTFLUSH;
>  		spin_lock_irq(&fq->mq_flush_lock);
> -		list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
> +		fq->flush_data_in_flight++;
>  		spin_unlock_irq(&fq->mq_flush_lock);
>  		return false;
>  	default:
> @@ -496,7 +498,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
>
>  	INIT_LIST_HEAD(&fq->flush_queue[0]);
>  	INIT_LIST_HEAD(&fq->flush_queue[1]);
> -	INIT_LIST_HEAD(&fq->flush_data_in_flight);
>
>  	return fq;
>
> diff --git a/block/blk.h b/block/blk.h
> index 608c5dcc516b..686712e13835 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -15,15 +15,14 @@ struct elevator_type;
>  extern struct dentry *blk_debugfs_root;
>
>  struct blk_flush_queue {
> +	spinlock_t		mq_flush_lock;
>  	unsigned int		flush_pending_idx:1;
>  	unsigned int		flush_running_idx:1;
>  	blk_status_t		rq_status;
>  	unsigned long		flush_pending_since;
>  	struct list_head	flush_queue[2];
> -	struct list_head	flush_data_in_flight;
> +	unsigned long		flush_data_in_flight;
>  	struct request		*flush_rq;
> -
> -	spinlock_t		mq_flush_lock;
>  };

The part replacing the inflight data rq list with a counter looks fine.

Thanks,
Ming
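The review above turns on a simple observation: the old flush_data_in_flight list was only ever tested for emptiness, so a counter protected by the same lock carries exactly the same information. Below is a minimal user-space sketch of that equivalence; the pthread mutex and the simplified struct are illustrative stand-ins, not the kernel's own types.

#include <pthread.h>
#include <stdbool.h>

struct flush_queue {
	pthread_mutex_t lock;		/* stands in for fq->mq_flush_lock */
	unsigned long data_in_flight;	/* replaces the inflight list */
};

/* REQ_FSEQ_DATA step: was list_move_tail() onto the inflight list */
static void data_rq_issued(struct flush_queue *fq)
{
	pthread_mutex_lock(&fq->lock);
	fq->data_in_flight++;
	pthread_mutex_unlock(&fq->lock);
}

/* data completion path: was removal from the inflight list */
static void data_rq_completed(struct flush_queue *fq)
{
	pthread_mutex_lock(&fq->lock);
	fq->data_in_flight--;
	pthread_mutex_unlock(&fq->lock);
}

/* blk_kick_flush() check: was !list_empty(&fq->flush_data_in_flight) */
static bool data_in_flight(struct flush_queue *fq)
{
	return fq->data_in_flight != 0;
}

int main(void)
{
	struct flush_queue fq = { PTHREAD_MUTEX_INITIALIZER, 0 };

	data_rq_issued(&fq);
	data_rq_completed(&fq);
	return data_in_flight(&fq) ? 1 : 0;	/* exits 0: nothing in flight */
}

Every increment is paired with exactly one decrement under the lock, so the counter never goes negative and its peak value is bounded by the number of requests that can be in flight at once, which is what the overflow discussion below is about.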
On 2023/6/28 12:13, Ming Lei wrote:
> On Tue, Jun 27, 2023 at 08:08:52PM +0800, chengming.zhou@linux.dev wrote:
>> From: Chengming Zhou <zhouchengming@bytedance.com>
>>
>> The flush state machine uses a double linked list to link all inflight
>> flush_data requests, to avoid issuing separate post-flushes for those
>> flush_data requests which shared a PREFLUSH.
>>
>> So we can't reuse rq->queuelist; this is why we need rq->flush.list.
>>
>> In preparation for the next patch, which reuses rq->queuelist for the
>> flush state machine, change the double linked list to a u64 counter
>> that counts all inflight flush_data requests.
>>
>> This is OK since we only need to know whether there is any inflight
>> flush_data request, so a u64 counter is enough. The only problem I can
>> think of is that the u64 counter may overflow, which should be unlikely
>> to happen.
>
> It won't overflow: q->nr_requests is 'unsigned long', which should have
> been limited to a more reasonable value, such as 2 * BLK_MQ_MAX_DEPTH,
> so a u16 should be big enough in theory.

Ah, right. q->nr_requests is 'unsigned long' and q->queue_depth is
'unsigned int', so the 'unsigned long' counter here won't overflow.

Should I change it to the smaller 'unsigned short' or just leave it as
'unsigned long'? (Right now the size of struct blk_flush_queue is
exactly 64 bytes.)

Thanks.

>
>>
>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>> ---
>>  block/blk-flush.c | 9 +++++----
>>  block/blk.h       | 5 ++---
>>  2 files changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/block/blk-flush.c b/block/blk-flush.c
>> index dba392cf22be..bb7adfc2a5da 100644
>> --- a/block/blk-flush.c
>> +++ b/block/blk-flush.c
>> @@ -187,7 +187,8 @@ static void blk_flush_complete_seq(struct request *rq,
>>  		break;
>>
>>  	case REQ_FSEQ_DATA:
>> -		list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
>> +		list_del_init(&rq->flush.list);
>> +		fq->flush_data_in_flight++;
>>  		spin_lock(&q->requeue_lock);
>>  		list_add_tail(&rq->queuelist, &q->flush_list);
>>  		spin_unlock(&q->requeue_lock);
>> @@ -299,7 +300,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
>>  		return;
>>
>>  	/* C2 and C3 */
>> -	if (!list_empty(&fq->flush_data_in_flight) &&
>> +	if (fq->flush_data_in_flight &&
>>  	    time_before(jiffies,
>>  			fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
>>  		return;
>> @@ -374,6 +375,7 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
>>  	 * the comment in flush_end_io().
>>  	 */
>>  	spin_lock_irqsave(&fq->mq_flush_lock, flags);
>> +	fq->flush_data_in_flight--;
>>  	blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
>>  	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
>>
>> @@ -445,7 +447,7 @@ bool blk_insert_flush(struct request *rq)
>>  		blk_rq_init_flush(rq);
>>  		rq->flush.seq |= REQ_FSEQ_POSTFLUSH;
>>  		spin_lock_irq(&fq->mq_flush_lock);
>> -		list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
>> +		fq->flush_data_in_flight++;
>>  		spin_unlock_irq(&fq->mq_flush_lock);
>>  		return false;
>>  	default:
>> @@ -496,7 +498,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
>>
>>  	INIT_LIST_HEAD(&fq->flush_queue[0]);
>>  	INIT_LIST_HEAD(&fq->flush_queue[1]);
>> -	INIT_LIST_HEAD(&fq->flush_data_in_flight);
>>
>>  	return fq;
>>
>> diff --git a/block/blk.h b/block/blk.h
>> index 608c5dcc516b..686712e13835 100644
>> --- a/block/blk.h
>> +++ b/block/blk.h
>> @@ -15,15 +15,14 @@ struct elevator_type;
>>  extern struct dentry *blk_debugfs_root;
>>
>>  struct blk_flush_queue {
>> +	spinlock_t		mq_flush_lock;
>>  	unsigned int		flush_pending_idx:1;
>>  	unsigned int		flush_running_idx:1;
>>  	blk_status_t		rq_status;
>>  	unsigned long		flush_pending_since;
>>  	struct list_head	flush_queue[2];
>> -	struct list_head	flush_data_in_flight;
>> +	unsigned long		flush_data_in_flight;
>>  	struct request		*flush_rq;
>> -
>> -	spinlock_t		mq_flush_lock;
>>  };
>
> The part replacing the inflight data rq list with a counter looks fine.
>
> Thanks,
> Ming
>
On Wed, Jun 28, 2023 at 12:55:49PM +0800, Chengming Zhou wrote:
> On 2023/6/28 12:13, Ming Lei wrote:
> > On Tue, Jun 27, 2023 at 08:08:52PM +0800, chengming.zhou@linux.dev wrote:
> >> From: Chengming Zhou <zhouchengming@bytedance.com>
> >>
> >> The flush state machine uses a double linked list to link all inflight
> >> flush_data requests, to avoid issuing separate post-flushes for those
> >> flush_data requests which shared a PREFLUSH.
> >>
> >> So we can't reuse rq->queuelist; this is why we need rq->flush.list.
> >>
> >> In preparation for the next patch, which reuses rq->queuelist for the
> >> flush state machine, change the double linked list to a u64 counter
> >> that counts all inflight flush_data requests.
> >>
> >> This is OK since we only need to know whether there is any inflight
> >> flush_data request, so a u64 counter is enough. The only problem I can
> >> think of is that the u64 counter may overflow, which should be unlikely
> >> to happen.
> >
> > It won't overflow: q->nr_requests is 'unsigned long', which should have
> > been limited to a more reasonable value, such as 2 * BLK_MQ_MAX_DEPTH,
> > so a u16 should be big enough in theory.
>
> Ah, right. q->nr_requests is 'unsigned long' and q->queue_depth is
> 'unsigned int', so the 'unsigned long' counter here won't overflow.

Unlike q->nr_requests, q->queue_depth usually means the whole queue's
depth, which may cover all hw queues' depth. And it is only used by
scsi, but it should be held in 'unsigned short' too.

>
> Should I change it to the smaller 'unsigned short' or just leave it as
> 'unsigned long'? (Right now the size of struct blk_flush_queue is
> exactly 64 bytes.)

You have to limit q->nr_requests first, which may need a bit more work
to avoid compile warnings and the like. And 64k is big enough for
holding per-queue scheduler requests.

Once that is done, it is fine to define this counter as 'unsigned short'.

Thanks,
Ming
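To make Ming's numbers concrete: BLK_MQ_MAX_DEPTH is 10240 in include/linux/blk-mq.h, so a cap of 2 * BLK_MQ_MAX_DEPTH is 20480, well inside the 65535 range of a u16. A sketch of such a clamp is below; the helper name and its call site are assumptions for illustration, not existing kernel code, and a real change would also have to touch every place that reads or writes q->nr_requests.

/* Hypothetical clamp, not actual kernel code. */
#define BLK_MQ_MAX_DEPTH	10240	/* value from include/linux/blk-mq.h */

static unsigned short clamp_nr_requests(unsigned long nr)
{
	if (nr > 2 * BLK_MQ_MAX_DEPTH)
		nr = 2 * BLK_MQ_MAX_DEPTH;	/* 20480 fits in a u16 */
	return (unsigned short)nr;
}

With q->nr_requests bounded this way, an 'unsigned short' flush_data_in_flight counter cannot overflow; the auditing that the bound requires is the "bit more work" Ming refers to.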
On 2023/6/28 15:22, Ming Lei wrote:
> On Wed, Jun 28, 2023 at 12:55:49PM +0800, Chengming Zhou wrote:
>> On 2023/6/28 12:13, Ming Lei wrote:
>>> On Tue, Jun 27, 2023 at 08:08:52PM +0800, chengming.zhou@linux.dev wrote:
>>>> From: Chengming Zhou <zhouchengming@bytedance.com>
>>>>
>>>> The flush state machine uses a double linked list to link all inflight
>>>> flush_data requests, to avoid issuing separate post-flushes for those
>>>> flush_data requests which shared a PREFLUSH.
>>>>
>>>> So we can't reuse rq->queuelist; this is why we need rq->flush.list.
>>>>
>>>> In preparation for the next patch, which reuses rq->queuelist for the
>>>> flush state machine, change the double linked list to a u64 counter
>>>> that counts all inflight flush_data requests.
>>>>
>>>> This is OK since we only need to know whether there is any inflight
>>>> flush_data request, so a u64 counter is enough. The only problem I can
>>>> think of is that the u64 counter may overflow, which should be unlikely
>>>> to happen.
>>>
>>> It won't overflow: q->nr_requests is 'unsigned long', which should have
>>> been limited to a more reasonable value, such as 2 * BLK_MQ_MAX_DEPTH,
>>> so a u16 should be big enough in theory.
>>
>> Ah, right. q->nr_requests is 'unsigned long' and q->queue_depth is
>> 'unsigned int', so the 'unsigned long' counter here won't overflow.
>
> Unlike q->nr_requests, q->queue_depth usually means the whole queue's
> depth, which may cover all hw queues' depth. And it is only used by
> scsi, but it should be held in 'unsigned short' too.
>
>>
>> Should I change it to the smaller 'unsigned short' or just leave it as
>> 'unsigned long'? (Right now the size of struct blk_flush_queue is
>> exactly 64 bytes.)
>
> You have to limit q->nr_requests first, which may need a bit more work
> to avoid compile warnings and the like. And 64k is big enough for
> holding per-queue scheduler requests.
>
> Once that is done, it is fine to define this counter as 'unsigned short'.
>
OK, I looked around the related code, and it seems a bit subtle to me
for now, so I'd better just leave it as 'unsigned long' here. :)

Thanks.
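Chengming's "exactly 64 bytes" figure can be checked outside the kernel. The mock below uses stand-in types, an int for a lockdep-free spinlock_t and an unsigned char for blk_status_t, so it is an assumption-laden approximation, but on a typical x86-64 GCC build it prints 64, exactly one cache line, for the reordered layout:

#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

/* Field order matches the patched struct blk_flush_queue in block/blk.h. */
struct blk_flush_queue_mock {
	int			mq_flush_lock;	/* spinlock_t is 4 bytes without debug */
	unsigned int		flush_pending_idx:1;
	unsigned int		flush_running_idx:1;
	unsigned char		rq_status;	/* blk_status_t is a u8 */
	unsigned long		flush_pending_since;
	struct list_head	flush_queue[2];
	unsigned long		flush_data_in_flight;
	void			*flush_rq;
};

int main(void)
{
	printf("%zu\n", sizeof(struct blk_flush_queue_mock));	/* prints 64 */
	return 0;
}

Moving mq_flush_lock to the front is what makes this fit: the 4-byte lock shares an 8-byte word with the bitfields and rq_status instead of sitting alone at the tail behind padding. It also shows why shrinking the counter to 'unsigned short' would not reduce the size further; the freed bytes would only become padding before flush_rq.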
diff --git a/block/blk-flush.c b/block/blk-flush.c
index dba392cf22be..bb7adfc2a5da 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -187,7 +187,8 @@ static void blk_flush_complete_seq(struct request *rq,
 		break;
 
 	case REQ_FSEQ_DATA:
-		list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
+		list_del_init(&rq->flush.list);
+		fq->flush_data_in_flight++;
 		spin_lock(&q->requeue_lock);
 		list_add_tail(&rq->queuelist, &q->flush_list);
 		spin_unlock(&q->requeue_lock);
@@ -299,7 +300,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 		return;
 
 	/* C2 and C3 */
-	if (!list_empty(&fq->flush_data_in_flight) &&
+	if (fq->flush_data_in_flight &&
 	    time_before(jiffies,
 			fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
 		return;
@@ -374,6 +375,7 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
 	 * the comment in flush_end_io().
 	 */
 	spin_lock_irqsave(&fq->mq_flush_lock, flags);
+	fq->flush_data_in_flight--;
 	blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
 	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
 
@@ -445,7 +447,7 @@ bool blk_insert_flush(struct request *rq)
 		blk_rq_init_flush(rq);
 		rq->flush.seq |= REQ_FSEQ_POSTFLUSH;
 		spin_lock_irq(&fq->mq_flush_lock);
-		list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
+		fq->flush_data_in_flight++;
 		spin_unlock_irq(&fq->mq_flush_lock);
 		return false;
 	default:
@@ -496,7 +498,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
 
 	INIT_LIST_HEAD(&fq->flush_queue[0]);
 	INIT_LIST_HEAD(&fq->flush_queue[1]);
-	INIT_LIST_HEAD(&fq->flush_data_in_flight);
 
 	return fq;
 
diff --git a/block/blk.h b/block/blk.h
index 608c5dcc516b..686712e13835 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -15,15 +15,14 @@ struct elevator_type;
 extern struct dentry *blk_debugfs_root;
 
 struct blk_flush_queue {
+	spinlock_t		mq_flush_lock;
 	unsigned int		flush_pending_idx:1;
 	unsigned int		flush_running_idx:1;
 	blk_status_t		rq_status;
 	unsigned long		flush_pending_since;
 	struct list_head	flush_queue[2];
-	struct list_head	flush_data_in_flight;
+	unsigned long		flush_data_in_flight;
 	struct request		*flush_rq;
-
-	spinlock_t		mq_flush_lock;
 };
 
 bool is_flush_rq(struct request *req);