| Message ID | 20211012181742.672391-3-axboe@kernel.dk (mailing list archive) |
|---|---|
| State | New, archived |
| Series | Batched completions |
On 10/12/21 11:17 AM, Jens Axboe wrote:
> +void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
> +				int *tags, int nr_tags)
> +{
> +	struct sbitmap *sb = &sbq->sb;
> +	unsigned long *addr = NULL;
> +	unsigned long mask = 0;
> +	int i;
> +
> +	smp_mb__before_atomic();
> +	for (i = 0; i < nr_tags; i++) {
> +		const int tag = tags[i] - offset;
> +		unsigned long *this_addr;
> +
> +		/* since we're clearing a batch, skip the deferred map */
> +		this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word;
> +		if (!addr) {
> +			addr = this_addr;
> +		} else if (addr != this_addr) {
> +			atomic_long_andnot(mask, (atomic_long_t *) addr);
> +			mask = 0;
> +			addr = this_addr;
> +		}
> +		mask |= (1UL << SB_NR_TO_BIT(sb, tag));
> +	}
> +
> +	if (mask)
> +		atomic_long_andnot(mask, (atomic_long_t *) addr);
> +
> +	smp_mb__after_atomic();
> +	sbitmap_queue_wake_up(sbq);
> +	sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
> +				tags[nr_tags - 1] - offset);
> +}
> +
>  void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
>  			 unsigned int cpu)
>  {

How does replacing the sbitmap_queue_clear() implementation by calling
sbitmap_queue_clear_batch() affect performance? I'm wondering whether it
is possible to prevent code duplication without affecting performance
negatively.

Thanks,

Bart.
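For illustration only, the kind of deduplication Bart is asking about could in
principle look like the sketch below, with sbitmap_queue_clear() forwarding a
single tag to the batch helper. This is not part of the posted series, and it
would change behaviour: the batch helper skips the deferred ->cleared map and
updates the alloc hint for the CPU running the completion rather than the
caller-supplied cpu, which is exactly the caveat raised in the reply that
follows.

/*
 * Hypothetical sketch, not from the posted patch: single-tag clear
 * expressed via the batch helper.  Behavioural caveats: this bypasses
 * the deferred ->cleared map and the 'cpu' argument is ignored, since
 * the batch helper hints the current CPU instead.
 */
void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
			 unsigned int cpu)
{
	int tag = nr;

	sbitmap_queue_clear_batch(sbq, 0, &tag, 1);
}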
On 10/12/21 12:29 PM, Bart Van Assche wrote:
> On 10/12/21 11:17 AM, Jens Axboe wrote:
>> +void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
>> +				int *tags, int nr_tags)
>> +{
>> +	struct sbitmap *sb = &sbq->sb;
>> +	unsigned long *addr = NULL;
>> +	unsigned long mask = 0;
>> +	int i;
>> +
>> +	smp_mb__before_atomic();
>> +	for (i = 0; i < nr_tags; i++) {
>> +		const int tag = tags[i] - offset;
>> +		unsigned long *this_addr;
>> +
>> +		/* since we're clearing a batch, skip the deferred map */
>> +		this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word;
>> +		if (!addr) {
>> +			addr = this_addr;
>> +		} else if (addr != this_addr) {
>> +			atomic_long_andnot(mask, (atomic_long_t *) addr);
>> +			mask = 0;
>> +			addr = this_addr;
>> +		}
>> +		mask |= (1UL << SB_NR_TO_BIT(sb, tag));
>> +	}
>> +
>> +	if (mask)
>> +		atomic_long_andnot(mask, (atomic_long_t *) addr);
>> +
>> +	smp_mb__after_atomic();
>> +	sbitmap_queue_wake_up(sbq);
>> +	sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
>> +				tags[nr_tags - 1] - offset);
>> +}
>> +
>>  void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
>>  			 unsigned int cpu)
>>  {
>
> How does replacing the sbitmap_queue_clear() implementation by calling
> sbitmap_queue_clear_batch() affect performance? I'm wondering whether it
> is possible to prevent code duplication without affecting performance
> negatively.

Good question. I'd rather defer that to a followup, though, if it ends up
making sense. It's not that simple: for the usual clear path we play some
tricks, deferring clears into a separate mask to avoid hitting the
cacheline repeatedly. That doesn't make sense to do for batched clears,
obviously, so the two work in slightly different ways, with the single
bit clear taking an extra step to increase efficiency.
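For context on the "deferred mask" mentioned above: in the sbitmap code of
this era the single-bit path does not touch ->word directly; it parks the bit
in a per-word ->cleared mask, which the allocator folds back into ->word only
when it runs out of free bits. Roughly, simplified from the sbitmap header of
this era and shown only to contrast with the batch path:

/*
 * Simplified illustration of the existing single-bit clear step, shown
 * here only to contrast with sbitmap_queue_clear_batch(), which writes
 * ->word directly: the bit is set in the per-word ->cleared mask and
 * folded back into ->word later, keeping the hot ->word cacheline from
 * being dirtied on every completion.
 */
static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
{
	unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared;

	set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
}

The batch helper can skip this step because coalescing all clears that land in
the same word into a single andnot already amortizes the cacheline traffic.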
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index e30b56023ead..4a6ff274335a 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -528,6 +528,17 @@ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu);
 
+/**
+ * sbitmap_queue_clear_batch() - Free a batch of allocated bits
+ * &struct sbitmap_queue.
+ * @sbq: Bitmap to free from.
+ * @offset: offset for each tag in array
+ * @tags: array of tags
+ * @nr_tags: number of tags in array
+ */
+void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
+				int *tags, int nr_tags);
+
 static inline int sbq_index_inc(int index)
 {
 	return (index + 1) & (SBQ_WAIT_QUEUES - 1);
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index f398e0ae548e..c6e2f1f2c4d2 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -628,6 +628,46 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
 
+static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu, int tag)
+{
+	if (likely(!sb->round_robin && tag < sb->depth))
+		*per_cpu_ptr(sb->alloc_hint, cpu) = tag;
+}
+
+void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
+				int *tags, int nr_tags)
+{
+	struct sbitmap *sb = &sbq->sb;
+	unsigned long *addr = NULL;
+	unsigned long mask = 0;
+	int i;
+
+	smp_mb__before_atomic();
+	for (i = 0; i < nr_tags; i++) {
+		const int tag = tags[i] - offset;
+		unsigned long *this_addr;
+
+		/* since we're clearing a batch, skip the deferred map */
+		this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word;
+		if (!addr) {
+			addr = this_addr;
+		} else if (addr != this_addr) {
+			atomic_long_andnot(mask, (atomic_long_t *) addr);
+			mask = 0;
+			addr = this_addr;
+		}
+		mask |= (1UL << SB_NR_TO_BIT(sb, tag));
+	}
+
+	if (mask)
+		atomic_long_andnot(mask, (atomic_long_t *) addr);
+
+	smp_mb__after_atomic();
+	sbitmap_queue_wake_up(sbq);
+	sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
+				tags[nr_tags - 1] - offset);
+}
+
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu)
 {
@@ -652,9 +692,7 @@ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 	 */
 	smp_mb__after_atomic();
 	sbitmap_queue_wake_up(sbq);
-
-	if (likely(!sbq->sb.round_robin && nr < sbq->sb.depth))
-		*per_cpu_ptr(sbq->sb.alloc_hint, cpu) = nr;
+	sbitmap_update_cpu_hint(&sbq->sb, cpu, nr);
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
sbitmap currently only supports clearing tags one-by-one; add a helper
that allows the caller to pass in an array of tags to clear.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 11 +++++++++++
 lib/sbitmap.c           | 44 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 52 insertions(+), 3 deletions(-)
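As a usage illustration, a completion path that has gathered several finished
tags can now release them with one call; clears that land in the same word are
coalesced into a single atomic andnot. The caller, batch size and names below
are hypothetical, not from this series:

#include <linux/sbitmap.h>

#define DEMO_BATCH_MAX	16	/* illustrative batch size, not from the series */

/* Hypothetical caller: free a set of tags allocated from @sbq in one go. */
static void demo_complete_tags(struct sbitmap_queue *sbq,
			       const unsigned int *done, int nr_done)
{
	int tags[DEMO_BATCH_MAX];
	int i;

	for (i = 0; i < nr_done && i < DEMO_BATCH_MAX; i++)
		tags[i] = done[i];	/* tags as handed out by @sbq, no offset */

	/* the helper expects at least one tag (it reads tags[nr_tags - 1]) */
	if (i)
		sbitmap_queue_clear_batch(sbq, 0, tags, i);
}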