| Message ID | 20211012181742.672391-3-axboe@kernel.dk (mailing list archive) |
|---|---|
| State | New, archived |
| Series | Batched completions |
On 10/12/21 11:17 AM, Jens Axboe wrote:
> +void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
> +				int *tags, int nr_tags)
> +{
> +	struct sbitmap *sb = &sbq->sb;
> +	unsigned long *addr = NULL;
> +	unsigned long mask = 0;
> +	int i;
> +
> +	smp_mb__before_atomic();
> +	for (i = 0; i < nr_tags; i++) {
> +		const int tag = tags[i] - offset;
> +		unsigned long *this_addr;
> +
> +		/* since we're clearing a batch, skip the deferred map */
> +		this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word;
> +		if (!addr) {
> +			addr = this_addr;
> +		} else if (addr != this_addr) {
> +			atomic_long_andnot(mask, (atomic_long_t *) addr);
> +			mask = 0;
> +			addr = this_addr;
> +		}
> +		mask |= (1UL << SB_NR_TO_BIT(sb, tag));
> +	}
> +
> +	if (mask)
> +		atomic_long_andnot(mask, (atomic_long_t *) addr);
> +
> +	smp_mb__after_atomic();
> +	sbitmap_queue_wake_up(sbq);
> +	sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
> +				tags[nr_tags - 1] - offset);
> +}
> +
>  void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
>  			 unsigned int cpu)
>  {

How does replacing the sbitmap_queue_clear() implementation by calling
sbitmap_queue_clear_batch() affect performance? I'm wondering whether it
is possible to prevent code duplication without affecting performance
negatively.

Thanks,

Bart.
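For illustration only, the kind of deduplication Bart is asking about could in
principle look like the sketch below, with sbitmap_queue_clear() forwarding a
single tag to the batch helper. This is not part of the posted series, and it
would change behaviour: the batch helper skips the deferred ->cleared map and
updates the alloc hint for the CPU running the completion rather than the
caller-supplied cpu, which is exactly the caveat raised in the reply that
follows.

/*
 * Hypothetical sketch, not from the posted patch: single-tag clear
 * expressed via the batch helper.  Behavioural caveats: this bypasses
 * the deferred ->cleared map and the 'cpu' argument is ignored, since
 * the batch helper hints the current CPU instead.
 */
void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
			 unsigned int cpu)
{
	int tag = nr;

	sbitmap_queue_clear_batch(sbq, 0, &tag, 1);
}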
On 10/12/21 12:29 PM, Bart Van Assche wrote:
> On 10/12/21 11:17 AM, Jens Axboe wrote:
>> +void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
>> +				int *tags, int nr_tags)
>> +{
>> +	struct sbitmap *sb = &sbq->sb;
>> +	unsigned long *addr = NULL;
>> +	unsigned long mask = 0;
>> +	int i;
>> +
>> +	smp_mb__before_atomic();
>> +	for (i = 0; i < nr_tags; i++) {
>> +		const int tag = tags[i] - offset;
>> +		unsigned long *this_addr;
>> +
>> +		/* since we're clearing a batch, skip the deferred map */
>> +		this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word;
>> +		if (!addr) {
>> +			addr = this_addr;
>> +		} else if (addr != this_addr) {
>> +			atomic_long_andnot(mask, (atomic_long_t *) addr);
>> +			mask = 0;
>> +			addr = this_addr;
>> +		}
>> +		mask |= (1UL << SB_NR_TO_BIT(sb, tag));
>> +	}
>> +
>> +	if (mask)
>> +		atomic_long_andnot(mask, (atomic_long_t *) addr);
>> +
>> +	smp_mb__after_atomic();
>> +	sbitmap_queue_wake_up(sbq);
>> +	sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
>> +				tags[nr_tags - 1] - offset);
>> +}
>> +
>>  void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
>>  			 unsigned int cpu)
>>  {
>
> How does replacing the sbitmap_queue_clear() implementation by calling
> sbitmap_queue_clear_batch() affect performance? I'm wondering whether it
> is possible to prevent code duplication without affecting performance
> negatively.

Good question. I'd rather defer that to a followup, though, if it ends up
making sense. It's not that simple: for the usual clear path we play some
tricks, deferring clears into a separate mask to avoid hitting the
cacheline repeatedly. That doesn't make sense to do for batched clears,
obviously, so the two work in slightly different ways, with the single
bit clear taking an extra step to increase efficiency.
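For context on the "deferred mask" mentioned above: in the sbitmap code of
this era the single-bit path does not touch ->word directly; it parks the bit
in a per-word ->cleared mask, which the allocator folds back into ->word only
when it runs out of free bits. Roughly, simplified from the sbitmap header of
this era and shown only to contrast with the batch path:

/*
 * Simplified illustration of the existing single-bit clear step, shown
 * here only to contrast with sbitmap_queue_clear_batch(), which writes
 * ->word directly: the bit is set in the per-word ->cleared mask and
 * folded back into ->word later, keeping the hot ->word cacheline from
 * being dirtied on every completion.
 */
static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
{
	unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared;

	set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
}

The batch helper can skip this step because coalescing all clears that land in
the same word into a single andnot already amortizes the cacheline traffic.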
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index e30b56023ead..4a6ff274335a 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -528,6 +528,17 @@ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu);
 
+/**
+ * sbitmap_queue_clear_batch() - Free a batch of allocated bits
+ * &struct sbitmap_queue.
+ * @sbq: Bitmap to free from.
+ * @offset: offset for each tag in array
+ * @tags: array of tags
+ * @nr_tags: number of tags in array
+ */
+void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
+				int *tags, int nr_tags);
+
 static inline int sbq_index_inc(int index)
 {
 	return (index + 1) & (SBQ_WAIT_QUEUES - 1);
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index f398e0ae548e..c6e2f1f2c4d2 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -628,6 +628,46 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
 
+static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu, int tag)
+{
+	if (likely(!sb->round_robin && tag < sb->depth))
+		*per_cpu_ptr(sb->alloc_hint, cpu) = tag;
+}
+
+void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
+				int *tags, int nr_tags)
+{
+	struct sbitmap *sb = &sbq->sb;
+	unsigned long *addr = NULL;
+	unsigned long mask = 0;
+	int i;
+
+	smp_mb__before_atomic();
+	for (i = 0; i < nr_tags; i++) {
+		const int tag = tags[i] - offset;
+		unsigned long *this_addr;
+
+		/* since we're clearing a batch, skip the deferred map */
+		this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word;
+		if (!addr) {
+			addr = this_addr;
+		} else if (addr != this_addr) {
+			atomic_long_andnot(mask, (atomic_long_t *) addr);
+			mask = 0;
+			addr = this_addr;
+		}
+		mask |= (1UL << SB_NR_TO_BIT(sb, tag));
+	}
+
+	if (mask)
+		atomic_long_andnot(mask, (atomic_long_t *) addr);
+
+	smp_mb__after_atomic();
+	sbitmap_queue_wake_up(sbq);
+	sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
+				tags[nr_tags - 1] - offset);
+}
+
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu)
 {
@@ -652,9 +692,7 @@ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 	 */
 	smp_mb__after_atomic();
 	sbitmap_queue_wake_up(sbq);
-
-	if (likely(!sbq->sb.round_robin && nr < sbq->sb.depth))
-		*per_cpu_ptr(sbq->sb.alloc_hint, cpu) = nr;
+	sbitmap_update_cpu_hint(&sbq->sb, cpu, nr);
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
sbitmap currently only supports clearing tags one-by-one; add a helper
that allows the caller to pass in an array of tags to clear.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 11 +++++++++++
 lib/sbitmap.c           | 44 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 52 insertions(+), 3 deletions(-)
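As a usage illustration, a completion path that has gathered several finished
tags can now release them with one call; clears that land in the same word are
coalesced into a single atomic andnot. The caller, batch size and names below
are hypothetical, not from this series:

#include <linux/sbitmap.h>

#define DEMO_BATCH_MAX	16	/* illustrative batch size, not from the series */

/* Hypothetical caller: free a set of tags allocated from @sbq in one go. */
static void demo_complete_tags(struct sbitmap_queue *sbq,
			       const unsigned int *done, int nr_done)
{
	int tags[DEMO_BATCH_MAX];
	int i;

	for (i = 0; i < nr_done && i < DEMO_BATCH_MAX; i++)
		tags[i] = done[i];	/* tags as handed out by @sbq, no offset */

	/* the helper expects at least one tag (it reads tags[nr_tags - 1]) */
	if (i)
		sbitmap_queue_clear_batch(sbq, 0, tags, i);
}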