block: switch to atomic_t for request references

Message ID 9f2ad6f1-c1bb-dfac-95c8-7d9eaa7110cc@kernel.dk (mailing list archive)
State New, archived
Series block: switch to atomic_t for request references

Commit Message

Jens Axboe Dec. 3, 2021, 3:35 p.m. UTC
refcount_t is not as expensive as it used to be, but it's still more
expensive than the io_uring method of using atomic_t and just checking
for potential over/underflow.

This borrows that same implementation, which in turn is based on the
mm implementation from Linus.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

---

Comments

Keith Busch Dec. 3, 2021, 3:56 p.m. UTC | #1
On Fri, Dec 03, 2021 at 08:35:40AM -0700, Jens Axboe wrote:
> refcount_t is not as expensive as it used to be, but it's still more
> expensive than the io_uring method of using atomic_t and just checking
> for potential over/underflow.
> 
> This borrows that same implementation, which in turn is based on the
> mm implementation from Linus.

Looks good.

Reviewed-by: Keith Busch <kbusch@kernel.org>
Christoph Hellwig Dec. 6, 2021, 6:53 a.m. UTC | #2
On Fri, Dec 03, 2021 at 08:35:40AM -0700, Jens Axboe wrote:
> refcount_t is not as expensive as it used to be, but it's still more
> expensive than the io_uring method of using atomic_t and just checking
> for potential over/underflow.
> 
> This borrows that same implementation, which in turn is based on the
> mm implementation from Linus.

If refcount_t isn't good enough for a normal kernel fast path we have
a problem.  Can we discuss that with the maintainers instead of coming
up with our home grown schemes again?

> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> 
> ---
> 
> diff --git a/block/blk-flush.c b/block/blk-flush.c
> index f78bb39e589e..e4df894189ce 100644
> --- a/block/blk-flush.c
> +++ b/block/blk-flush.c
> @@ -229,7 +229,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
>  	/* release the tag's ownership to the req cloned from */
>  	spin_lock_irqsave(&fq->mq_flush_lock, flags);
>  
> -	if (!refcount_dec_and_test(&flush_rq->ref)) {
> +	if (!req_ref_put_and_test(flush_rq)) {
>  		fq->rq_status = error;
>  		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
>  		return;
> @@ -349,7 +349,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
>  	 * and READ flush_rq->end_io
>  	 */
>  	smp_wmb();
> -	refcount_set(&flush_rq->ref, 1);
> +	req_ref_set(flush_rq, 1);
>  
>  	blk_flush_queue_rq(flush_rq, false);
>  }
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 995336abee33..380e2dd31bfc 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -228,7 +228,7 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
>  
>  	spin_lock_irqsave(&tags->lock, flags);
>  	rq = tags->rqs[bitnr];
> -	if (!rq || rq->tag != bitnr || !refcount_inc_not_zero(&rq->ref))
> +	if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq))
>  		rq = NULL;
>  	spin_unlock_irqrestore(&tags->lock, flags);
>  	return rq;
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index fa464a3e2f9a..22ec21aa0c22 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -386,7 +386,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
>  	INIT_LIST_HEAD(&rq->queuelist);
>  	/* tag was already set */
>  	WRITE_ONCE(rq->deadline, 0);
> -	refcount_set(&rq->ref, 1);
> +	req_ref_set(rq, 1);
>  
>  	if (rq->rq_flags & RQF_ELV) {
>  		struct elevator_queue *e = data->q->elevator;
> @@ -634,7 +634,7 @@ void blk_mq_free_request(struct request *rq)
>  	rq_qos_done(q, rq);
>  
>  	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
> -	if (refcount_dec_and_test(&rq->ref))
> +	if (req_ref_put_and_test(rq))
>  		__blk_mq_free_request(rq);
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_free_request);
> @@ -930,7 +930,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
>  		rq_qos_done(rq->q, rq);
>  
>  		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
> -		if (!refcount_dec_and_test(&rq->ref))
> +		if (!req_ref_put_and_test(rq))
>  			continue;
>  
>  		blk_crypto_free_request(rq);
> @@ -1373,7 +1373,7 @@ void blk_mq_put_rq_ref(struct request *rq)
>  {
>  	if (is_flush_rq(rq))
>  		rq->end_io(rq, 0);
> -	else if (refcount_dec_and_test(&rq->ref))
> +	else if (req_ref_put_and_test(rq))
>  		__blk_mq_free_request(rq);
>  }
>  
> @@ -3003,7 +3003,7 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
>  			unsigned long rq_addr = (unsigned long)rq;
>  
>  			if (rq_addr >= start && rq_addr < end) {
> -				WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
> +				WARN_ON_ONCE(req_ref_read(rq) != 0);
>  				cmpxchg(&drv_tags->rqs[i], rq, NULL);
>  			}
>  		}
> @@ -3337,7 +3337,7 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
>  	if (!tags)
>  		return;
>  
> -	WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);
> +	WARN_ON_ONCE(req_ref_read(flush_rq) != 0);
>  
>  	for (i = 0; i < queue_depth; i++)
>  		cmpxchg(&tags->rqs[i], flush_rq, NULL);
> diff --git a/block/blk.h b/block/blk.h
> index 296411900c55..f869f4b2dec9 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -473,4 +473,35 @@ static inline bool should_fail_request(struct block_device *part,
>  }
>  #endif /* CONFIG_FAIL_MAKE_REQUEST */
>  
> +/*
> + * Optimized request reference counting. Ideally we'd make timeouts be more
> + * clever, as that's the only reason we need references at all... But until
> + * this happens, this is faster than using refcount_t. Also see:
> + *
> + * abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count")
> + */
> +#define req_ref_zero_or_close_to_overflow(req)	\
> +	((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u)
> +
> +static inline bool req_ref_inc_not_zero(struct request *req)
> +{
> +	return atomic_inc_not_zero(&req->ref);
> +}
> +
> +static inline bool req_ref_put_and_test(struct request *req)
> +{
> +	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
> +	return atomic_dec_and_test(&req->ref);
> +}
> +
> +static inline void req_ref_set(struct request *req, int value)
> +{
> +	atomic_set(&req->ref, value);
> +}
> +
> +static inline int req_ref_read(struct request *req)
> +{
> +	return atomic_read(&req->ref);
> +}
> +
>  #endif /* BLK_INTERNAL_H */
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index bfc3cc61f653..ecdc049b52fa 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -138,7 +138,7 @@ struct request {
>  	unsigned short ioprio;
>  
>  	enum mq_rq_state state;
> -	refcount_t ref;
> +	atomic_t ref;
>  
>  	unsigned long deadline;
>  
> -- 
> Jens Axboe
> 
---end quoted text---
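
For reference, the req_ref_zero_or_close_to_overflow() check in the quoted
patch relies on unsigned wrap-around: the expression is true exactly when the
counter is zero or within 127 of wrapping past UINT_MAX, i.e. when it has
underflowed. A small standalone sketch (plain userspace C, not kernel code;
the sample values are arbitrary) of which counter values trip the warning:

#include <stdio.h>

/* Same test as the patch, applied to a plain int instead of &req->ref. */
static int zero_or_close_to_overflow(int v)
{
	return (unsigned int)v + 127u <= 127u;
}

int main(void)
{
	int samples[] = { -128, -127, -1, 0, 1, 2, 127 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%5d -> %s\n", samples[i],
		       zero_or_close_to_overflow(samples[i]) ? "WARN" : "ok");
	return 0;
}

So req_ref_put_and_test() warns when asked to drop a reference on a request
whose count is already zero or slightly negative, which is the underflow
signature discussed further down the thread.
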
Peter Zijlstra Dec. 6, 2021, 8:31 a.m. UTC | #3
On Sun, Dec 05, 2021 at 10:53:49PM -0800, Christoph Hellwig wrote:
> On Fri, Dec 03, 2021 at 08:35:40AM -0700, Jens Axboe wrote:
> > refcount_t is not as expensive as it used to be, but it's still more
> > expensive than the io_uring method of using atomic_t and just checking
> > for potential over/underflow.
> > 
> > This borrows that same implementation, which in turn is based on the
> > mm implementation from Linus.
> 
> If refcount_t isn't good enough for a normal kernel fast path we have
> a problem.  Can we discuss that with the maintainers instead of coming
> up with our home grown schemes again?

Quite; and for something that pretends to be about performance, it also
lacks any actual numbers to back that claim.

The proposed implementation also doesn't do nearly as much as the
refcount_t one does.

Anyway refcount_t is just a single "lock xadd" and a few branches, where
does it go wrong? Do you have perf output to compare between them?
Jens Axboe Dec. 6, 2021, 4:31 p.m. UTC | #4
On 12/5/21 11:53 PM, Christoph Hellwig wrote:
> On Fri, Dec 03, 2021 at 08:35:40AM -0700, Jens Axboe wrote:
>> refcount_t is not as expensive as it used to be, but it's still more
>> expensive than the io_uring method of using atomic_t and just checking
>> for potential over/underflow.
>>
>> This borrows that same implementation, which in turn is based on the
>> mm implementation from Linus.
> 
> If refcount_t isn't good enough for a normal kernel fast path we have
> a problem.  Can we discuss that with the maintainers instead of coming
> up with our home grown schemes again?

I think what needs to happen next here is that the code is put into a
separate header so that the vm, io_uring, and block can all use it.
refcount_t is better than it used to be, but there's a difference
between fast path and tens of millions of inc/decs per second.
Jens Axboe Dec. 6, 2021, 4:32 p.m. UTC | #5
On 12/6/21 1:31 AM, Peter Zijlstra wrote:
> On Sun, Dec 05, 2021 at 10:53:49PM -0800, Christoph Hellwig wrote:
>> On Fri, Dec 03, 2021 at 08:35:40AM -0700, Jens Axboe wrote:
>>> refcount_t is not as expensive as it used to be, but it's still more
>>> expensive than the io_uring method of using atomic_t and just checking
>>> for potential over/underflow.
>>>
>>> This borrows that same implementation, which in turn is based on the
>>> mm implementation from Linus.
>>
>> If refcount_t isn't good enough for a normal kernel fast path we have
>> a problem.  Can we discuss that with the maintainers instead of coming
>> up with our home grown schemes again?
> 
> Quite; and for something that pretends to be about performance, it also
> lacks any actual numbers to back that claim.

I can certainly generate that; it was already done for the two previous
similar conversions though.

> The proposed implementation also doesn't do nearly as much as the
> refcount_t one does.
> 
> Anyway refcount_t is just a single "lock xadd" and a few branches, where
> does it go wrong? Do you have perf output to compare between them?

I'll generate that.
Peter Zijlstra Dec. 6, 2021, 5:19 p.m. UTC | #6
On Mon, Dec 06, 2021 at 09:32:06AM -0700, Jens Axboe wrote:
> On 12/6/21 1:31 AM, Peter Zijlstra wrote:
> > On Sun, Dec 05, 2021 at 10:53:49PM -0800, Christoph Hellwig wrote:
> >> On Fri, Dec 03, 2021 at 08:35:40AM -0700, Jens Axboe wrote:
> >>> refcount_t is not as expensive as it used to be, but it's still more
> >>> expensive than the io_uring method of using atomic_t and just checking
> >>> for potential over/underflow.
> >>>
> >>> This borrows that same implementation, which in turn is based on the
> >>> mm implementation from Linus.
> >>
> >> If refcount_t isn't good enough for a normal kernel fast path we have
> >> a problem.  Can we discuss that with the maintainers instead of coming
> >> up with our home grown schemes again?
> > 
> > Quite; and for something that pretends to be about performance, it also
> > lacks any actual numbers to back that claim.
> 
> I can certainly generate that, it was already done for the two previous
> similar conversions though.

I've never seen those :-(
Linus Torvalds Dec. 6, 2021, 5:35 p.m. UTC | #7
On Mon, Dec 6, 2021 at 12:31 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Quite; and for something that pretends to be about performance, it also
> lacks any actual numbers to back that claim.
>
> The proposed implementation also doesn't do nearly as much as the
> refcount_t one does.

Stop pretending refcount_t is that great.

It's horrid. The code it generates is disgusting. It should never
have been inlined in the first place, and the design decisions were
questionable to begin with.

There's a reason core stuff (like the page counters) DO NOT USE REFCOUNT_T.

I seriously believe that refcount_t should be used for things like
device reference counting or similar issues, and not for _any_ truly
core code.

                  Linus
Jens Axboe Dec. 6, 2021, 6:13 p.m. UTC | #8
On 12/6/21 10:35 AM, Linus Torvalds wrote:
> On Mon, Dec 6, 2021 at 12:31 AM Peter Zijlstra <peterz@infradead.org> wrote:
>>
>> Quite; and for something that pretends to be about performance, it also
>> lacks any actual numbers to back that claim.
>>
>> The proposed implementation also doesn't do nearly as much as the
>> refcount_t one does.
> 
> Stop pretending refcount_t is that great.
> 
> It's horrid. The code it generates is disgusting. It should never
> have been inlined in the first place, and the design decisions were
> questionable to begin with.
> 
> There's a reason core stuff (like the page counters) DO NOT USE REFCOUNT_T.
> 
> I seriously believe that refcount_t should be used for things like
> device reference counting or similar issues, and not for _any_ truly
> core code.

Maybe we just need to embrace it generically, took a quick stab at it
which is attached. Totally untested...
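
The attachment itself is not reproduced in this archive. Purely to illustrate
the direction described here (one shared header the mm, io_uring and block
code could all use), a minimal sketch might look like the following; it just
generalizes the req_ref_*() helpers from the patch at the top of the thread
and borrows the atomic_ref_*() names that come up later in the discussion.
The bare-atomic_t signatures are a guess, not Jens' actual patch.

/* Hypothetical shared header -- not the actual attachment. */
#include <linux/atomic.h>
#include <linux/bug.h>

/* True when the counter is zero or within 127 of wrapping (underflow). */
#define atomic_ref_zero_or_close_to_overflow(ref)	\
	((unsigned int) atomic_read(ref) + 127u <= 127u)

static inline void atomic_ref_set(atomic_t *ref, int value)
{
	atomic_set(ref, value);
}

static inline bool atomic_ref_inc_not_zero(atomic_t *ref)
{
	return atomic_inc_not_zero(ref);
}

static inline bool atomic_ref_put_and_test(atomic_t *ref)
{
	WARN_ON_ONCE(atomic_ref_zero_or_close_to_overflow(ref));
	return atomic_dec_and_test(ref);
}
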
Kees Cook Dec. 6, 2021, 8:51 p.m. UTC | #9
On Mon, Dec 06, 2021 at 11:13:20AM -0700, Jens Axboe wrote:
> On 12/6/21 10:35 AM, Linus Torvalds wrote:
> > On Mon, Dec 6, 2021 at 12:31 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >>
> >> Quite; and for something that pretends to be about performance, it also
> >> lacks any actual numbers to back that claim.
> >>
> >> The proposed implementation also doesn't do nearly as much as the
> >> refcount_t one does.
> > 
> > Stop pretending refcount_t is that great.
> > 
> > It's horrid. The code it generates is disgusting. It should never
> > have been inlined in the first place, and the design decisions were
> > questionable to begin with.
> > 
> > There's a reason core stuff (like the page counters) DO NOT USE REFCOUNT_T.
> > 
> > I seriously believe that refcount_t should be used for things like
> > device reference counting or similar issues, and not for _any_ truly
> > core code.

I'd like core code to be safe too, though. :)

> Maybe we just need to embrace it generically, took a quick stab at it
> which is attached. Totally untested...

As long as we have an API that can't end up in a pathological state, I'm
happy. The problem with prior atomic_t use was that it never noticed
when it was entering a condition that could be used to confuse system
state (use-after-free, etc). Depending on people to "use it correctly"
or never make mistakes is not sufficient: we need an API that protects
itself. We have to assume there are, and will continue to be, bugs with
refcounting.
Linus Torvalds Dec. 6, 2021, 9:17 p.m. UTC | #10
On Mon, Dec 6, 2021 at 12:51 PM Kees Cook <keescook@chromium.org> wrote:
>
> I'd like core code to be safe too, though. :)

I've told you before, and I'll tell you again: 'refcount_t' is not "safe".

refcount_t is pure garbage, and is HORRID.

The saturating arithmetic is a fundamental and fatal flaw. It is pure
and utter crap.

It means that you *cannot* recover from mistakes, and the refcount
code is broken.

Stop calling it "safe" or arguing that it protects against anything at all.

It is literally and objectively WORSE than what the page counting code
does using atomics.

I don't know why you cannot seem to admit to that. refcount_t is
misdesigned in a very fundamental way.

It generates horrible code, but it also generates actively BROKEN
code. Saturation is not the answer. Saturation is the question, and
the answer is "No".

And no, anybody who thinks that saturation is "safe" is just lying to
themselves and others.

The most realistic worry for refcounting tends to be underflow,
and the whole refcount underflow situation is no better than an atomic
with a warning.

Because by the time the underflow is detected, it's already too late -
the "decrement to zero" was what resulted in the data being free'd -
so the whole "decrement past zero" is when things have already gone
south earlier, and all you can do is warn.

End result: atomics with warnings are no worse in the underflow case.

And the overflow case, where the saturation is "safe", has been
literally mis-designed to not be recoverable, so refcount_t is
literally broken.

End result: atomics are _better_ in the overflow case, and it's why
the page counters could not use the garbage that is refcount_t, and
instead did it properly.

See? In absolutely neither case is refcount_t "safer". It's only worse.

I like Jens' patches. They take the _good_ code - the code we use for
page counters - and make that proper interface available to others.

Not the broken refcount_t code that is unfixable, and only good for
"we have a thousand drivers, let them wallow in this thing because we
can never make them all handle overflows properly".

And every single core use of refcount_t that doesn't have a million
random drivers using it should be converted to use that proper atomic
interface.

                    Linus
Kees Cook Dec. 6, 2021, 11:28 p.m. UTC | #11
On Mon, Dec 06, 2021 at 01:17:17PM -0800, Linus Torvalds wrote:
> End result: atomics are _better_ in the overflow case, and it's why
> the page counters could not use the garbage that is refcount_t, and
> instead did it properly.
> 
> See? In absolutely neither case is refcount_t "safer". It's only worse.

Right, I understand your objection; it is valid. One of the dimensions of
"safe" is "not exploitable", which is _also_ a valid concern. As you
say, refcount_t works for the "never make them all handle overflows
properly" case, and I'm fine with using something else where we need a
better set of behaviors.

> I like Jens' patches. They take the _good_ code - the code we use for
> page counters - and make that proper interface available to others.

I am fine with whatever can be made safe in all dimensions, but I don't
want to lose sight of the exploitability dimension. This is a lot like
the BUG vs WARN situation: BUG is unfixable because there is no recovery.
WARN allows the code to do something sensible in the pathological case,
but the code needs to have been designed to handle that case. The
widely used older atomic_t reference counting code pattern had no path
to handling failure.

In the proposed API, we get a warning (sometimes) in bad states, but
there is no handling of the broken reference counter. For example,
with atomic_ref_inc_not_zero():

- there's no __must_check hint for callers to actually check it happened
- overflow is silent, so wrapping around to 1 and then having a
  call to atomic_ref_put_and_test() has no protection against exploitation
  at all. This particular code pattern mistake (missed "put") is the
  fundamental path to nearly all the refcount overflow exploits of the
  past couple decades. e.g. see:
	- CVE-2016-0728 - Keyring refcount overflow.
	  Exploit: https://www.exploit-db.com/exploits/39277/
	- CVE-2016-4558 - BPF reference count mishandling.
          Exploit: https://www.exploit-db.com/exploits/39773/
  Losing that protection is just inviting these exploits back again (of
  which we've had none in the past few years).

For the API generally, nothing about the type stops someone from
accidentally using the standard atomic_t helpers instead, accidentally
bypassing any potential WARNs. It should do something similar to
refcount_t so the compiler can help guide people correctly instead of
blindly accepting an accident.

And if we're speaking to safety/robustness generally, where are the unit
tests, run-time tests (LKDTM provides this for refcount_t so it should
be easy to repurpose them), kern-doc, etc?

I'm not arguing for refcount_t -- I'm arguing for an API that isn't a
regression of features that have been protecting the kernel from bugs.
Linus Torvalds Dec. 7, 2021, 12:13 a.m. UTC | #12
On Mon, Dec 6, 2021 at 3:28 PM Kees Cook <keescook@chromium.org> wrote:
>
> I'm not arguing for refcount_t -- I'm arguing for an API that isn't a
> regression of features that have been protecting the kernel from bugs.

Maybe somebody could actually just fix refcount_t instead. Somebody
who cares about that currently horrendously bad interface.

Fix it to not do the fundamentally broken saturation that actively
destroys state: fix it to have a safe "try to increment", instead of
an unsafe "increment and do bad things".

Fix it to not unnecessarily use expensive compare-and-exchange loops,
when you can safely just race a bit, safe in the knowledge that you're
not going to race 2**31 times.

IOW, I think that "try_get_page()" function is basically the *much*
superior version of what is currently a broken "refcount_inc()".

And yes, it does warn about that overflow case that you claim only
refcount_t does. And does so without the broken semantics that
refcount_t has.

                Linus
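
For readers without the mm code in their head, the pattern being held up here
is roughly the following. This is a simplified sketch, not the literal
try_get_page() source (which also has to deal with compound pages), and
try_get_ref() is a made-up name. The point is that the increment-from-zero
case is reported to the caller instead of being saturated away, and the
check-then-increment is deliberately racy because, as argued above, you are
not going to race 2^31 times:

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/compiler.h>

static inline bool __must_check try_get_ref(atomic_t *ref)
{
	/* Already zero (or negative): tell the caller rather than paper over it. */
	if (WARN_ON_ONCE(atomic_read(ref) <= 0))
		return false;
	atomic_inc(ref);
	return true;
}

Callers then have the same shape as refcount_inc_not_zero() users already do:
check the return value and bail out. The complaint above is specifically about
the void-returning, saturating refcount_inc().
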
Kees Cook Dec. 7, 2021, 4:56 a.m. UTC | #13
On Mon, Dec 06, 2021 at 04:13:00PM -0800, Linus Torvalds wrote:
> On Mon, Dec 6, 2021 at 3:28 PM Kees Cook <keescook@chromium.org> wrote:
> >
> > I'm not arguing for refcount_t -- I'm arguing for an API that isn't a
> > regression of features that have been protecting the kernel from bugs.
> 
> Maybe somebody could actually just fix refcount_t instead. Somebody
> who cares about that currently horrendously bad interface.
> 
> Fix it to not do the fundamentally broken saturation that actively
> destroys state: fix it to have a safe "try to increment", instead of
> an unsafe "increment and do bad things".

There would need to be a pretty hefty transition -- there are a lot of
refcount_inc() uses that would need checking and error handling (which
might not be sane to add to ancient drivers):

      2 block
      2 crypto
      2 ipc
      2 virt
      3 mm
      4 sound
      5 rust
     10 arch
     13 security
     31 kernel
     88 include
    192 fs
    192 net
    358 drivers

refcount_inc_not_zero() already uses __must_check, etc.

I'm not afraid of giant transitions, but this could be pretty tricky.
I'm open to ideas. Maybe a treewide change of refcount_inc() ->
refcount_inc_saturating() and then start fixing all the _unsafe() cases
where a sensible error path could be created and tested?
Peter Zijlstra Dec. 7, 2021, 9:34 a.m. UTC | #14
On Mon, Dec 06, 2021 at 04:13:00PM -0800, Linus Torvalds wrote:

> Fix it to not unnecessarily use expensive compare-and-exchange loops,
> when you can safely just race a bit, safe in the knowledge that you're
> not going to race 2**31 times.
> 
> IOW, I think that "try_get_page()" function is basically the *much*
> superior version of what is currently a broken "refcount_inc()".
> 
> And yes, it does warn about that overflow case that you claim only
> refcount_t does. And does so without the broken semantics that
> refcount_t has.

I think you should perhaps look at the current code again. Those cmpxchg
loops are gone. The only cmpxchg loop that's still there is for
add_not_zero(), and that is the exact same loop we have for
atomic_inc_not_zero(v) := atomic_add_unless(v,1,0) :=
atomic_fetch_add_unless(v,1,0) != 0.

The rest is exactly that LOCK XADD and assume you don't race 2^31 bits
worth.

Now, it could be GCC generates atrociously bad code simply because it
doesn't know it can use the flags from the XADD in which case we can
look at doing an arch asm implementation.
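
To make the equivalence concrete, this is roughly the shape of the generic
fallback behind the not-zero variant (a sketch; the my_ names are made up,
atomic_try_cmpxchg() is the real primitive). The loop only retries when
another CPU changed the counter between the read and the cmpxchg, so the cost
model is the same whether the counter sits in a refcount_t or a bare atomic_t:

#include <linux/atomic.h>

static inline int my_fetch_add_unless(atomic_t *v, int a, int u)
{
	int c = atomic_read(v);

	do {
		if (c == u)		/* forbidden old value: don't add */
			break;
	} while (!atomic_try_cmpxchg(v, &c, c + a));

	return c;			/* old value; != u means the add happened */
}

static inline bool my_inc_not_zero(atomic_t *v)
{
	return my_fetch_add_unless(v, 1, 0) != 0;
}
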
Peter Zijlstra Dec. 7, 2021, 10:30 a.m. UTC | #15
On Mon, Dec 06, 2021 at 04:13:00PM -0800, Linus Torvalds wrote:

> IOW, I think that "try_get_page()" function is basically the *much*
> superior version of what is currently a broken "refcount_inc()".
> 
> And yes, it does warn about that overflow case that you claim only
> refcount_t does. And does so without the broken semantics that
> refcount_t has.

That places the burden of the unlikely overflow case on the user. Now
every driver author needs to consider the overflow case and have a
corresponding error path. How many bugs will that introduce?

Why is saturation, IOW leaking the memory, a worse option than having
bad/broken/never-tested error paths all over the kernel?
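
To put that trade-off in concrete terms, compare the two shapes a "get" path
can take. struct obj, the helper names and the error code below are made up;
refcount_inc() and refcount_inc_not_zero() are the existing APIs. The
not-zero variant only reports the already-at-zero case, but the shape of the
error handling is exactly the burden being discussed here for overflow:

#include <linux/refcount.h>
#include <linux/errno.h>

struct obj {
	refcount_t refs;
};

/* Saturating style: nothing for the caller to handle. */
static void obj_get(struct obj *o)
{
	refcount_inc(&o->refs);	/* on over/underflow: WARN once and saturate */
}

/* Checked style: every caller now carries a rarely exercised error path. */
static int __must_check obj_tryget(struct obj *o)
{
	if (!refcount_inc_not_zero(&o->refs))
		return -ENXIO;
	return 0;
}
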
Peter Zijlstra Dec. 7, 2021, 11:26 a.m. UTC | #16
On Sun, Dec 05, 2021 at 10:53:49PM -0800, Christoph Hellwig wrote:

> > +#define req_ref_zero_or_close_to_overflow(req)	\
> > +	((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u)
> > +
> > +static inline bool req_ref_inc_not_zero(struct request *req)
> > +{
> > +	return atomic_inc_not_zero(&req->ref);
> > +}
> > +
> > +static inline bool req_ref_put_and_test(struct request *req)
> > +{
> > +	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
> > +	return atomic_dec_and_test(&req->ref);
> > +}

So it's just about these two ops, right?

Now, afaict refcount_inc_not_zero() doesn't actually generate terrible
code; here's the fast-path of kernel/events/core.c:ring_buffer_get()

refcount_inc_not_zero():

    a9d0:       41 54                   push   %r12
    a9d2:       49 89 fc                mov    %rdi,%r12
    a9d5:       e8 00 00 00 00          call   a9da <ring_buffer_get+0xa>       a9d6: R_X86_64_PLT32    __rcu_read_lock-0x4
    a9da:       4d 8b a4 24 c8 02 00 00         mov    0x2c8(%r12),%r12
    a9e2:       4d 85 e4                test   %r12,%r12
    a9e5:       74 24                   je     aa0b <ring_buffer_get+0x3b>
    a9e7:       41 8b 14 24             mov    (%r12),%edx
    a9eb:       85 d2                   test   %edx,%edx
    a9ed:       74 1c                   je     aa0b <ring_buffer_get+0x3b>
    a9ef:       8d 4a 01                lea    0x1(%rdx),%ecx
*   a9f2:       89 d0                   mov    %edx,%eax
    a9f4:       f0 41 0f b1 0c 24       lock cmpxchg %ecx,(%r12)
    a9fa:       75 32                   jne    aa2e <ring_buffer_get+0x5e>
*   a9fc:       09 ca                   or     %ecx,%edx
*   a9fe:       78 19                   js     aa19 <ring_buffer_get+0x49>
    aa00:       e8 00 00 00 00          call   aa05 <ring_buffer_get+0x35>      aa01: R_X86_64_PLT32    __rcu_read_unlock-0x4
    aa05:       4c 89 e0                mov    %r12,%rax
    aa08:       41 5c                   pop    %r12
    aa0a:       c3                      ret

The * marked instructions are the difference, vs atomic_inc_not_zero():

    a9d0:       41 54                   push   %r12
    a9d2:       49 89 fc                mov    %rdi,%r12
    a9d5:       e8 00 00 00 00          call   a9da <ring_buffer_get+0xa>       a9d6: R_X86_64_PLT32    __rcu_read_lock-0x4
    a9da:       4d 8b a4 24 c8 02 00 00         mov    0x2c8(%r12),%r12
    a9e2:       4d 85 e4                test   %r12,%r12
    a9e5:       74 1e                   je     aa05 <ring_buffer_get+0x35>
    a9e7:       41 8b 04 24             mov    (%r12),%eax
    a9eb:       85 c0                   test   %eax,%eax
    a9ed:       74 16                   je     aa05 <ring_buffer_get+0x35>
    a9ef:       8d 50 01                lea    0x1(%rax),%edx
    a9f2:       f0 41 0f b1 14 24       lock cmpxchg %edx,(%r12)
    a9f8:       75 f1                   jne    a9eb <ring_buffer_get+0x1b>
    a9fa:       e8 00 00 00 00          call   a9ff <ring_buffer_get+0x2f>      a9fb: R_X86_64_PLT32    __rcu_read_unlock-0x4
    a9ff:       4c 89 e0                mov    %r12,%rax
    aa02:       41 5c                   pop    %r12
    aa04:       c3                      ret


Now, ring_buffer_put(), which uses refcount_dec_and_test():

refcount_dec_and_test()

    aa40:       b8 ff ff ff ff          mov    $0xffffffff,%eax
    aa45:       f0 0f c1 07             lock xadd %eax,(%rdi)
    aa49:       83 f8 01                cmp    $0x1,%eax
    aa4c:       74 05                   je     aa53 <ring_buffer_put+0x13>
    aa4e:       85 c0                   test   %eax,%eax
    aa50:       7e 1e                   jle    aa70 <ring_buffer_put+0x30>
    aa52:       c3                      ret

atomic_dec_and_test():

    aa40:       f0 ff 0f                lock decl (%rdi)
    aa43:       75 1d                   jne    aa62 <ring_buffer_put+0x22>

    ...

    aa62:       c3                      ret

Has a larger difference, which is fixable with the below patch, leading
to:


    a9f0:       f0 ff 0f                lock decl (%rdi)
    a9f3:       74 03                   je     a9f8 <ring_buffer_put+0x8>
    a9f5:       7c 1e                   jl     aa15 <ring_buffer_put+0x25>
    a9f7:       c3                      ret


So where exactly is the performance fail? Is it purely the mess made of
refcount_dec_and_test() ?

---

diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
new file mode 100644
index 000000000000..89e1f84f9170
--- /dev/null
+++ b/arch/x86/include/asm/refcount.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_REFCOUNT_H
+#define _ASM_X86_REFCOUNT_H
+
+#define refcount_dec_and_test refcount_dec_and_test
+static inline bool refcount_dec_and_test(refcount_t *r)
+{
+	asm_volatile_goto (LOCK_PREFIX "decl %[var]\n\t"
+			   "jz %l[cc_zero]\n\t"
+			   "jl %l[cc_error]"
+			   : : [var] "m" (r->refs.counter)
+			   : "memory"
+			   : cc_zero, cc_error);
+	return false;
+
+cc_zero:
+	return true;
+
+cc_error:
+	refcount_warn_saturate(r, REFCOUNT_SUB_UAF);
+	return false;
+}
+
+#endif
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index b8a6e387f8f9..776b035e12a1 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -126,6 +126,8 @@ enum refcount_saturation_type {
 
 void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t);
 
+#include <asm/refcount.h>
+
 /**
  * refcount_set - set a refcount's value
  * @r: the refcount
@@ -328,10 +330,12 @@ static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp
  *
  * Return: true if the resulting refcount is 0, false otherwise
  */
+#ifndef refcount_dec_and_test
 static inline __must_check bool refcount_dec_and_test(refcount_t *r)
 {
 	return __refcount_dec_and_test(r, NULL);
 }
+#endif
 
 static inline void __refcount_dec(refcount_t *r, int *oldp)
 {
Peter Zijlstra Dec. 7, 2021, 1:28 p.m. UTC | #17
On Tue, Dec 07, 2021 at 12:26:24PM +0100, Peter Zijlstra wrote:
> On Sun, Dec 05, 2021 at 10:53:49PM -0800, Christoph Hellwig wrote:
> 
> > > +#define req_ref_zero_or_close_to_overflow(req)	\
> > > +	((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u)
> > > +
> > > +static inline bool req_ref_inc_not_zero(struct request *req)
> > > +{
> > > +	return atomic_inc_not_zero(&req->ref);
> > > +}
> > > +
> > > +static inline bool req_ref_put_and_test(struct request *req)
> > > +{
> > > +	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
> > > +	return atomic_dec_and_test(&req->ref);
> > > +}
> 
> So it's just about these two ops, right?
> 
> Now, afaict refcount_inc_not_zero() doesn't actually generate terrible
> code; here's the fast-path of kernel/events/core.c:ring_buffer_get()
> 
> refcount_inc_not_zero():
> 
>     a9d0:       41 54                   push   %r12
>     a9d2:       49 89 fc                mov    %rdi,%r12
>     a9d5:       e8 00 00 00 00          call   a9da <ring_buffer_get+0xa>       a9d6: R_X86_64_PLT32    __rcu_read_lock-0x4
>     a9da:       4d 8b a4 24 c8 02 00 00         mov    0x2c8(%r12),%r12
>     a9e2:       4d 85 e4                test   %r12,%r12
>     a9e5:       74 24                   je     aa0b <ring_buffer_get+0x3b>
>     a9e7:       41 8b 14 24             mov    (%r12),%edx
>     a9eb:       85 d2                   test   %edx,%edx
>     a9ed:       74 1c                   je     aa0b <ring_buffer_get+0x3b>
>     a9ef:       8d 4a 01                lea    0x1(%rdx),%ecx
> *   a9f2:       89 d0                   mov    %edx,%eax
>     a9f4:       f0 41 0f b1 0c 24       lock cmpxchg %ecx,(%r12)
>     a9fa:       75 32                   jne    aa2e <ring_buffer_get+0x5e>
> *   a9fc:       09 ca                   or     %ecx,%edx
> *   a9fe:       78 19                   js     aa19 <ring_buffer_get+0x49>
>     aa00:       e8 00 00 00 00          call   aa05 <ring_buffer_get+0x35>      aa01: R_X86_64_PLT32    __rcu_read_unlock-0x4
>     aa05:       4c 89 e0                mov    %r12,%rax
>     aa08:       41 5c                   pop    %r12
>     aa0a:       c3                      ret
> 
> The * marked instructions are the difference, vs atomic_inc_not_zero():
> 
>     a9d0:       41 54                   push   %r12
>     a9d2:       49 89 fc                mov    %rdi,%r12
>     a9d5:       e8 00 00 00 00          call   a9da <ring_buffer_get+0xa>       a9d6: R_X86_64_PLT32    __rcu_read_lock-0x4
>     a9da:       4d 8b a4 24 c8 02 00 00         mov    0x2c8(%r12),%r12
>     a9e2:       4d 85 e4                test   %r12,%r12
>     a9e5:       74 1e                   je     aa05 <ring_buffer_get+0x35>
>     a9e7:       41 8b 04 24             mov    (%r12),%eax
>     a9eb:       85 c0                   test   %eax,%eax
>     a9ed:       74 16                   je     aa05 <ring_buffer_get+0x35>
>     a9ef:       8d 50 01                lea    0x1(%rax),%edx
>     a9f2:       f0 41 0f b1 14 24       lock cmpxchg %edx,(%r12)
>     a9f8:       75 f1                   jne    a9eb <ring_buffer_get+0x1b>
>     a9fa:       e8 00 00 00 00          call   a9ff <ring_buffer_get+0x2f>      a9fb: R_X86_64_PLT32    __rcu_read_unlock-0x4
>     a9ff:       4c 89 e0                mov    %r12,%rax
>     aa02:       41 5c                   pop    %r12
>     aa04:       c3                      ret
> 
> 
> Now, ring_buffer_put(), which uses refcount_dec_and_test():
> 
> refcount_dec_and_test()
> 
>     aa40:       b8 ff ff ff ff          mov    $0xffffffff,%eax
>     aa45:       f0 0f c1 07             lock xadd %eax,(%rdi)
>     aa49:       83 f8 01                cmp    $0x1,%eax
>     aa4c:       74 05                   je     aa53 <ring_buffer_put+0x13>
>     aa4e:       85 c0                   test   %eax,%eax
>     aa50:       7e 1e                   jle    aa70 <ring_buffer_put+0x30>
>     aa52:       c3                      ret
> 
> atomic_dec_and_test():
> 
>     aa40:       f0 ff 0f                lock decl (%rdi)
>     aa43:       75 1d                   jne    aa62 <ring_buffer_put+0x22>
> 
>     ...
> 
>     aa62:       c3                      ret
> 
> Has a larger difference, which is fixable with the below patch, leading
> to:
> 
> 
>     a9f0:       f0 ff 0f                lock decl (%rdi)
>     a9f3:       74 03                   je     a9f8 <ring_buffer_put+0x8>
>     a9f5:       7c 1e                   jl     aa15 <ring_buffer_put+0x25>
>     a9f7:       c3                      ret
> 
> 
> So where exactly is the performance fail? Is it purely the mess made of
> refcount_dec_and_test() ?
> 

For refcount_inc(), as extracted from alloc_perf_context(), I get:

    4b68:       b8 01 00 00 00          mov    $0x1,%eax
    4b6d:       f0 0f c1 43 28          lock xadd %eax,0x28(%rbx)
    4b72:       85 c0                   test   %eax,%eax
    4b74:       74 1b                   je     4b91 <alloc_perf_context+0xf1>
    4b76:       8d 50 01                lea    0x1(%rax),%edx
    4b79:       09 c2                   or     %eax,%edx
    4b7b:       78 20                   js     4b9d <alloc_perf_context+0xfd>

the best I can seem to find is: https://godbolt.org/z/ne5o6eEEW

    4b68:	b8 01 00 00 00       	mov    $0x1,%eax
    4b6d:	f0 0f c1 07          	lock xadd %eax,(%rdi)
    4b71:	83 c0 01             	add    $0x1,%eax
    4b74:	74 16                	je     4b8c <alloc_perf_context+0xec>
    4b76:	78 20                	js     4b98 <alloc_perf_context+0xf8>

per the below updated patch. Still, not really pretty, but loads better
I'd say.


---
 arch/x86/include/asm/refcount.h | 48 +++++++++++++++++++++++++++++++++++++++++
 include/linux/refcount.h        |  6 ++++++
 2 files changed, 54 insertions(+)

diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
new file mode 100644
index 000000000000..b32b4a5e0f37
--- /dev/null
+++ b/arch/x86/include/asm/refcount.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_REFCOUNT_H
+#define _ASM_X86_REFCOUNT_H
+
+#define refcount_dec_and_test refcount_dec_and_test
+static inline bool refcount_dec_and_test(refcount_t *r)
+{
+	asm_volatile_goto (LOCK_PREFIX "decl %[var]\n\t"
+			   "jz %l[cc_zero]\n\t"
+			   "jl %l[cc_error]"
+			   : : [var] "m" (r->refs.counter)
+			   : "memory"
+			   : cc_zero, cc_error);
+	return false;
+
+cc_zero:
+	return true;
+
+cc_error:
+	refcount_warn_saturate(r, REFCOUNT_SUB_UAF);
+	return false;
+}
+
+#define refcount_inc refcount_inc
+static inline void refcount_inc(refcount_t *r)
+{
+	int one = 1;
+
+	asm_volatile_goto (LOCK_PREFIX "xaddl %%eax, %[var]\n\t"
+			   "addl $1, %%eax\n\t"
+			   "je %l[cc_zero]\n\t"
+			   "js %l[cc_error]"
+			   : : [var] "m" (r->refs.counter), "a" (one)
+			   : "memory"
+			   : cc_zero, cc_error);
+
+	return;
+
+cc_zero:
+	refcount_warn_saturate(r, REFCOUNT_ADD_UAF);
+	return;
+
+cc_error:
+	refcount_warn_saturate(r, REFCOUNT_ADD_OVF);
+	return;
+}
+
+#endif
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index b8a6e387f8f9..3ea5757c0b35 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -147,6 +147,8 @@ static inline unsigned int refcount_read(const refcount_t *r)
 	return atomic_read(&r->refs);
 }
 
+#include <asm/refcount.h>
+
 static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp)
 {
 	int old = refcount_read(r);
@@ -262,10 +264,12 @@ static inline void __refcount_inc(refcount_t *r, int *oldp)
  * Will WARN if the refcount is 0, as this represents a possible use-after-free
  * condition.
  */
+#ifndef refcount_inc
 static inline void refcount_inc(refcount_t *r)
 {
 	__refcount_inc(r, NULL);
 }
+#endif
 
 static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp)
 {
@@ -328,10 +332,12 @@ static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp
  *
  * Return: true if the resulting refcount is 0, false otherwise
  */
+#ifndef refcount_dec_and_test
 static inline __must_check bool refcount_dec_and_test(refcount_t *r)
 {
 	return __refcount_dec_and_test(r, NULL);
 }
+#endif
 
 static inline void __refcount_dec(refcount_t *r, int *oldp)
 {
Peter Zijlstra Dec. 7, 2021, 3:51 p.m. UTC | #18
On Tue, Dec 07, 2021 at 02:28:22PM +0100, Peter Zijlstra wrote:
> +static inline void refcount_inc(refcount_t *r)
> +{
> +	int one = 1;
> +
> +	asm_volatile_goto (LOCK_PREFIX "xaddl %%eax, %[var]\n\t"
> +			   "addl $1, %%eax\n\t"
> +			   "je %l[cc_zero]\n\t"
> +			   "js %l[cc_error]"
> +			   : : [var] "m" (r->refs.counter), "a" (one)
> +			   : "memory"
> +			   : cc_zero, cc_error);

+       asm_volatile_goto (LOCK_PREFIX "xaddl %[reg], %[var]\n\t"
+                          "addl $1, %[reg]\n\t"
+                          "jz %l[cc_zero]\n\t"
+                          "js %l[cc_error]"
+                          : : [var] "m" (r->refs.counter), [reg] "r" (1)
+                          : "memory"
+                          : cc_zero, cc_error);

Is of course a better implementation, but I'm not sure I actually
understand this code. Afaict: add $1,%[reg], will only set ZF when
%[reg] was -1 such that the result is now 0.

But that's not what the code said; is this GCC going funny in the head
or should I just stop staring at this...
Linus Torvalds Dec. 7, 2021, 4:03 p.m. UTC | #19
On Tue, Dec 7, 2021 at 1:34 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Now, it could be GCC generates atrociously bad code simply because it
> doesn't know it can use the flags from the XADD in which case we can
> look at doing an arch asm implementation.

That may help.

This thread started because of alleged performance problems with
refcount_t.  No numbers, and maybe it was wrong, but they have been
seen before, so I wouldn't dismiss the issue.

What I've seen personally is the horrendous "multiple different calls
to overflow functions with different arguments", which makes
__refcount_add() do stupid things and blow up the code. All for
entirely pointless debugging code.

                  Linus
Linus Torvalds Dec. 7, 2021, 4:10 p.m. UTC | #20
On Tue, Dec 7, 2021 at 2:30 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Dec 06, 2021 at 04:13:00PM -0800, Linus Torvalds wrote:
>
> > IOW, I think that "try_get_page()" function is basically the *much*
> > superior version of what is currently a broken "refcount_inc()".
>
> That places the burden of the unlikely overflow case on the user. Now
> every driver author needs to consider the overflow case and have a
> corresponding error path. How many bugs will that introduce?

Did you even *look* at the patch that started this discussion?

The patch replaced refcount_t, made for no more complex code.

> Why is saturation; iow. leaking the memory, a worse option than having
> bad/broken/never-tested error paths all over the kernel?

.. which is why I'm fine with refcount_t - for driver reference counts
etc sysfs behavior etc.

What I am *NOT* fine with is somebody then piping up claiming that
refcount_t is "better" than doing it properly by hand, and complaining
about patches that replace it with something else.

See the difference here?

'refcount_t' is fundamentally broken, cannot handle overflows
properly, and is *designed* to do that. You even seem to make excuses
for that very design.

And that "lazy mans overflow protection" is fine if we're talking
random code, and are talking maintainers who doesn't want to deal with
it.

But that is also why I do not EVER want to hear "no, don't convert
away from refcount_t".

Can you really not understand my dislike of a data type that is
fundamentally a lazy shortcut and intentionally hides error cases with
leaks?

Doing it properly is always the better option, and "refcount_t" really
*fundamentally* can never do it properly. It doesn't have the proper
interfaces, and it doesn't return enough information to recover.

I'm not arguing that we have to replace every refcount_t user.

But I *am* arguing that if somebody wants to replace their refcount_t
with something better - whatever the reason - they had better not hear
the mindless whining about it.

                Linus
Linus Torvalds Dec. 7, 2021, 4:13 p.m. UTC | #21
On Tue, Dec 7, 2021 at 5:28 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> +#define refcount_dec_and_test refcount_dec_and_test
> +static inline bool refcount_dec_and_test(refcount_t *r)
> +{
> +       asm_volatile_goto (LOCK_PREFIX "decl %[var]\n\t"
> +                          "jz %l[cc_zero]\n\t"
> +                          "jl %l[cc_error]"
> +                          : : [var] "m" (r->refs.counter)
> +                          : "memory"
> +                          : cc_zero, cc_error);
> +       return false;
> +
> +cc_zero:
> +       return true;
> +
> +cc_error:
> +       refcount_warn_saturate(r, REFCOUNT_SUB_UAF);
> +       return false;
> +}

Please don't add broken arch-specific helpers that are useless for
anything else than the broken refcount_t use.

Make it return three return values, call it atomic_dec_and_test_ref()
or something like that, and now people can use it without having to
use a refcount_t.

Nobody really wants two different error cases anyway. The fact that
refcount_warn_saturate() has different cases is only an annoyance.

             Linus
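
A generic, non-asm fallback for the three-outcome helper being asked for
could look roughly like this. The atomic_dec_and_test_ref() name is taken
from the suggestion above; everything else is an assumption. The outcomes
are: still referenced (false), dropped to zero (true), or already bad, in
which case control jumps to a caller-supplied label, mirroring the
asm_volatile_goto sketch earlier in the thread:

#include <linux/atomic.h>
#include <linux/compiler.h>

#define atomic_dec_and_test_ref(v, err_label)				\
({									\
	int __new = atomic_dec_return(v);				\
	if (unlikely(__new < 0))					\
		goto err_label;	/* underflow: count was already 0 */	\
	__new == 0;		/* true: that was the last reference */	\
})

A caller would write "if (atomic_dec_and_test_ref(&obj->ref, bad)) free_obj(obj);"
with a local bad: label; the _ofl sketches later in the thread spell out the
same convention.
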
Peter Zijlstra Dec. 7, 2021, 4:23 p.m. UTC | #22
On Tue, Dec 07, 2021 at 08:10:02AM -0800, Linus Torvalds wrote:
> Can you really not understand my dislike of a data type that is
> fundamentally a lazy shortcut and intentionally hides error cases with
> leaks?

I see the distinction; I just don't see it as hiding. refcount_t will
very much generate a splat indicating things need fixing; the leak
simply ensures that it doesn't get worse from there on out.

Anyway, if the code already needs to handle that failure case, such as
inc_not_zero() usage, then there is indeed no additional complexity.
Peter Zijlstra Dec. 7, 2021, 4:52 p.m. UTC | #23
On Tue, Dec 07, 2021 at 08:13:38AM -0800, Linus Torvalds wrote:
> On Tue, Dec 7, 2021 at 5:28 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > +#define refcount_dec_and_test refcount_dec_and_test
> > +static inline bool refcount_dec_and_test(refcount_t *r)
> > +{
> > +       asm_volatile_goto (LOCK_PREFIX "decl %[var]\n\t"
> > +                          "jz %l[cc_zero]\n\t"
> > +                          "jl %l[cc_error]"
> > +                          : : [var] "m" (r->refs.counter)
> > +                          : "memory"
> > +                          : cc_zero, cc_error);
> > +       return false;
> > +
> > +cc_zero:
> > +       return true;
> > +
> > +cc_error:
> > +       refcount_warn_saturate(r, REFCOUNT_SUB_UAF);
> > +       return false;
> > +}
> 
> Please don't add broken arch-specific helpers that are useless for
> anything else than the broken refcount_t use.

I take issue with the "broken", but I concede the point.

> Make it return three return values, call it atomic_dec_and_test_ref()
> or something like that, and now people can use it without having to
> use a refcount_t.
> 
> Nobody really wants two different error cases anyway. The fact that
> refcount_warn_saturate() has different cases is only an annoyance.

How about we do something like the unsafe_ uaccess functions and do it
like so?

It's a bit gross, and there seems to be a problem with macro expansion
of __ofl, but it 'works'.

---
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 5e754e895767..921ecfd5a40b 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -263,6 +263,22 @@ static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v)
 }
 #define arch_atomic_fetch_xor arch_atomic_fetch_xor
 
+#define atomic_dec_and_test_ofl(_v, __ofl)				\
+({									\
+	__label__ __zero;						\
+	__label__ __out;						\
+	bool __ret = false;						\
+	asm_volatile_goto (LOCK_PREFIX "decl %[var]\n\t"		\
+			   "jz %l[__zero]\n\t"				\
+			   "jl %l[__ofl]"				\
+			   : : [var] "m" ((_v)->counter)		\
+			   : "memory"					\
+			   : __zero, __ofl);				\
+	goto __out;							\
+__zero:	__ret = true;							\
+__out:	__ret;								\
+})
+
 #ifdef CONFIG_X86_32
 # include <asm/atomic64_32.h>
 #else
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index b8a6e387f8f9..f11ce70de2da 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -330,7 +330,15 @@ static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp
  */
 static inline __must_check bool refcount_dec_and_test(refcount_t *r)
 {
+#ifdef atomic_dec_and_test_ofl
+	return atomic_dec_and_test_ofl(&r->refs, __ofl);
+
+__ofl:
+	refcount_warn_saturate(r, REFCOUNT_SUB_UAF);
+	return false;
+#else
 	return __refcount_dec_and_test(r, NULL);
+#endif
 }
 
 static inline void __refcount_dec(refcount_t *r, int *oldp)
Peter Zijlstra Dec. 7, 2021, 5:41 p.m. UTC | #24
On Tue, Dec 07, 2021 at 05:52:13PM +0100, Peter Zijlstra wrote:
> It's a bit gross, and there seems to be a problem with macro expansion
> of __ofl, but it 'works'.
> 
> ---
> diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
> index 5e754e895767..921ecfd5a40b 100644
> --- a/arch/x86/include/asm/atomic.h
> +++ b/arch/x86/include/asm/atomic.h
> @@ -263,6 +263,22 @@ static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v)
>  }
>  #define arch_atomic_fetch_xor arch_atomic_fetch_xor
>  
> +#define atomic_dec_and_test_ofl(_v, __ofl)				\
> +({									\
> +	__label__ __zero;						\
> +	__label__ __out;						\
> +	bool __ret = false;						\
> +	asm_volatile_goto (LOCK_PREFIX "decl %[var]\n\t"		\
> +			   "jz %l[__zero]\n\t"				\
> +			   "jl %l[__ofl]"				\
				%l2

and it works much better...

> +			   : : [var] "m" ((_v)->counter)		\
> +			   : "memory"					\
> +			   : __zero, __ofl);				\
> +	goto __out;							\
> +__zero:	__ret = true;							\
> +__out:	__ret;								\
> +})
Linus Torvalds Dec. 7, 2021, 5:43 p.m. UTC | #25
On Tue, Dec 7, 2021 at 8:52 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> How about we do something like the unsafe_ uaccess functions and do it
> like so?

Look ok by me, but I'd suggest simply making both error cases labels
in that case.

If somebody wants to distinguish them, it's easy to do, and if not you
can just use the same label.

Yes, it's a bit unusual, but once you start using labels for the
exceptional cases, why not do so consistently?

In the case of "dec_and_test" the "decrement to zero" case may not be
hugely exceptional, but if you do the same for "increment with
overflow protection" you do end up having the two different "zero vs
too big", so it would actually be more consistent, I think..

                     Linus
Linus Torvalds Dec. 7, 2021, 5:45 p.m. UTC | #26
On Tue, Dec 7, 2021 at 9:43 AM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> In the case of "dec_and_test" the "decrement to zero" case may not be
> hugely exceptional, but if you do the same for "increment with
> overflow protection" you do end up having the two different "zero vs
> too big", so it would actually be more consistent, I think..

Hmm.. Of course, that "increment with overflow protection" might not
actually want two targets in the first place, so maybe this argument
is BS.

Particularly if you can do the "zero or overflow" case with just one
test, maybe you really do want both cases to be one single error case.

And then your "refcount_dec_and_test()" with a return value (for
decrement to zero) and a separate error case (for errors) looks fine.

               Linus
Peter Zijlstra Dec. 7, 2021, 8:28 p.m. UTC | #27
On Tue, Dec 07, 2021 at 02:28:22PM +0100, Peter Zijlstra wrote:

> For refcount_inc(), as extracted from alloc_perf_context(), I get:
> 
>     4b68:       b8 01 00 00 00          mov    $0x1,%eax
>     4b6d:       f0 0f c1 43 28          lock xadd %eax,0x28(%rbx)
>     4b72:       85 c0                   test   %eax,%eax
>     4b74:       74 1b                   je     4b91 <alloc_perf_context+0xf1>
>     4b76:       8d 50 01                lea    0x1(%rax),%edx
>     4b79:       09 c2                   or     %eax,%edx
>     4b7b:       78 20                   js     4b9d <alloc_perf_context+0xfd>
> 
> the best I can seem to find is: https://godbolt.org/z/ne5o6eEEW

Argh.. __atomic_add_fetch() != __atomic_fetch_add(); much confusion for
GCC having both. With the right primitive it becomes:

        movl    $1, %eax
        lock xaddl      %eax, (%rdi)
        testl   %eax, %eax
        je      .L5
        js      .L6

Which makes a whole lot more sense.
Linus Torvalds Dec. 7, 2021, 11:23 p.m. UTC | #28
On Tue, Dec 7, 2021 at 12:28 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Argh.. __atomic_add_fetch() != __atomic_fetch_add(); much confusion for
> GCC having both. With the right primitive it becomes:
>
>         movl    $1, %eax
>         lock xaddl      %eax, (%rdi)
>         testl   %eax, %eax
>         je      .L5
>         js      .L6
>
> Which makes a whole lot more sense.

Note that the above misses the case where the old value was MAX_INT
and the result now became negative.

That isn't a _problem_, of course. I think it's fine. But if you cared
about it, you'd have to do something like

>         movl    $1, %eax
>         lock xaddl      %eax, (%rdi)
>         jl      .L6
>         testl   %eax, %eax
>         je      .L5

instead (I might have gotten that "jl" wrong, needs more testing).

But if you don't care about the MAX_INT overflow and make the overflow
boundary be the next increment, then just make it be one error case:

>         movl    $1, %eax
>         lock xaddl      %eax, (%rdi)
>         testl   %eax, %eax
>         jle      .L5

and then (if you absolutely have to distinguish them) you can test eax
again in the slow path.

                     Linus
Peter Zijlstra Dec. 8, 2021, 5:07 p.m. UTC | #29
On Tue, Dec 07, 2021 at 03:23:02PM -0800, Linus Torvalds wrote:
> On Tue, Dec 7, 2021 at 12:28 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Argh.. __atomic_add_fetch() != __atomic_fetch_add(); much confusion for
> > GCC having both. With the right primitive it becomes:
> >
> >         movl    $1, %eax
> >         lock xaddl      %eax, (%rdi)
> >         testl   %eax, %eax
> >         je      .L5
> >         js      .L6
> >
> > Which makes a whole lot more sense.
> 
> Note that the above misses the case where the old value was MAX_INT
> and the result now became negative.
> 
> That isn't a _problem_, of course. I think it's fine. But if you cared
> about it, you'd have to do something like

Hm....

> But if you don't care about the MAX_INT overflow and make the overflow
> boundary be the next increment, then just make it be one error case:
> 
> >         movl    $1, %eax
> >         lock xaddl      %eax, (%rdi)
> >         testl   %eax, %eax
> >         jle      .L5
> 
> and then (if you absolutely have to distinguish them) you can test eax
> again in the slow path.

Suppose:

  inc(): overflow when old value is negative or zero
  dec(): overflow when new value is negative or zero

That gives:

  inc(INT_MAX) is allowed
  dec(INT_MIN) is allowed

IOW, the effective range becomes: [1..INT_MIN], which is a bit
counter-intuitive, but then so is most of this stuff.

Therefore can write this like:

#define atomic_inc_ofl(v, label)
do {
	int old = atomic_fetch_inc(v);
	if (unlikely(old <= 0))
		goto label;
} while (0)

#define atomic_dec_ofl(v, label)
do {
	int new = atomic_dec_return(v);
	if (unlikely(new <= 0))
		goto label;
} while (0)

#define atomic_dec_and_test_ofl(v, label)
({
	bool ret = false;
	int new = atomic_dec_return(v);
	if (unlikely(new < 0))
		goto label;
	if (unlikely(new == 0))
		ret = true;
	ret;
})

For a consistent set of primitives, right?

Which already gives better code-gen than we have today.

But that then also means we can write dec_ofl as:

	lock decl %[var]
	jle %l1

and dec_and_test_ofl() like:

	lock decl %[var]
	jl %l2
	je %l[__zero]

Lemme finish the patches and send that out after dinner.
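
Purely to illustrate how callers of the _ofl sketches above would look
(struct foo, foo_free() and the label names are invented), the error label
takes the place of refcount_t's internal saturation, so the pathological case
jumps wherever the caller decides:

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/slab.h>

struct foo {
	atomic_t ref;
	/* ... payload ... */
};

static void foo_free(struct foo *f)
{
	kfree(f);
}

static void foo_get(struct foo *f)
{
	atomic_inc_ofl(&f->ref, corrupted);
	return;
corrupted:
	WARN_ONCE(1, "foo: refcount over/underflow on get");
}

static void foo_put(struct foo *f)
{
	if (atomic_dec_and_test_ofl(&f->ref, corrupted))
		foo_free(f);
	return;
corrupted:
	WARN_ONCE(1, "foo: refcount underflow on put");
}
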
Linus Torvalds Dec. 8, 2021, 6 p.m. UTC | #30
On Wed, Dec 8, 2021 at 9:07 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> IOW, the effective range becomes: [1..INT_MIN], which is a bit
> counter-intuitive, but then so is most of this stuff.

I'd suggest not codifying it too strictly, because the exact range at
the upper end might depend on what is convenient for an architecture
to do.

For x86, 'xadd' has odd semantics in that the flags register is about
the *new* state, but the returned value is about the *old* state.

That means that on x86, some things are cheaper to test based on the
pre-inc/dec values, and other things are cheaper to test based on the
post-inc/dec ones.

It's also why for "page->_mapcount" we have the "free" value being -1,
not 0, and the refcount is "off by one". It makes the special cases of
"increment from zero" and "decrement to zero" be very easy and
straightforward to test for.

That might be an option for an "atomic_ref" type - with our existing
"page_mapcount()" code being the thing we'd convert first, and make be
the example for it.

I think it should also make the error cases be very easy to check for
without extra tests. If you make "decrement from zero" be the "ok, now
it's free", then that shows in the carry flag. But otherwise, if SF or
OF is set, it's an error.  That means we can use the regular atomics
and flags (although not "dec" and "inc", since we'd care about CF).

So on x86, I think "atomic_dec_ref()" could be

        lock subl $1,ptr
        jc now_its_free
        jl this_is_an_error

if we end up having that "off by one" model.

And importantly, "atomic_inc_ref()" would be just

        lock incl ptr
        jle this_is_an_error

and this avoids us having to have the value in a register and test it
separately.

So your suggestion is _close_, but note how you can't do the "inc_ofl"
without that "off-by-one" model.

And again - I might have gotten the exact flag test instructions
wrong. That's what you get for not actually doing serious assembly
language for a couple of decades.

            Linus
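
Setting the flag-based code generation aside, the value convention being
described (the page->_mapcount model) can be pinned down in portable C along
these lines. The names are made up, and the real mm code additionally hides
the bias behind page_mapcount() returning _mapcount + 1:

#include <linux/atomic.h>
#include <linux/bug.h>

struct thing {
	atomic_t ref;		/* stores "references - 1": -1 means free */
};

static void thing_init(struct thing *t)
{
	atomic_set(&t->ref, 0);			/* born with one reference */
}

static void thing_get(struct thing *t)
{
	/* A new value <= 0 means the old value was negative: get-from-free. */
	WARN_ON_ONCE(atomic_inc_return(&t->ref) <= 0);
}

static bool thing_put(struct thing *t)
{
	int new = atomic_dec_return(&t->ref);

	WARN_ON_ONCE(new < -1);		/* dropped a reference we never had */
	return new == -1;		/* "decrement from zero": now free */
}

With that bias, the asm above can derive all three outcomes (still referenced,
now free, error) straight from the flags of a single locked sub or inc.
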
Peter Zijlstra Dec. 8, 2021, 6:44 p.m. UTC | #31
On Wed, Dec 08, 2021 at 10:00:04AM -0800, Linus Torvalds wrote:
> On Wed, Dec 8, 2021 at 9:07 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > IOW, the effective range becomes: [1..INT_MIN], which is a bit
> > counter-intuitive, but then so is most of this stuff.
> 
> I'd suggest not codifying it too strictly, because the exact range at
> the upper end might depend on what is convenient for an architecture
> to do.
> 
> For x86, 'xadd' has odd semantics in that the flags register is about
> the *new* state, but the returned value is about the *old* state.

From testing xadd had different flags from add; I've not yet looked at
the SDM to see what it said on the matter.

> That means that on x86, some things are cheaper to test based on the
> pre-inc/dec values, and other things are cheaper to test based on the
> post-inc/dec ones.
> 
> It's also why for "page->_mapcount" we have the "free" value being -1,
> not 0, and the refcount is "off by one". It makes the special cases of
> "increment from zero" and "decrement to zero" be very easy and
> straightforward to test for.
> 
> That might be an option for an "atomic_ref" type - with our existing
> "page_mapcount()" code being the thing we'd convert first, and make be
> the example for it.
> 
> I think it should also make the error cases be very easy to check for
> without extra tests. If you make "decrement from zero" be the "ok, now
> it's free", then that shows in the carry flag. But otherwise, if SF or
> OF is set, it's an error.  That means we can use the regular atomics
> and flags (although not "dec" and "inc", since we'd care about CF).
> 
> So on x86, I think "atomic_dec_ref()" could be
> 
>         lock subl $1,ptr
>         jc now_its_free
>         jl this_is_an_error
> 
> if we end up having that "off by one" model.
> 
> And importantly, "atomic_inc_ref()" would be just
> 
>         lock incl ptr
>         jle this_is_an_error
> 
> and this avoids us having to have the value in a register and test it
> separately.
> 
> So your suggestion is _close_, but note how you can't do the "inc_ofl"
> without that "off-by-one" model.
> 
> And again - I might have gotten the exact flag test instructions
> wrong. That's what you get for not actually doing serious assembly
> language for a couple of decades.

Yeah; I don't have it all in-cache either; I'll go through it tomorrow
or something to see what I can make of it.

Meanwhile I did send out what I had.
Linus Torvalds Dec. 8, 2021, 6:50 p.m. UTC | #32
On Wed, Dec 8, 2021 at 10:44 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> From testing xadd had different flags from add; I've not yet looked at
> the SDM to see what it said on the matter.

That should not be the case. Just checked, and it just says

  "The CF, PF, AF, SF, ZF, and OF flags are set according to the
result of the addition, which is stored in the destination operand"

which shows that I was confused about 'xadd' - I thought it returned
the old value in the register ("fetch_add"). It doesn't. It returns
the new one ("add_fetch"). And then 'fetch_add' ends up undoing it by
doing a sub or whatever.

So the actual returned value and the flags should match on x86.

Other architectures have the "return old value" model, which does mean
that my "different architectures can have different preferences for
which one to test" argument was right, even if I got xadd wrong.

               Linus
Peter Zijlstra Dec. 8, 2021, 8:32 p.m. UTC | #33
On Wed, Dec 08, 2021 at 10:50:10AM -0800, Linus Torvalds wrote:
> On Wed, Dec 8, 2021 at 10:44 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > From testing xadd had different flags from add; I've not yet looked at
> > the SDM to see what it said on the matter.
> 
> That should not be the case. Just checked, and it just says
> 
>   "The CF, PF, AF, SF, ZF, and OF flags are set according to the
> result of the addition, which is stored in the destination operand"

I got the constraints wrong :/ They match now.
Peter Zijlstra Dec. 10, 2021, 10:57 a.m. UTC | #34
On Wed, Dec 08, 2021 at 10:50:10AM -0800, Linus Torvalds wrote:
> On Wed, Dec 8, 2021 at 10:44 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > From testing xadd had different flags from add; I've not yet looked at
> > the SDM to see what it said on the matter.
> 
> That should not be the case. Just checked, and it just says
> 
>   "The CF, PF, AF, SF, ZF, and OF flags are set according to the
> result of the addition, which is stored in the destination operand"
> 
> which shows that I was confused about 'xadd' - I thought it returned
> the old value in the register ("fetch_add"). It doesn't. It returns
> the new one ("add_fetch"). And then 'fetch_add' ends up undoing it by
> doing a sub or whatever.
> 
> So the actual returned value and the flags should match on x86.
> 
> Other architectures have the "return old value" model, which does mean
> that my "different architectures can have different preferences for
> which one to test" argument was right, even if I got xadd wrong.

I think XADD does return old too; SDM states:

"Exchanges the first operand (destination operand) with the second
operand (source operand), then loads the sum of the two values into the
destination operand. The destination operand can be a register or a
memory location; the source operand is a register."

So it first exchanges and then adds. Which is why the flags are set for
add, not exchange.
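
A tiny illustrative probe (x86-64 gcc/clang only, invented helper name)
showing both halves of that: the register operand comes back holding the
*old* value, while the flags describe the sum:

/* illustrative sketch, not kernel code */
#include <stdbool.h>
#include <stdio.h>

static int xadd_probe(int *mem, int add, bool *zf)
{
	unsigned char z;

	asm volatile("lock xaddl %[add], %[mem]\n\t"
		     "setz %[z]"
		     : [add] "+r" (add), [mem] "+m" (*mem), [z] "=q" (z)
		     : : "cc");
	*zf = z;
	return add;		/* xadd left the old value of *mem here */
}

int main(void)
{
	int v = -1;
	bool zf;
	int old = xadd_probe(&v, 1, &zf);

	/* prints old=-1 new=0 ZF=1 */
	printf("old=%d new=%d ZF=%d\n", old, v, zf);
	return 0;
}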
Peter Zijlstra Dec. 10, 2021, 12:38 p.m. UTC | #35
On Wed, Dec 08, 2021 at 10:00:04AM -0800, Linus Torvalds wrote:

> It's also why for "page->_mapcount" we have the "free" value being -1,
> not 0, and the refcount is "off by one". It makes the special cases of
> "increment from zero" and "decrement to zero" be very easy and
> straightforward to test for.
> 
> That might be an option for an "atomic_ref" type - with our existing
> "page_mapcount()" code being the thing we'd convert first, and make be
> the example for it.
> 
> I think it should also make the error cases be very easy to check for
> without extra tests. If you make "decrement from zero" be the "ok, now
> it's free", then that shows in the carry flag. But otherwise, if SF or
> OF is set, it's an error.  That means we can use the regular atomics
> and flags (although not "dec" and "inc", since we'd care about CF).
> 
> So on x86, I think "atomic_dec_ref()" could be
> 
>         lock subl $1,ptr
>         jc now_its_free
>         jl this_is_an_error
> 
> if we end up having that "off by one" model.
> 
> And importantly, "atomic_inc_ref()" would be just
> 
>         lock incl ptr
>         jle this_is_an_error
> 
> and this avoids us having to have the value in a register and test it
> separately.
> 
> So your suggestion is _close_, but note how you can't do the "inc_ofl"
> without that "off-by-one" model.
> 
> And again - I might have gotten the exact flag test instructions
> wrong. That's what you get for not actually doing serious assembly
> language for a couple of decades.


add(         -3):       CF=0 PF=0 AF=0 ZF=0 SF=1 ... OF=0       | sub(         -3):     CF=0 PF=1 AF=0 ZF=0 SF=1 ... OF=0
add(         -2):       CF=0 PF=1 AF=0 ZF=0 SF=1 ... OF=0       | sub(         -2):     CF=0 PF=0 AF=0 ZF=0 SF=1 ... OF=0
add(         -1):       CF=1 PF=1 AF=1 ZF=1 SF=0 ... OF=0       | sub(         -1):     CF=0 PF=0 AF=0 ZF=0 SF=1 ... OF=0
add(          0):       CF=0 PF=0 AF=0 ZF=0 SF=0 ... OF=0       | sub(          0):     CF=1 PF=1 AF=1 ZF=0 SF=1 ... OF=0
add(          1):       CF=0 PF=0 AF=0 ZF=0 SF=0 ... OF=0       | sub(          1):     CF=0 PF=1 AF=0 ZF=1 SF=0 ... OF=0
add(          2):       CF=0 PF=1 AF=0 ZF=0 SF=0 ... OF=0       | sub(          2):     CF=0 PF=0 AF=0 ZF=0 SF=0 ... OF=0
add(          3):       CF=0 PF=0 AF=0 ZF=0 SF=0 ... OF=0       | sub(          3):     CF=0 PF=0 AF=0 ZF=0 SF=0 ... OF=0
               :                                                |               :
add( 2147483645):       CF=0 PF=0 AF=0 ZF=0 SF=0 ... OF=0       | sub( 2147483645):     CF=0 PF=1 AF=0 ZF=0 SF=0 ... OF=0
add( 2147483646):       CF=0 PF=1 AF=0 ZF=0 SF=0 ... OF=0       | sub( 2147483646):     CF=0 PF=0 AF=0 ZF=0 SF=0 ... OF=0
add( 2147483647):       CF=0 PF=1 AF=1 ZF=0 SF=1 ... OF=1       | sub( 2147483647):     CF=0 PF=0 AF=0 ZF=0 SF=0 ... OF=0
add(-2147483648):       CF=0 PF=0 AF=0 ZF=0 SF=1 ... OF=0       | sub(-2147483648):     CF=0 PF=1 AF=1 ZF=0 SF=0 ... OF=1
add(-2147483647):       CF=0 PF=0 AF=0 ZF=0 SF=1 ... OF=0       | sub(-2147483647):     CF=0 PF=1 AF=0 ZF=0 SF=1 ... OF=0
add(-2147483646):       CF=0 PF=1 AF=0 ZF=0 SF=1 ... OF=0       | sub(-2147483646):     CF=0 PF=0 AF=0 ZF=0 SF=1 ... OF=0
add(-2147483645):       CF=0 PF=0 AF=0 ZF=0 SF=1 ... OF=0       | sub(-2147483645):     CF=0 PF=0 AF=0 ZF=0 SF=1 ... OF=0
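
The CF/ZF/SF/OF columns of a table like the one above can be reproduced
with a small userspace probe along these lines (illustrative only, x86-64
gcc/clang, PF/AF left out):

/* illustrative sketch, not from the thread */
#include <limits.h>
#include <stdio.h>

struct flags { unsigned char cf, zf, sf, of; };

#define PROBE(name, insn)						\
static struct flags name(int v)						\
{									\
	struct flags f;							\
	asm(insn " $1, %[val]\n\t"					\
	    "setc %[cf]\n\tsetz %[zf]\n\tsets %[sf]\n\tseto %[of]"	\
	    : [val] "+r" (v), [cf] "=q" (f.cf), [zf] "=q" (f.zf),	\
	      [sf] "=q" (f.sf), [of] "=q" (f.of)			\
	    : : "cc");							\
	return f;							\
}

PROBE(add1, "addl")
PROBE(sub1, "subl")

int main(void)
{
	int vals[] = { -3, -2, -1, 0, 1, 2, 3, INT_MAX, INT_MIN };

	for (unsigned i = 0; i < sizeof(vals) / sizeof(vals[0]); i++) {
		struct flags a = add1(vals[i]), s = sub1(vals[i]);

		printf("add(%11d): CF=%d ZF=%d SF=%d OF=%d | "
		       "sub(%11d): CF=%d ZF=%d SF=%d OF=%d\n",
		       vals[i], a.cf, a.zf, a.sf, a.of,
		       vals[i], s.cf, s.zf, s.sf, s.of);
	}
	return 0;
}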

So:

e := ZF
l := SF != OF

inc()						inc()
	lock inc %[var]                     		mov       $-1, %[reg]
	jle	error-zero-or-negative          	lock xadd %[reg], %[var]
                                                	test      %[reg], %[reg]
                                                	jle	  error-zero-or-negative
                                                
dec()                                           dec()
	lock sub $1, %[var]                     	lock dec %[var]
	jc	error-to-zero                   	jle	error-zero-or-negative
	jl	error-from-negative             
                                                
dec_and_test()                                  dec_and_test()
	lock sub $1, %[var]                     	lock dec %[var]
	jc	do-free                         	jl	error-from-negative
	jl	error-from-negative             	je	do-free


Should work I suppose, and gives [-1, INT_MIN] as operating range. It
adds a single branch instruction (which should be default predicted
not-taken due to being a forward jump IIRC) but makes inc a lot smaller.
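
For completeness, the left-hand dec_and_test() can be wrapped the same way
as the earlier dec_and_test_ofl() sketch, assuming the off-by-one storage
(stored value = logical count - 1).  Again purely illustrative, with
invented names, x86-64 gcc/clang only:

/* illustrative sketch, not kernel code */
#include <stdbool.h>
#include <stdio.h>

typedef struct { int counter; } atomic_t;

static inline bool ref_dec_and_test_ob1(atomic_t *v, bool *underflow)
{
	/* CF flags the crossing to "free", SF!=OF flags an underflow */
	asm goto("lock subl $1, %[var]\n\t"
		 "jc %l[zero]\n\t"
		 "jl %l[neg]"
		 : : [var] "m" (v->counter)
		 : "cc", "memory"
		 : zero, neg);
	return false;			/* still referenced */
zero:
	return true;			/* logical count hit zero, free it */
neg:
	*underflow = true;		/* object was already free */
	return false;
}

int main(void)
{
	atomic_t ref = { .counter = 1 };	/* logical count 2 */
	bool bug = false;

	printf("%d\n", ref_dec_and_test_ob1(&ref, &bug));	/* 0: 2 -> 1 */
	printf("%d\n", ref_dec_and_test_ob1(&ref, &bug));	/* 1: free it */
	printf("%d\n", ref_dec_and_test_ob1(&ref, &bug));	/* 0: underflow */
	printf("underflow: %d\n", bug);				/* 1 */
	return 0;
}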


Except I've no sane idea how to make it work with the rest of
refcount_t. The best I can seem to come up with is something like:

#define ATOMIC_OFL_OFFSET	1

static inline int refcount_read(const refcount_t *r)
{
	return atomic_read(&r->refs) + ATOMIC_OFL_OFFSET;
}

static inline void refcount_set(refcount_t *r, int n)
{
	atomic_set(&r->refs, n - ATOMIC_OFL_OFFSET);
}

static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp)
{
	int old = atomic_read(&r->refs);

	do {
		if (old == -ATOMIC_OFL_OFFSET)
			break;
	} while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i));

	old += ATOMIC_OFL_OFFSET;

	if (oldp)
		*oldp = old;

	if (unlikely(old < 0 || (i > 1 && old + i < 0)))
		refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF);

	return old;
}

static inline void __refcount_add(int i, refcount_t *r, int *oldp)
{
	int old = atomic_fetch_add_relaxed(i, &r->refs) + ATOMIC_OFL_OFFSET;

	if (oldp)
		*oldp = old;

	if (unlikely(!old))
		refcount_warn_saturate(r, REFCOUNT_ADD_UAF);
	if (unlikely(old < 0 || old + i < 0))
		refcount_warn_saturate(r, REFCOUNT_ADD_OVF);
}

And have the generic code have: ATOMIC_OFL_OFFSET == 0.
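
Purely as an illustration of that offset convention (invented names, C11
atomics instead of the kernel atomic API), the set/read/inc_not_zero trio
behaves like this:

/* illustrative sketch, not kernel code */
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

#define OFL_OFFSET	1

typedef struct { atomic_int refs; } ref_t;

static void ref_set(ref_t *r, int n)
{
	atomic_store_explicit(&r->refs, n - OFL_OFFSET, memory_order_relaxed);
}

static int ref_read(ref_t *r)
{
	return atomic_load_explicit(&r->refs, memory_order_relaxed) + OFL_OFFSET;
}

static bool ref_inc_not_zero(ref_t *r)
{
	int old = atomic_load_explicit(&r->refs, memory_order_relaxed);

	do {
		if (old == -OFL_OFFSET)		/* logical count is 0: dead */
			return false;
	} while (!atomic_compare_exchange_weak_explicit(&r->refs, &old, old + 1,
							memory_order_relaxed,
							memory_order_relaxed));
	return true;
}

int main(void)
{
	ref_t r;

	ref_set(&r, 1);			/* stored as 0 */
	assert(ref_read(&r) == 1);
	assert(ref_inc_not_zero(&r));	/* logical 1 -> 2 */
	assert(ref_read(&r) == 2);
	ref_set(&r, 0);			/* stored as -1: the "free" state */
	assert(!ref_inc_not_zero(&r));
	return 0;
}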

Do we *really* want to do that?

With the above, __refcount_add_not_zero(), for the common case of @i=1,
@oldp=NULL, we get:

    a8f7:       41 8b 04 24             mov    (%r12),%eax
    a8fb:       83 f8 ff                cmp    $0xffffffff,%eax
    a8fe:       74 1a                   je     a91a <ring_buffer_get+0x3a>
    a900:       8d 50 01                lea    0x1(%rax),%edx
    a903:       f0 41 0f b1 14 24       lock cmpxchg %edx,(%r12)
    a909:       75 f0                   jne    a8fb <ring_buffer_get+0x1b>
    a90b:       85 d2                   test   %edx,%edx
    a90d:       78 19                   js     a928 <ring_buffer_get+0x48>

Which is actually really nice, because i == ATOMIC_OFL_OFFSET means the
old+i already computed for the cmpxchg doubles as the offset-adjusted
value for the final sign test, so no extra arithmetic is needed.

Anybody? For now I think I'll drop the documentation patch and do this
scheme as the last patch in the series for v2.

Also, Mark suggested I rename the new primitives to:
atomic_*_overflow().
diff mbox series

Patch

diff --git a/block/blk-flush.c b/block/blk-flush.c
index f78bb39e589e..e4df894189ce 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -229,7 +229,7 @@  static void flush_end_io(struct request *flush_rq, blk_status_t error)
 	/* release the tag's ownership to the req cloned from */
 	spin_lock_irqsave(&fq->mq_flush_lock, flags);
 
-	if (!refcount_dec_and_test(&flush_rq->ref)) {
+	if (!req_ref_put_and_test(flush_rq)) {
 		fq->rq_status = error;
 		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
 		return;
@@ -349,7 +349,7 @@  static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	 * and READ flush_rq->end_io
 	 */
 	smp_wmb();
-	refcount_set(&flush_rq->ref, 1);
+	req_ref_set(flush_rq, 1);
 
 	blk_flush_queue_rq(flush_rq, false);
 }
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 995336abee33..380e2dd31bfc 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -228,7 +228,7 @@  static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
 
 	spin_lock_irqsave(&tags->lock, flags);
 	rq = tags->rqs[bitnr];
-	if (!rq || rq->tag != bitnr || !refcount_inc_not_zero(&rq->ref))
+	if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq))
 		rq = NULL;
 	spin_unlock_irqrestore(&tags->lock, flags);
 	return rq;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index fa464a3e2f9a..22ec21aa0c22 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -386,7 +386,7 @@  static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	INIT_LIST_HEAD(&rq->queuelist);
 	/* tag was already set */
 	WRITE_ONCE(rq->deadline, 0);
-	refcount_set(&rq->ref, 1);
+	req_ref_set(rq, 1);
 
 	if (rq->rq_flags & RQF_ELV) {
 		struct elevator_queue *e = data->q->elevator;
@@ -634,7 +634,7 @@  void blk_mq_free_request(struct request *rq)
 	rq_qos_done(q, rq);
 
 	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
-	if (refcount_dec_and_test(&rq->ref))
+	if (req_ref_put_and_test(rq))
 		__blk_mq_free_request(rq);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
@@ -930,7 +930,7 @@  void blk_mq_end_request_batch(struct io_comp_batch *iob)
 		rq_qos_done(rq->q, rq);
 
 		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
-		if (!refcount_dec_and_test(&rq->ref))
+		if (!req_ref_put_and_test(rq))
 			continue;
 
 		blk_crypto_free_request(rq);
@@ -1373,7 +1373,7 @@  void blk_mq_put_rq_ref(struct request *rq)
 {
 	if (is_flush_rq(rq))
 		rq->end_io(rq, 0);
-	else if (refcount_dec_and_test(&rq->ref))
+	else if (req_ref_put_and_test(rq))
 		__blk_mq_free_request(rq);
 }
 
@@ -3003,7 +3003,7 @@  static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
 			unsigned long rq_addr = (unsigned long)rq;
 
 			if (rq_addr >= start && rq_addr < end) {
-				WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
+				WARN_ON_ONCE(req_ref_read(rq) != 0);
 				cmpxchg(&drv_tags->rqs[i], rq, NULL);
 			}
 		}
@@ -3337,7 +3337,7 @@  static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
 	if (!tags)
 		return;
 
-	WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);
+	WARN_ON_ONCE(req_ref_read(flush_rq) != 0);
 
 	for (i = 0; i < queue_depth; i++)
 		cmpxchg(&tags->rqs[i], flush_rq, NULL);
diff --git a/block/blk.h b/block/blk.h
index 296411900c55..f869f4b2dec9 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -473,4 +473,35 @@  static inline bool should_fail_request(struct block_device *part,
 }
 #endif /* CONFIG_FAIL_MAKE_REQUEST */
 
+/*
+ * Optimized request reference counting. Ideally we'd make timeouts be more
+ * clever, as that's the only reason we need references at all... But until
+ * this happens, this is faster than using refcount_t. Also see:
+ *
+ * abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count")
+ */
+#define req_ref_zero_or_close_to_overflow(req)	\
+	((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u)
+
+static inline bool req_ref_inc_not_zero(struct request *req)
+{
+	return atomic_inc_not_zero(&req->ref);
+}
+
+static inline bool req_ref_put_and_test(struct request *req)
+{
+	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
+	return atomic_dec_and_test(&req->ref);
+}
+
+static inline void req_ref_set(struct request *req, int value)
+{
+	atomic_set(&req->ref, value);
+}
+
+static inline int req_ref_read(struct request *req)
+{
+	return atomic_read(&req->ref);
+}
+
 #endif /* BLK_INTERNAL_H */
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index bfc3cc61f653..ecdc049b52fa 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -138,7 +138,7 @@  struct request {
 	unsigned short ioprio;
 
 	enum mq_rq_state state;
-	refcount_t ref;
+	atomic_t ref;
 
 	unsigned long deadline;