[RFC,3/5] asm-generic/bitops/atomic.h: Rewrite using atomic_fetch_*

Message ID 1518708575-12284-4-git-send-email-will.deacon@arm.com (mailing list archive)
State New, archived

Commit Message

Will Deacon Feb. 15, 2018, 3:29 p.m. UTC
The atomic bitops can actually be implemented pretty efficiently using
the atomic_fetch_* ops, rather than explicit use of spinlocks.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 include/asm-generic/bitops/atomic.h | 219 ++++++++++++------------------------
 1 file changed, 71 insertions(+), 148 deletions(-)

Comments

Peter Zijlstra Feb. 15, 2018, 5:08 p.m. UTC | #1
On Thu, Feb 15, 2018 at 03:29:33PM +0000, Will Deacon wrote:
> +static inline void set_bit(unsigned int nr, volatile unsigned long *p)
> +{
> +	p += BIT_WORD(nr);
> +	atomic_long_fetch_or_relaxed(BIT_MASK(nr), (atomic_long_t *)p);
> +}
>  
> +static inline void clear_bit(unsigned int nr, volatile unsigned long *p)
> +{
> +	p += BIT_WORD(nr);
> +	atomic_long_fetch_andnot_relaxed(BIT_MASK(nr), (atomic_long_t *)p);
> +}
>  
> +static inline void change_bit(unsigned int nr, volatile unsigned long *p)
> +{
> +	p += BIT_WORD(nr);
> +	atomic_long_fetch_xor_relaxed(BIT_MASK(nr), (atomic_long_t *)p);
> +}
>  
> +static inline int test_and_set_bit(unsigned int nr, volatile unsigned long *p)
>  {
> +	long old;
>  	unsigned long mask = BIT_MASK(nr);
>  
> +	p += BIT_WORD(nr);
> +	if (READ_ONCE(*p) & mask)
> +		return 1;
> +
> +	old = atomic_long_fetch_or(mask, (atomic_long_t *)p);
> +	return !!(old & mask);
>  }
>  
> +static inline int test_and_clear_bit(unsigned int nr, volatile unsigned long *p)
>  {
> +	long old;
>  	unsigned long mask = BIT_MASK(nr);
>  
> +	p += BIT_WORD(nr);
> +	if (!(READ_ONCE(*p) & mask))
> +		return 0;
> +
> +	old = atomic_long_fetch_andnot(mask, (atomic_long_t *)p);
> +	return !!(old & mask);
>  }
>  
> +static inline int test_and_change_bit(unsigned int nr, volatile unsigned long *p)
>  {
> +	long old;
>  	unsigned long mask = BIT_MASK(nr);
>  
> +	p += BIT_WORD(nr);
> +	old = atomic_long_fetch_xor(mask, (atomic_long_t *)p);
> +	return !!(old & mask);
>  }
>  
> +static inline int test_and_set_bit_lock(unsigned int nr,
> +					volatile unsigned long *p)
>  {
> +	long old;
>  	unsigned long mask = BIT_MASK(nr);
>  
> +	p += BIT_WORD(nr);
> +	if (READ_ONCE(*p) & mask)
> +		return 1;
>  
> +	old = atomic_long_fetch_or_acquire(mask, (atomic_long_t *)p);
> +	return !!(old & mask);
>  }
>  
> +static inline void clear_bit_unlock(unsigned int nr, volatile unsigned long *p)
>  {
> +	p += BIT_WORD(nr);
> +	atomic_long_fetch_andnot_release(BIT_MASK(nr), (atomic_long_t *)p);
> +}
>  
> +static inline void __clear_bit_unlock(unsigned int nr,
> +				      volatile unsigned long *p)
> +{
> +	unsigned long old;
>  
> +	p += BIT_WORD(nr);
> +	old = READ_ONCE(*p);
> +	old &= ~BIT_MASK(nr);
> +	smp_store_release(p, old);

This should be atomic_set_release() I think, for the special case where
atomics are implemented with spinlocks, see the 'fun' case in
Documentation/atomic_t.txt.

>  }
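
[Editor's sketch, not part of the thread: one reading of Peter's suggestion, assuming the long-sized atomic_long_set_release() is the form he has in mind, would be the following. It is illustrative only, not the posted patch.]

/*
 * Sketch: issue the release store through the atomic API so that, on
 * architectures whose atomics are built from hashed spinlocks, it still
 * serialises against a concurrent atomic_long_fetch_*() on the same word.
 */
static inline void __clear_bit_unlock(unsigned int nr,
				      volatile unsigned long *p)
{
	unsigned long old;

	p += BIT_WORD(nr);
	old = READ_ONCE(*p);
	old &= ~BIT_MASK(nr);
	atomic_long_set_release((atomic_long_t *)p, old);
}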

The only other comment is that I think it would be better if you use
atomic_t instead of atomic_long_t. It would just mean changing
BIT_WORD() and BIT_MASK().

The reason is that we generate a pretty sane set of atomic_t primitives
as long as the architecture supplies cmpxchg, but atomic64 defaults to
utter crap, even on 64bit platforms.

Otherwise this looks pretty neat.
Will Deacon Feb. 15, 2018, 6:20 p.m. UTC | #2
Hi Peter,

Thanks for having a look.

On Thu, Feb 15, 2018 at 06:08:47PM +0100, Peter Zijlstra wrote:
> On Thu, Feb 15, 2018 at 03:29:33PM +0000, Will Deacon wrote:
> > +static inline void __clear_bit_unlock(unsigned int nr,
> > +				      volatile unsigned long *p)
> > +{
> > +	unsigned long old;
> >  
> > +	p += BIT_WORD(nr);
> > +	old = READ_ONCE(*p);
> > +	old &= ~BIT_MASK(nr);
> > +	smp_store_release(p, old);
> 
> This should be atomic_set_release() I think, for the special case where
> atomics are implemented with spinlocks, see the 'fun' case in
> Documentation/atomic_t.txt.

My understanding of __clear_bit_unlock is that there is guaranteed to be
no concurrent accesses to the same word, so why would it matter whether
locks are used to implement atomics?

> The only other comment is that I think it would be better if you use
> atomic_t instead of atomic_long_t. It would just mean changing
> BIT_WORD() and BIT_MASK().

It would make it pretty messy for big-endian architectures, I think...

> The reason is that we generate a pretty sane set of atomic_t primitives
> as long as the architecture supplies cmpxchg, but atomic64 defaults to
> utter crap, even on 64bit platforms.

I think all the architectures using this today are 32-bit:

blackfin
c6x
cris
metag
openrisc
sh
xtensa

and I don't know how much we should care about optimising the generic atomic
bitops for 64-bit architectures that rely on spinlocks for 64-bit atomics!

Will
Peter Zijlstra Feb. 16, 2018, 10:35 a.m. UTC | #3
On Thu, Feb 15, 2018 at 06:20:49PM +0000, Will Deacon wrote:

> > The only other comment is that I think it would be better if you use
> > atomic_t instead of atomic_long_t. It would just mean changing
> > BIT_WORD() and BIT_MASK().
> 
> It would make it pretty messy for big-endian architectures, I think...

Urgh, the big.little indians strike again.. Bah I always forget about
that.

#define BIT_U32_MASK(nr)	(1UL << ((nr) % 32))
#define BIT_U32_WORD(nr)	(((nr) / 32) ^ (4 * __BIG_ENDIAN__))

Or something like that might work, but I always get these things wrong.

> > The reason is that we generate a pretty sane set of atomic_t primitives
> > as long as the architecture supplies cmpxchg, but atomic64 defaults to
> > utter crap, even on 64bit platforms.
> 
> I think all the architectures using this today are 32-bit:
> 
> blackfin
> c6x
> cris
> metag
> openrisc
> sh
> xtensa
> 
> and I don't know how much we should care about optimising the generic atomic
> bitops for 64-bit architectures that rely on spinlocks for 64-bit atomics!

You're probably right, but it just bugs me that we default to such
horrible crap. Arguably we should do a better default for atomic64_t on
64bit archs. But that's for another time.
Peter Zijlstra Feb. 16, 2018, 2:18 p.m. UTC | #4
On Fri, Feb 16, 2018 at 11:35:20AM +0100, Peter Zijlstra wrote:

> #define BIT_U32_MASK(nr)	(1UL << ((nr) % 32))
> #define BIT_U32_WORD(nr)	(((nr) / 32) ^ (4 * __BIG_ENDIAN__))

s/4 *//

it's already a 4 byte offset.
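
[Editor's note: applying that s/4 *// correction to the earlier sketch gives the macros below. They remain illustrative only (Peter notes above that these are easy to get wrong) and assume __BIG_ENDIAN__ expands to 0 or 1 as in the original sketch.]

/* Illustration only: the word index is simply flipped on big-endian builds. */
#define BIT_U32_MASK(nr)	(1UL << ((nr) % 32))
#define BIT_U32_WORD(nr)	(((nr) / 32) ^ __BIG_ENDIAN__)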
Will Deacon Feb. 19, 2018, 2:01 p.m. UTC | #5
On Fri, Feb 16, 2018 at 11:35:20AM +0100, Peter Zijlstra wrote:
> On Thu, Feb 15, 2018 at 06:20:49PM +0000, Will Deacon wrote:
> 
> > > The only other comment is that I think it would be better if you use
> > > atomic_t instead of atomic_long_t. It would just mean changing
> > > BIT_WORD() and BIT_MASK().
> > 
> > It would make it pretty messy for big-endian architectures, I think...
> 
> Urgh, the big.little indians strike again.. Bah I always forget about
> that.
> 
> #define BIT_U32_MASK(nr)	(1UL << ((nr) % 32))
> #define BIT_U32_WORD(nr)	(((nr) / 32) ^ (4 * __BIG_ENDIAN__))
> 
> Or something like that might work, but I always get these things wrong.
> 
> > > The reason is that we generate a pretty sane set of atomic_t primitives
> > > as long as the architecture supplies cmpxchg, but atomic64 defaults to
> > > utter crap, even on 64bit platforms.
> > 
> > I think all the architectures using this today are 32-bit:
> > 
> > blackfin
> > c6x
> > cris
> > metag
> > openrisc
> > sh
> > xtensa
> > 
> > and I don't know how much we should care about optimising the generic atomic
> > bitops for 64-bit architectures that rely on spinlocks for 64-bit atomics!
> 
> You're probably right, but it just bugs me that we default to such
> horrible crap. Arguably we should do a better default for atomic64_t on
> 64bit archs. But that's for another time.

If it's defined, then we could consider using cmpxchg64 to build atomic64
instead of the locks. But even then, I'm not sure we're really helping
anybody out in practice.

Will
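
[Editor's sketch of the cmpxchg64()-based idea, not part of this series; the helper name is made up here and it assumes only that cmpxchg64() and READ_ONCE() are available.]

/*
 * Hypothetical: build a 64-bit fetch-or from cmpxchg64() instead of the
 * spinlock-based generic atomic64, retrying the compare-and-exchange
 * until the word is updated without interference.
 */
static inline u64 fetch_or64_cmpxchg(u64 mask, u64 *p)
{
	u64 old, new;

	do {
		old = READ_ONCE(*p);
		new = old | mask;
	} while (cmpxchg64(p, old, new) != old);

	return old;
}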
Peter Zijlstra Feb. 19, 2018, 2:05 p.m. UTC | #6
On Mon, Feb 19, 2018 at 02:01:51PM +0000, Will Deacon wrote:
> If it's defined, then we could consider using cmpxchg64 to build atomic64
> instead of the locks. But even then, I'm not sure we're really helping
> anybody out in practice.

yeah, most 64bit archs have more atomics or ll/sc and would not use it
anyway I suppose.

Patch

diff --git a/include/asm-generic/bitops/atomic.h b/include/asm-generic/bitops/atomic.h
index 04deffaf5f7d..69b7915e291d 100644
--- a/include/asm-generic/bitops/atomic.h
+++ b/include/asm-generic/bitops/atomic.h
@@ -2,189 +2,112 @@ 
 #ifndef _ASM_GENERIC_BITOPS_ATOMIC_H_
 #define _ASM_GENERIC_BITOPS_ATOMIC_H_
 
-#include <asm/types.h>
-#include <linux/irqflags.h>
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <asm/barrier.h>
 
-#ifdef CONFIG_SMP
-#include <asm/spinlock.h>
-#include <asm/cache.h>		/* we use L1_CACHE_BYTES */
-
-/* Use an array of spinlocks for our atomic_ts.
- * Hash function to index into a different SPINLOCK.
- * Since "a" is usually an address, use one spinlock per cacheline.
+/*
+ * Implementation of atomic bitops using atomic-fetch ops.
+ * See Documentation/atomic_bitops.txt for details.
  */
-#  define ATOMIC_HASH_SIZE 4
-#  define ATOMIC_HASH(a) (&(__atomic_hash[ (((unsigned long) a)/L1_CACHE_BYTES) & (ATOMIC_HASH_SIZE-1) ]))
-
-extern arch_spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] __lock_aligned;
-
-/* Can't use raw_spin_lock_irq because of #include problems, so
- * this is the substitute */
-#define _atomic_spin_lock_irqsave(l,f) do {	\
-	arch_spinlock_t *s = ATOMIC_HASH(l);	\
-	local_irq_save(f);			\
-	arch_spin_lock(s);			\
-} while(0)
-
-#define _atomic_spin_unlock_irqrestore(l,f) do {	\
-	arch_spinlock_t *s = ATOMIC_HASH(l);		\
-	arch_spin_unlock(s);				\
-	local_irq_restore(f);				\
-} while(0)
 
+static inline void set_bit(unsigned int nr, volatile unsigned long *p)
+{
+	p += BIT_WORD(nr);
+	atomic_long_fetch_or_relaxed(BIT_MASK(nr), (atomic_long_t *)p);
+}
 
-#else
-#  define _atomic_spin_lock_irqsave(l,f) do { local_irq_save(f); } while (0)
-#  define _atomic_spin_unlock_irqrestore(l,f) do { local_irq_restore(f); } while (0)
-#endif
+static inline void clear_bit(unsigned int nr, volatile unsigned long *p)
+{
+	p += BIT_WORD(nr);
+	atomic_long_fetch_andnot_relaxed(BIT_MASK(nr), (atomic_long_t *)p);
+}
 
-/*
- * NMI events can occur at any time, including when interrupts have been
- * disabled by *_irqsave().  So you can get NMI events occurring while a
- * *_bit function is holding a spin lock.  If the NMI handler also wants
- * to do bit manipulation (and they do) then you can get a deadlock
- * between the original caller of *_bit() and the NMI handler.
- *
- * by Keith Owens
- */
+static inline void change_bit(unsigned int nr, volatile unsigned long *p)
+{
+	p += BIT_WORD(nr);
+	atomic_long_fetch_xor_relaxed(BIT_MASK(nr), (atomic_long_t *)p);
+}
 
-/**
- * set_bit - Atomically set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * This function is atomic and may not be reordered.  See __set_bit()
- * if you do not require the atomic guarantees.
- *
- * Note: there are no guarantees that this function will not be reordered
- * on non x86 architectures, so if you are writing portable code,
- * make sure not to rely on its reordering guarantees.
- *
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-static inline void set_bit(int nr, volatile unsigned long *addr)
+static inline int test_and_set_bit(unsigned int nr, volatile unsigned long *p)
 {
+	long old;
 	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long flags;
 
-	_atomic_spin_lock_irqsave(p, flags);
-	*p  |= mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
+	p += BIT_WORD(nr);
+	if (READ_ONCE(*p) & mask)
+		return 1;
+
+	old = atomic_long_fetch_or(mask, (atomic_long_t *)p);
+	return !!(old & mask);
 }
 
-/**
- * clear_bit - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * clear_bit() is atomic and may not be reordered.  However, it does
- * not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic()
- * in order to ensure changes are visible on other processors.
- */
-static inline void clear_bit(int nr, volatile unsigned long *addr)
+static inline int test_and_clear_bit(unsigned int nr, volatile unsigned long *p)
 {
+	long old;
 	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long flags;
 
-	_atomic_spin_lock_irqsave(p, flags);
-	*p &= ~mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
+	p += BIT_WORD(nr);
+	if (!(READ_ONCE(*p) & mask))
+		return 0;
+
+	old = atomic_long_fetch_andnot(mask, (atomic_long_t *)p);
+	return !!(old & mask);
 }
 
-/**
- * change_bit - Toggle a bit in memory
- * @nr: Bit to change
- * @addr: Address to start counting from
- *
- * change_bit() is atomic and may not be reordered. It may be
- * reordered on other architectures than x86.
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-static inline void change_bit(int nr, volatile unsigned long *addr)
+static inline int test_and_change_bit(unsigned int nr, volatile unsigned long *p)
 {
+	long old;
 	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long flags;
 
-	_atomic_spin_lock_irqsave(p, flags);
-	*p ^= mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
+	p += BIT_WORD(nr);
+	old = atomic_long_fetch_xor(mask, (atomic_long_t *)p);
+	return !!(old & mask);
 }
 
-/**
- * test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It may be reordered on other architectures than x86.
- * It also implies a memory barrier.
- */
-static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
+static inline int test_and_set_bit_lock(unsigned int nr,
+					volatile unsigned long *p)
 {
+	long old;
 	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long old;
-	unsigned long flags;
 
-	_atomic_spin_lock_irqsave(p, flags);
-	old = *p;
-	*p = old | mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
+	p += BIT_WORD(nr);
+	if (READ_ONCE(*p) & mask)
+		return 1;
 
-	return (old & mask) != 0;
+	old = atomic_long_fetch_or_acquire(mask, (atomic_long_t *)p);
+	return !!(old & mask);
 }
 
-/**
- * test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It can be reorderdered on other architectures other than x86.
- * It also implies a memory barrier.
- */
-static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
+static inline void clear_bit_unlock(unsigned int nr, volatile unsigned long *p)
 {
-	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long old;
-	unsigned long flags;
+	p += BIT_WORD(nr);
+	atomic_long_fetch_andnot_release(BIT_MASK(nr), (atomic_long_t *)p);
+}
 
-	_atomic_spin_lock_irqsave(p, flags);
-	old = *p;
-	*p = old & ~mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
+static inline void __clear_bit_unlock(unsigned int nr,
+				      volatile unsigned long *p)
+{
+	unsigned long old;
 
-	return (old & mask) != 0;
+	p += BIT_WORD(nr);
+	old = READ_ONCE(*p);
+	old &= ~BIT_MASK(nr);
+	smp_store_release(p, old);
 }
 
-/**
- * test_and_change_bit - Change a bit and return its old value
- * @nr: Bit to change
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It also implies a memory barrier.
- */
-static inline int test_and_change_bit(int nr, volatile unsigned long *addr)
+#ifndef clear_bit_unlock_is_negative_byte
+static inline bool clear_bit_unlock_is_negative_byte(unsigned int nr,
+						     volatile unsigned long *p)
 {
+	long old;
 	unsigned long mask = BIT_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
-	unsigned long old;
-	unsigned long flags;
-
-	_atomic_spin_lock_irqsave(p, flags);
-	old = *p;
-	*p = old ^ mask;
-	_atomic_spin_unlock_irqrestore(p, flags);
 
-	return (old & mask) != 0;
+	p += BIT_WORD(nr);
+	old = atomic_long_fetch_andnot_release(mask, (atomic_long_t *)p);
+	return !!(old & BIT(7));
 }
+#define clear_bit_unlock_is_negative_byte clear_bit_unlock_is_negative_byte
+#endif
 
 #endif /* _ASM_GENERIC_BITOPS_ATOMIC_H */