Message ID | 1453740558-16303-4-git-send-email-alex.bennee@linaro.org (mailing list archive) |
---|---|
State | New, archived |
On 25/01/2016 17:49, Alex Bennée wrote:
> The __atomic primitives have been available since GCC 4.7 and provide
> a richer interface for describing memory ordering requirements. As a
> bonus, by using the primitives instead of hand-rolled functions we can
> use tools such as AddressSanitizer, which needs well-defined APIs for
> its analysis.
>
> If we have __ATOMIC defines, we exclusively use the __atomic primitives
> for all our atomic accesses. Otherwise we fall back to the mixture of
> __sync and hand-rolled barrier cases.
>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> ---
>  include/qemu/atomic.h | 126 ++++++++++++++++++++++++++++++++------------------
>  1 file changed, 82 insertions(+), 44 deletions(-)
>
> diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
> index bd2c075..414c81a 100644
> --- a/include/qemu/atomic.h
> +++ b/include/qemu/atomic.h
> @@ -15,12 +15,90 @@
>
>  #include "qemu/compiler.h"
>
> -/* For C11 atomic ops */
>
>  /* Compiler barrier */
>  #define barrier() ({ asm volatile("" ::: "memory"); (void)0; })
>
> -#ifndef __ATOMIC_RELAXED
> +#ifdef __ATOMIC_RELAXED
> +/* For C11 atomic ops */
> +
> +/* Manual memory barriers
> + *
> + *__atomic_thread_fence does not include a compiler barrier; instead,
> + * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
> + * semantics. If smp_wmb() is a no-op, absence of the barrier means that
> + * the compiler is free to reorder stores on each side of the barrier.
> + * Add one here, and similarly in smp_rmb() and smp_read_barrier_depends().
> + */
> +
> +#define smp_mb() ({ barrier(); __atomic_thread_fence(__ATOMIC_CONSUME); barrier(); })

This should be seq_cst.

> +#define smp_wmb() ({ barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); barrier(); })
> +#define smp_rmb() ({ barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); barrier(); })
> +
> +#define smp_read_barrier_depends() ({ barrier(); __atomic_thread_fence(__ATOMIC_CONSUME); barrier(); })
> +
> +/* Weak atomic operations prevent the compiler moving other
> + * loads/stores past the atomic operation load/store.
> + */
> +#define atomic_read(ptr) \
> +    ({ \
> +    typeof(*ptr) _val; \
> +    __atomic_load(ptr, &_val, __ATOMIC_CONSUME); \

This should be relaxed.

> +    _val; \
> +    })
> +
> +#define atomic_rcu_read(ptr) atomic_read(ptr)

This should be consume.

> +
> +#define atomic_set(ptr, i) do { \
> +    typeof(*ptr) _val = (i); \
> +    __atomic_store(ptr, &_val, __ATOMIC_RELEASE); \

This should be relaxed.

> +} while(0)
> +
> +#define atomic_rcu_set(ptr, i) atomic_set(ptr, i)

This should be release.

> +/* Sequentially consistent atomic access */
> +
> +#define atomic_xchg(ptr, i) ({ \
> +    typeof(*ptr) _new = (i), _old; \
> +    __atomic_exchange(ptr, &_new, &_old, __ATOMIC_SEQ_CST); \
> +    _old; \
> +})
> +
> +/* Returns the eventual value, failed or not */
> +#define atomic_cmpxchg(ptr, old, new) \
> +    ({ \
> +    typeof(*ptr) _old = (old), _new = (new); \
> +    __atomic_compare_exchange(ptr, &_old, &_new, false, \
> +                              __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); \
> +    *ptr; /* can this race if cmpxchg not used elsewhere? */ \

If I read the manual correctly, you can return _old here (that's why it
is a pointer).

> +    })
> +
> +#define atomic_mb_set(ptr, i) ((void)atomic_xchg(ptr, i))
> +#define atomic_mb_read(ptr) \
> +    ({ \
> +    typeof(*ptr) _val; \
> +    __atomic_load(ptr, &_val, __ATOMIC_ACQUIRE); \
> +    _val; \
> +    })

Please leave these defined in terms of relaxed accesses and memory
barriers.

Paolo
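Taken together, the orderings requested in this review would look roughly like the sketch below. It is only an illustration of the comments above, not the final patch; it reuses the barrier() macro and the GCC __atomic builtins from the quoted code.

    /* Sketch only: seq_cst for smp_mb(), relaxed for the plain accessors,
     * consume/release for the RCU accessors, and _old returned from
     * atomic_cmpxchg(). */
    #define smp_mb() ({ barrier(); __atomic_thread_fence(__ATOMIC_SEQ_CST); barrier(); })

    #define atomic_read(ptr) \
        ({ \
        typeof(*ptr) _val; \
        __atomic_load(ptr, &_val, __ATOMIC_RELAXED); \
        _val; \
        })

    #define atomic_set(ptr, i) do { \
        typeof(*ptr) _val = (i); \
        __atomic_store(ptr, &_val, __ATOMIC_RELAXED); \
    } while (0)

    #define atomic_rcu_read(ptr) \
        ({ \
        typeof(*ptr) _val; \
        __atomic_load(ptr, &_val, __ATOMIC_CONSUME); \
        _val; \
        })

    #define atomic_rcu_set(ptr, i) do { \
        typeof(*ptr) _val = (i); \
        __atomic_store(ptr, &_val, __ATOMIC_RELEASE); \
    } while (0)

    /* __atomic_compare_exchange writes the value it observed back into _old
     * on failure, and on success _old already equals the previous contents,
     * so returning _old gives the old value either way. */
    #define atomic_cmpxchg(ptr, old, new) \
        ({ \
        typeof(*ptr) _old = (old), _new = (new); \
        __atomic_compare_exchange(ptr, &_old, &_new, false, \
                                  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); \
        _old; \
        })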
Paolo Bonzini <pbonzini@redhat.com> writes:

> On 25/01/2016 17:49, Alex Bennée wrote:
>> The __atomic primitives have been available since GCC 4.7 and provide
>> a richer interface for describing memory ordering requirements. As a
>> bonus, by using the primitives instead of hand-rolled functions we can
>> use tools such as AddressSanitizer, which needs well-defined APIs for
>> its analysis.
>>
>> If we have __ATOMIC defines, we exclusively use the __atomic primitives
>> for all our atomic accesses. Otherwise we fall back to the mixture of
>> __sync and hand-rolled barrier cases.
>>
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> ---
>>  include/qemu/atomic.h | 126 ++++++++++++++++++++++++++++++++------------------
>>  1 file changed, 82 insertions(+), 44 deletions(-)
>>
>> diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
>> index bd2c075..414c81a 100644
>> --- a/include/qemu/atomic.h
>> +++ b/include/qemu/atomic.h
>> @@ -15,12 +15,90 @@
>>
>>  #include "qemu/compiler.h"
>>
>> -/* For C11 atomic ops */
>>
>>  /* Compiler barrier */
>>  #define barrier() ({ asm volatile("" ::: "memory"); (void)0; })
>>
>> -#ifndef __ATOMIC_RELAXED
>> +#ifdef __ATOMIC_RELAXED
>> +/* For C11 atomic ops */
>> +
>> +/* Manual memory barriers
>> + *
>> + *__atomic_thread_fence does not include a compiler barrier; instead,
>> + * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
>> + * semantics. If smp_wmb() is a no-op, absence of the barrier means that
>> + * the compiler is free to reorder stores on each side of the barrier.
>> + * Add one here, and similarly in smp_rmb() and smp_read_barrier_depends().
>> + */
>> +
>> +#define smp_mb() ({ barrier(); __atomic_thread_fence(__ATOMIC_CONSUME); barrier(); })
>
> This should be seq_cst.

heh when I did this originally I had it as that (it was an overly
complex splitting into hand-rolled, __seq and __atomic implementations).
I'll fix it. I think I can remove the barrier()'s as well, can't I?

>
>> +#define smp_wmb() ({ barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); barrier(); })
>> +#define smp_rmb() ({ barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); barrier(); })
>> +
>> +#define smp_read_barrier_depends() ({ barrier(); __atomic_thread_fence(__ATOMIC_CONSUME); barrier(); })
>> +
>> +/* Weak atomic operations prevent the compiler moving other
>> + * loads/stores past the atomic operation load/store.
>> + */
>> +#define atomic_read(ptr) \
>> +    ({ \
>> +    typeof(*ptr) _val; \
>> +    __atomic_load(ptr, &_val, __ATOMIC_CONSUME); \
>
> This should be relaxed.
>
>> +    _val; \
>> +    })
>> +
>> +#define atomic_rcu_read(ptr) atomic_read(ptr)
>
> This should be consume.
>
>> +
>> +#define atomic_set(ptr, i) do { \
>> +    typeof(*ptr) _val = (i); \
>> +    __atomic_store(ptr, &_val, __ATOMIC_RELEASE); \
>
> This should be relaxed.
>
>> +} while(0)
>> +
>> +#define atomic_rcu_set(ptr, i) atomic_set(ptr, i)
>
> This should be release.
>
>> +/* Sequentially consistent atomic access */
>> +
>> +#define atomic_xchg(ptr, i) ({ \
>> +    typeof(*ptr) _new = (i), _old; \
>> +    __atomic_exchange(ptr, &_new, &_old, __ATOMIC_SEQ_CST); \
>> +    _old; \
>> +})
>> +
>> +/* Returns the eventual value, failed or not */
>> +#define atomic_cmpxchg(ptr, old, new) \
>> +    ({ \
>> +    typeof(*ptr) _old = (old), _new = (new); \
>> +    __atomic_compare_exchange(ptr, &_old, &_new, false, \
>> +                              __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); \
>> +    *ptr; /* can this race if cmpxchg not used elsewhere? */ \
>
> If I read the manual correctly, you can return _old here (that's why it
> is a pointer).
>
>> +    })
>> +
>> +#define atomic_mb_set(ptr, i) ((void)atomic_xchg(ptr, i))
>> +#define atomic_mb_read(ptr) \
>> +    ({ \
>> +    typeof(*ptr) _val; \
>> +    __atomic_load(ptr, &_val, __ATOMIC_ACQUIRE); \
>> +    _val; \
>> +    })
>
> Please leave these defined in terms of relaxed accesses and memory
> barriers.

May I ask why? Doesn't the __ATOMIC_ACQUIRE ensure the load barrier is
in place? Or should it be a full CST barrier?

>
> Paolo

--
Alex Bennée
On 25/01/2016 19:23, Alex Bennée wrote:
>>> +    })
>>> +
>>> +#define atomic_mb_set(ptr, i) ((void)atomic_xchg(ptr, i))
>>> +#define atomic_mb_read(ptr) \
>>> +    ({ \
>>> +    typeof(*ptr) _val; \
>>> +    __atomic_load(ptr, &_val, __ATOMIC_ACQUIRE); \
>>> +    _val; \
>>> +    })
>>
>> Please leave these defined in terms of relaxed accesses and memory
>> barriers.
>
> May I ask why? Doesn't the __ATOMIC_ACQUIRE ensure the load barrier is
> in place? Or should it be a full CST barrier?

First, because they are defined like this in docs/atomics.txt. :)

Second, because the right definition would be __atomic_load(__ATOMIC_SEQ_CST)
and __atomic_store(__ATOMIC_SEQ_CST). These produce even better code than
the memory barriers on x86 but, unfortunately, on POWER they are implemented
in such a way that makes things very slow.

Basically, instead of

    mb_read -> load; rmb()
    mb_set  -> wmb(); store(); mb()

POWER uses this:

    mb_read -> load; mb()
    mb_set  -> wmb(); store(); mb()

There are reasons for these, and they are proved in
http://www.cl.cam.ac.uk/~pes20/cppppc/popl079-batty.pdf (I'm not pretending
I understand this paper except inasmuch as it was necessary to write
docs/atomics.txt). However, the cases that actually matter basically never
arise. Again, this is documented in docs/atomics.txt around line 90.

As an alternative to explicit memory barriers, you can use this on POWER:

    mb_read -> load-acquire
    mb_set  -> store-release + mb()

and seq-cst load/store everywhere else.

Paolo
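For reference, the barrier-based definitions Paolo is pointing back to would be along these lines; this is a sketch assuming the relaxed atomic_read()/atomic_set() accessors and the smp_*() barriers discussed above, not a quote from the patch.

    /* mb_read -> load; rmb()    and    mb_set -> wmb(); store(); mb() */
    #define atomic_mb_read(ptr) \
        ({ \
        typeof(*ptr) _val = atomic_read(ptr); \
        smp_rmb(); \
        _val; \
        })

    #define atomic_mb_set(ptr, i) do { \
        smp_wmb(); \
        atomic_set(ptr, i); \
        smp_mb(); \
    } while (0)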
diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
index bd2c075..414c81a 100644
--- a/include/qemu/atomic.h
+++ b/include/qemu/atomic.h
@@ -15,12 +15,90 @@
 
 #include "qemu/compiler.h"
 
-/* For C11 atomic ops */
 
 /* Compiler barrier */
 #define barrier() ({ asm volatile("" ::: "memory"); (void)0; })
 
-#ifndef __ATOMIC_RELAXED
+#ifdef __ATOMIC_RELAXED
+/* For C11 atomic ops */
+
+/* Manual memory barriers
+ *
+ *__atomic_thread_fence does not include a compiler barrier; instead,
+ * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
+ * semantics. If smp_wmb() is a no-op, absence of the barrier means that
+ * the compiler is free to reorder stores on each side of the barrier.
+ * Add one here, and similarly in smp_rmb() and smp_read_barrier_depends().
+ */
+
+#define smp_mb() ({ barrier(); __atomic_thread_fence(__ATOMIC_CONSUME); barrier(); })
+#define smp_wmb() ({ barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); barrier(); })
+#define smp_rmb() ({ barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); barrier(); })
+
+#define smp_read_barrier_depends() ({ barrier(); __atomic_thread_fence(__ATOMIC_CONSUME); barrier(); })
+
+/* Weak atomic operations prevent the compiler moving other
+ * loads/stores past the atomic operation load/store.
+ */
+#define atomic_read(ptr) \
+    ({ \
+    typeof(*ptr) _val; \
+    __atomic_load(ptr, &_val, __ATOMIC_CONSUME); \
+    _val; \
+    })
+
+#define atomic_rcu_read(ptr) atomic_read(ptr)
+
+#define atomic_set(ptr, i) do { \
+    typeof(*ptr) _val = (i); \
+    __atomic_store(ptr, &_val, __ATOMIC_RELEASE); \
+} while(0)
+
+#define atomic_rcu_set(ptr, i) atomic_set(ptr, i)
+
+/* Sequentially consistent atomic access */
+
+#define atomic_xchg(ptr, i) ({ \
+    typeof(*ptr) _new = (i), _old; \
+    __atomic_exchange(ptr, &_new, &_old, __ATOMIC_SEQ_CST); \
+    _old; \
+})
+
+/* Returns the eventual value, failed or not */
+#define atomic_cmpxchg(ptr, old, new) \
+    ({ \
+    typeof(*ptr) _old = (old), _new = (new); \
+    __atomic_compare_exchange(ptr, &_old, &_new, false, \
+                              __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); \
+    *ptr; /* can this race if cmpxchg not used elsewhere? */ \
+    })
+
+#define atomic_mb_set(ptr, i) ((void)atomic_xchg(ptr, i))
+#define atomic_mb_read(ptr) \
+    ({ \
+    typeof(*ptr) _val; \
+    __atomic_load(ptr, &_val, __ATOMIC_ACQUIRE); \
+    _val; \
+    })
+
+
+/* Provide shorter names for GCC atomic builtins, return old value */
+#define atomic_fetch_inc(ptr) __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST)
+#define atomic_fetch_dec(ptr) __atomic_fetch_sub(ptr, 1, __ATOMIC_SEQ_CST)
+#define atomic_fetch_add(ptr, n) __atomic_fetch_add(ptr, n, __ATOMIC_SEQ_CST)
+#define atomic_fetch_sub(ptr, n) __atomic_fetch_sub(ptr, n, __ATOMIC_SEQ_CST)
+#define atomic_fetch_and(ptr, n) __atomic_fetch_and(ptr, n, __ATOMIC_SEQ_CST)
+#define atomic_fetch_or(ptr, n) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST)
+
+/* And even shorter names that return void. */
+#define atomic_inc(ptr) ((void) __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST))
+#define atomic_dec(ptr) ((void) __atomic_fetch_sub(ptr, 1, __ATOMIC_SEQ_CST))
+#define atomic_add(ptr, n) ((void) __atomic_fetch_add(ptr, n, __ATOMIC_SEQ_CST))
+#define atomic_sub(ptr, n) ((void) __atomic_fetch_sub(ptr, n, __ATOMIC_SEQ_CST))
+#define atomic_and(ptr, n) ((void) __atomic_fetch_and(ptr, n, __ATOMIC_SEQ_CST))
+#define atomic_or(ptr, n) ((void) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST))
+
+#else /* __ATOMIC_RELAXED */
 
 /*
  * We use GCC builtin if it's available, as that can use mfence on
@@ -85,8 +163,6 @@
 
 #endif /* _ARCH_PPC */
 
-#endif /* C11 atomics */
-
 /*
  * For (host) platforms we don't have explicit barrier definitions
  * for, we use the gcc __sync_synchronize() primitive to generate a
@@ -98,34 +174,16 @@
 #endif
 
 #ifndef smp_wmb
-#ifdef __ATOMIC_RELEASE
-/* __atomic_thread_fence does not include a compiler barrier; instead,
- * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
- * semantics. If smp_wmb() is a no-op, absence of the barrier means that
- * the compiler is free to reorder stores on each side of the barrier.
- * Add one here, and similarly in smp_rmb() and smp_read_barrier_depends().
- */
-#define smp_wmb() ({ barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); barrier(); })
-#else
 #define smp_wmb() __sync_synchronize()
 #endif
-#endif
 
 #ifndef smp_rmb
-#ifdef __ATOMIC_ACQUIRE
-#define smp_rmb() ({ barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); barrier(); })
-#else
 #define smp_rmb() __sync_synchronize()
 #endif
-#endif
 
 #ifndef smp_read_barrier_depends
-#ifdef __ATOMIC_CONSUME
-#define smp_read_barrier_depends() ({ barrier(); __atomic_thread_fence(__ATOMIC_CONSUME); barrier(); })
-#else
 #define smp_read_barrier_depends() barrier()
 #endif
-#endif
 
 #ifndef atomic_read
 #define atomic_read(ptr) (*(__typeof__(*ptr) volatile*) (ptr))
@@ -156,20 +214,12 @@
  * Should match atomic_rcu_set(), atomic_xchg(), atomic_cmpxchg().
  */
 #ifndef atomic_rcu_read
-#ifdef __ATOMIC_CONSUME
-#define atomic_rcu_read(ptr) ({ \
-    typeof(*ptr) _val; \
-    __atomic_load(ptr, &_val, __ATOMIC_CONSUME); \
-    _val; \
-})
-#else
 #define atomic_rcu_read(ptr) ({ \
     typeof(*ptr) _val = atomic_read(ptr); \
     smp_read_barrier_depends(); \
     _val; \
 })
 #endif
-#endif
 
 /**
  * atomic_rcu_set - assigns (publicizes) a pointer to a new data structure
@@ -183,18 +233,11 @@
  * Should match atomic_rcu_read().
  */
 #ifndef atomic_rcu_set
-#ifdef __ATOMIC_RELEASE
-#define atomic_rcu_set(ptr, i) do { \
-    typeof(*ptr) _val = (i); \
-    __atomic_store(ptr, &_val, __ATOMIC_RELEASE); \
-} while(0)
-#else
 #define atomic_rcu_set(ptr, i) do { \
     smp_wmb(); \
     atomic_set(ptr, i); \
 } while (0)
 #endif
-#endif
 
 /* These have the same semantics as Java volatile variables.
  * See http://gee.cs.oswego.edu/dl/jmm/cookbook.html:
@@ -237,12 +280,6 @@
 #ifndef atomic_xchg
 #if defined(__clang__)
 #define atomic_xchg(ptr, i) __sync_swap(ptr, i)
-#elif defined(__ATOMIC_SEQ_CST)
-#define atomic_xchg(ptr, i) ({ \
-    typeof(*ptr) _new = (i), _old; \
-    __atomic_exchange(ptr, &_new, &_old, __ATOMIC_SEQ_CST); \
-    _old; \
-})
 #else
 /* __sync_lock_test_and_set() is documented to be an acquire barrier only. */
 #define atomic_xchg(ptr, i) (smp_mb(), __sync_lock_test_and_set(ptr, i))
@@ -266,4 +303,5 @@
 #define atomic_and(ptr, n) ((void) __sync_fetch_and_and(ptr, n))
 #define atomic_or(ptr, n) ((void) __sync_fetch_and_or(ptr, n))
 
-#endif
+#endif /* __ATOMIC_RELAXED */
+#endif /* __QEMU_ATOMIC_H */
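As a quick usage check of the new shorter names above, a standalone test along these lines (hypothetical, not part of the series; the two macro copies are lifted from the hunk above) exercises both the fetch-and-return and the void-returning forms.

    #include <assert.h>

    /* Local copies of two of the wrappers added above, so the test builds
     * standalone without the QEMU headers. */
    #define atomic_fetch_inc(ptr) __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST)
    #define atomic_inc(ptr) ((void) __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST))

    int main(void)
    {
        int counter = 0;
        int old = atomic_fetch_inc(&counter);  /* returns the previous value */
        atomic_inc(&counter);                  /* discards the previous value */
        assert(old == 0 && counter == 2);
        return 0;
    }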
The __atomic primitives have been available since GCC 4.7 and provide
a richer interface for describing memory ordering requirements. As a
bonus, by using the primitives instead of hand-rolled functions we can
use tools such as AddressSanitizer, which needs well-defined APIs for
its analysis.

If we have __ATOMIC defines, we exclusively use the __atomic primitives
for all our atomic accesses. Otherwise we fall back to the mixture of
__sync and hand-rolled barrier cases.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
---
 include/qemu/atomic.h | 126 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 82 insertions(+), 44 deletions(-)
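The patch keys everything on #ifdef __ATOMIC_RELAXED. A small standalone probe (hypothetical, not part of the series) shows the detection it relies on; the __ATOMIC_* macros are predefined by compilers that provide the __atomic builtins, which the commit message notes have shipped since GCC 4.7.

    #include <stdio.h>

    int main(void)
    {
    #ifdef __ATOMIC_RELAXED
        int v = 0;
        __atomic_store_n(&v, 1, __ATOMIC_RELAXED);
        printf("__atomic builtins available, v = %d\n",
               __atomic_load_n(&v, __ATOMIC_SEQ_CST));
    #else
        printf("no __ATOMIC_* macros; the header would fall back to __sync\n");
    #endif
        return 0;
    }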