Message ID | 214bfd61c8ccf2a5b2c640b815ebfa6a705f6234.1703255175.git.oleksii.kurochko@gmail.com (mailing list archive)
---|---
State | Superseded
Series | Enable build of full Xen for RISC-V
On 22.12.2023 16:12, Oleksii Kurochko wrote: > --- /dev/null > +++ b/xen/arch/riscv/include/asm/cmpxchg.h > @@ -0,0 +1,496 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* Copyright (C) 2014 Regents of the University of California */ > + > +#ifndef _ASM_RISCV_CMPXCHG_H > +#define _ASM_RISCV_CMPXCHG_H > + > +#include <xen/compiler.h> > +#include <xen/lib.h> > + > +#include <asm/fence.h> > +#include <asm/io.h> > +#include <asm/system.h> > + > +#define __xchg_relaxed(ptr, new, size) \ > +({ \ > + __typeof__(ptr) ptr__ = (ptr); \ > + __typeof__(new) new__ = (new); \ > + __typeof__(*(ptr)) ret__; \ I expect the types of new and *ptr want to actually match. Which you then want to enforce, so that issues at use sites would either be reported by the compiler, or be permitted by a type conversion of new. > + switch (size) \ > + { \ Nit: Hard tab left here. (Also again you want to either stick to Linux style or fully switch to Xen style.) > + case 4: \ > + asm volatile( \ > + " amoswap.w %0, %2, %1\n" \ I don't think a leading tab (or leading whitespace in general) is needed in single-line-output asm()s. The trailing \n also isn't needed if I'm not mistaken. > + : "=r" (ret__), "+A" (*ptr__) \ > + : "r" (new__) \ > + : "memory" ); \ > + break; \ > + case 8: \ > + asm volatile( \ > + " amoswap.d %0, %2, %1\n" \ > + : "=r" (ret__), "+A" (*ptr__) \ > + : "r" (new__) \ > + : "memory" ); \ > + break; \ > + default: \ > + ASSERT_UNREACHABLE(); \ If at all possible this wants to trigger a build failure, not a runtime one. > + } \ > + ret__; \ > +}) > + > +#define xchg_relaxed(ptr, x) \ > +({ \ > + __typeof__(*(ptr)) x_ = (x); \ > + (__typeof__(*(ptr))) __xchg_relaxed((ptr), x_, sizeof(*(ptr))); \ Nit: Stray blank after cast. For readability I'd also suggest to drop parentheses in cases like the first argument passed to __xchg_relaxed() here. > +}) For both: What does "relaxed" describe? I'm asking because it's not really clear whether the memory clobbers are actually needed. > +#define __xchg_acquire(ptr, new, size) \ > +({ \ > + __typeof__(ptr) ptr__ = (ptr); \ > + __typeof__(new) new__ = (new); \ > + __typeof__(*(ptr)) ret__; \ > + switch (size) \ > + { \ > + case 4: \ > + asm volatile( \ > + " amoswap.w %0, %2, %1\n" \ > + RISCV_ACQUIRE_BARRIER \ > + : "=r" (ret__), "+A" (*ptr__) \ > + : "r" (new__) \ > + : "memory" ); \ > + break; \ > + case 8: \ > + asm volatile( \ > + " amoswap.d %0, %2, %1\n" \ > + RISCV_ACQUIRE_BARRIER \ > + : "=r" (ret__), "+A" (*ptr__) \ > + : "r" (new__) \ > + : "memory" ); \ > + break; \ > + default: \ > + ASSERT_UNREACHABLE(); \ > + } \ > + ret__; \ > +}) If I'm not mistaken this differs from __xchg_relaxed() only in the use of RISCV_ACQUIRE_BARRIER, and ... > +#define xchg_acquire(ptr, x) \ > +({ \ > + __typeof__(*(ptr)) x_ = (x); \ > + (__typeof__(*(ptr))) __xchg_acquire((ptr), x_, sizeof(*(ptr))); \ > +}) > + > +#define __xchg_release(ptr, new, size) \ > +({ \ > + __typeof__(ptr) ptr__ = (ptr); \ > + __typeof__(new) new__ = (new); \ > + __typeof__(*(ptr)) ret__; \ > + switch (size) \ > + { \ > + case 4: \ > + asm volatile ( \ > + RISCV_RELEASE_BARRIER \ > + " amoswap.w %0, %2, %1\n" \ > + : "=r" (ret__), "+A" (*ptr__) \ > + : "r" (new__) \ > + : "memory"); \ > + break; \ > + case 8: \ > + asm volatile ( \ > + RISCV_RELEASE_BARRIER \ > + " amoswap.d %0, %2, %1\n" \ > + : "=r" (ret__), "+A" (*ptr__) \ > + : "r" (new__) \ > + : "memory"); \ > + break; \ > + default: \ > + ASSERT_UNREACHABLE(); \ > + } \ > + ret__; \ > +}) this only in the use of RISCV_RELEASE_BARRIER. 
If so they likely want folding, to limit redundancy and make eventual updating easier. (Same for the cmpxchg helper further down, as it seems.) > +#define xchg_release(ptr, x) \ > +({ \ > + __typeof__(*(ptr)) x_ = (x); \ > + (__typeof__(*(ptr))) __xchg_release((ptr), x_, sizeof(*(ptr))); \ > +}) > + > +static always_inline uint32_t __xchg_case_4(volatile uint32_t *ptr, > + uint32_t new) > +{ > + __typeof__(*(ptr)) ret; > + > + asm volatile ( > + " amoswap.w.aqrl %0, %2, %1\n" > + : "=r" (ret), "+A" (*ptr) > + : "r" (new) > + : "memory" ); > + > + return ret; > +} > + > +static always_inline uint64_t __xchg_case_8(volatile uint64_t *ptr, > + uint64_t new) > +{ > + __typeof__(*(ptr)) ret; > + > + asm volatile( \ > + " amoswap.d.aqrl %0, %2, %1\n" \ > + : "=r" (ret), "+A" (*ptr) \ > + : "r" (new) \ > + : "memory" ); \ > + > + return ret; > +} > + > +static always_inline unsigned short __cmpxchg_case_2(volatile uint32_t *ptr, > + uint32_t old, > + uint32_t new); Don't you consistently mean uint16_t here (incl the return type) and ... > +static always_inline unsigned short __cmpxchg_case_1(volatile uint32_t *ptr, > + uint32_t old, > + uint32_t new); ... uint8_t here? > +static inline unsigned long __xchg(volatile void *ptr, unsigned long x, int size) > +{ > + switch (size) { > + case 1: > + return __cmpxchg_case_1(ptr, (uint32_t)-1, x); > + case 2: > + return __cmpxchg_case_2(ptr, (uint32_t)-1, x); How are these going to work? You'll compare against ~0, and if the value in memory isn't ~0, memory won't be updated; you will only (correctly) return the value found in memory. Or wait - looking at __cmpxchg_case_{1,2}() far further down, you ignore "old" there. Which apparently means they'll work for the use here, but not for the use in __cmpxchg(). > + case 4: > + return __xchg_case_4(ptr, x); > + case 8: > + return __xchg_case_8(ptr, x); > + default: > + ASSERT_UNREACHABLE(); > + } > + > + return -1; > +} > + > +#define xchg(ptr,x) \ > +({ \ > + __typeof__(*(ptr)) ret__; \ > + ret__ = (__typeof__(*(ptr))) \ > + __xchg((ptr), (unsigned long)(x), sizeof(*(ptr))); \ > + ret__; \ > +}) > + > +#define xchg32(ptr, x) \ > +({ \ > + BUILD_BUG_ON(sizeof(*(ptr)) != 4); \ > + xchg((ptr), (x)); \ > +}) > + > +#define xchg64(ptr, x) \ > +({ \ > + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ > + xchg((ptr), (x)); \ > +}) What are these two (and their cmpxchg counterparts) needed for? > +/* > + * Atomic compare and exchange. Compare OLD with MEM, if identical, > + * store NEW in MEM. Return the initial value in MEM. Success is > + * indicated by comparing RETURN with OLD. > + */ > +#define __cmpxchg_relaxed(ptr, old, new, size) \ > +({ \ > + __typeof__(ptr) ptr__ = (ptr); \ > + __typeof__(*(ptr)) __old = (old); \ Leftover leading underscores? > + __typeof__(*(ptr)) new__ = (new); \ Related to my earlier comment on types needing to be compatible - see how here you're using "ptr" throughout. > + __typeof__(*(ptr)) ret__; \ > + register unsigned int __rc; \ More leftover leading underscores? > +static always_inline unsigned short __cmpxchg_case_2(volatile uint32_t *ptr, > + uint32_t old, > + uint32_t new) > +{ > + (void) old; > + > + if (((unsigned long)ptr & 3) == 3) > + { > +#ifdef CONFIG_64BIT > + return __emulate_cmpxchg_case1_2((uint64_t *)ptr, new, > + readq, __cmpxchg_case_8, 0xffffU); What if ((unsigned long)ptr & 7) == 7 (which is a sub-case of what the if() above checks for? Isn't it more reasonable to require aligned 16-bit quantities here? 
Or if mis-aligned addresses are okay, you could as well emulate using __cmpxchg_case_4(). Also you shouldn't be casting away volatile (here and below). Avoiding the casts (by suitably using volatile void * parameter types) would likely be best. Jan
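For reference, one way to read the type-matching remark at the top of this review is to have the helper macro declare its local copy of new with the pointee's type, so a mismatched argument at a use site is either converted or diagnosed by the compiler. A minimal sketch of that one change (illustration only, not the submitted patch; the size switch is elided):

    #define __xchg_relaxed(ptr, new, size)                                    \
    ({                                                                        \
        __typeof__(ptr) ptr__ = (ptr);                                        \
        __typeof__(*(ptr)) new__ = (new); /* pointee type, not typeof(new) */ \
        __typeof__(*(ptr)) ret__;                                             \
        /* switch ( size ) { ... } exactly as in the patch */                 \
        ret__;                                                                \
    })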
On Mon, 2024-01-22 at 17:27 +0100, Jan Beulich wrote: > On 22.12.2023 16:12, Oleksii Kurochko wrote: > > --- /dev/null > > +++ b/xen/arch/riscv/include/asm/cmpxchg.h > > @@ -0,0 +1,496 @@ > > +/* SPDX-License-Identifier: GPL-2.0-only */ > > +/* Copyright (C) 2014 Regents of the University of California */ > > + > > +#ifndef _ASM_RISCV_CMPXCHG_H > > +#define _ASM_RISCV_CMPXCHG_H > > + > > +#include <xen/compiler.h> > > +#include <xen/lib.h> > > + > > +#include <asm/fence.h> > > +#include <asm/io.h> > > +#include <asm/system.h> > > + > > +#define __xchg_relaxed(ptr, new, size) \ > > +({ \ > > + __typeof__(ptr) ptr__ = (ptr); \ > > + __typeof__(new) new__ = (new); \ > > + __typeof__(*(ptr)) ret__; \ > > I expect the types of new and *ptr want to actually match. Which > you then want to enforce, so that issues at use sites would either > be reported by the compiler, or be permitted by a type conversion > of new. I am OK to make the same type for new and ret__, but it looks like __xchg_relaxed() is used only inside xchg_relaxed() where 'new' is declared with the same type as *ptr. > > > + switch (size) \ > > + { \ > > Nit: Hard tab left here. (Also again you want to either stick to > Linux style or fully switch to Xen style.) Thanks. I'll update that. > > > + case 4: \ > > + asm volatile( \ > > + " amoswap.w %0, %2, %1\n" \ > > I don't think a leading tab (or leading whitespace in general) is > needed in single-line-output asm()s. The trailing \n also isn't > needed if I'm not mistaken. I just wanted to align it with "=r", but for sure a leading tab or whitespace can be dropped. > > > + : "=r" (ret__), "+A" (*ptr__) \ > > + : "r" (new__) \ > > + : "memory" ); \ > > + break; \ > > + case 8: \ > > + asm volatile( \ > > + " amoswap.d %0, %2, %1\n" \ > > + : "=r" (ret__), "+A" (*ptr__) \ > > + : "r" (new__) \ > > + : "memory" ); \ > > + break; \ > > + default: \ > > + ASSERT_UNREACHABLE(); \ > > If at all possible this wants to trigger a build failure, not a > runtime > one. I'll update that with BUILD_BUG_ON(size > 8); > > > + } \ > > + ret__; \ > > +}) > > + > > +#define xchg_relaxed(ptr, x) \ > > +({ \ > > + __typeof__(*(ptr)) x_ = (x); \ > > + (__typeof__(*(ptr))) __xchg_relaxed((ptr), x_, > > sizeof(*(ptr))); \ > > Nit: Stray blank after cast. For readability I'd also suggest to > drop parentheses in cases like the first argument passed to > __xchg_relaxed() here. Thanks. I'll take that into account. > > > +}) > > For both: What does "relaxed" describe? I'm asking because it's not > really clear whether the memory clobbers are actually needed. > > > +#define __xchg_acquire(ptr, new, size) \ > > +({ \ > > + __typeof__(ptr) ptr__ = (ptr); \ > > + __typeof__(new) new__ = (new); \ > > + __typeof__(*(ptr)) ret__; \ > > + switch (size) \ > > + { \ > > + case 4: \ > > + asm volatile( \ > > + " amoswap.w %0, %2, %1\n" \ > > + RISCV_ACQUIRE_BARRIER \ > > + : "=r" (ret__), "+A" (*ptr__) \ > > + : "r" (new__) \ > > + : "memory" ); \ > > + break; \ > > + case 8: \ > > + asm volatile( \ > > + " amoswap.d %0, %2, %1\n" \ > > + RISCV_ACQUIRE_BARRIER \ > > + : "=r" (ret__), "+A" (*ptr__) \ > > + : "r" (new__) \ > > + : "memory" ); \ > > + break; \ > > + default: \ > > + ASSERT_UNREACHABLE(); \ > > + } \ > > + ret__; \ > > +}) > > If I'm not mistaken this differs from __xchg_relaxed() only in the > use > of RISCV_ACQUIRE_BARRIER, and ... 
> > > +#define xchg_acquire(ptr, x) \ > > +({ \ > > + __typeof__(*(ptr)) x_ = (x); \ > > + (__typeof__(*(ptr))) __xchg_acquire((ptr), x_, > > sizeof(*(ptr))); \ > > +}) > > + > > +#define __xchg_release(ptr, new, size) \ > > +({ \ > > + __typeof__(ptr) ptr__ = (ptr); \ > > + __typeof__(new) new__ = (new); \ > > + __typeof__(*(ptr)) ret__; \ > > + switch (size) \ > > + { \ > > + case 4: \ > > + asm volatile ( \ > > + RISCV_RELEASE_BARRIER \ > > + " amoswap.w %0, %2, %1\n" \ > > + : "=r" (ret__), "+A" (*ptr__) \ > > + : "r" (new__) \ > > + : "memory"); \ > > + break; \ > > + case 8: \ > > + asm volatile ( \ > > + RISCV_RELEASE_BARRIER \ > > + " amoswap.d %0, %2, %1\n" \ > > + : "=r" (ret__), "+A" (*ptr__) \ > > + : "r" (new__) \ > > + : "memory"); \ > > + break; \ > > + default: \ > > + ASSERT_UNREACHABLE(); \ > > + } \ > > + ret__; \ > > +}) > > this only in the use of RISCV_RELEASE_BARRIER. If so they likely want > folding, to limit redundancy and make eventual updating easier. (Same > for the cmpxchg helper further down, as it seems.) Yes, you are right. The difference is only in RISCV_RELEASE_BARRIER and it is an absence of RISCV_RELEASE_BARRIER is a reason why we have "relaxed" versions. I am not sure that I understand what you mean by folding here. Do you mean that there is no any sense to have to separate macros and it is needed only one with RISCV_RELEASE_BARRIER? > > > +#define xchg_release(ptr, x) \ > > +({ \ > > + __typeof__(*(ptr)) x_ = (x); \ > > + (__typeof__(*(ptr))) __xchg_release((ptr), x_, > > sizeof(*(ptr))); \ > > +}) > > + > > +static always_inline uint32_t __xchg_case_4(volatile uint32_t > > *ptr, > > + uint32_t new) > > +{ > > + __typeof__(*(ptr)) ret; > > + > > + asm volatile ( > > + " amoswap.w.aqrl %0, %2, %1\n" > > + : "=r" (ret), "+A" (*ptr) > > + : "r" (new) > > + : "memory" ); > > + > > + return ret; > > +} > > + > > +static always_inline uint64_t __xchg_case_8(volatile uint64_t > > *ptr, > > + uint64_t new) > > +{ > > + __typeof__(*(ptr)) ret; > > + > > + asm volatile( \ > > + " amoswap.d.aqrl %0, %2, %1\n" \ > > + : "=r" (ret), "+A" (*ptr) \ > > + : "r" (new) \ > > + : "memory" ); \ > > + > > + return ret; > > +} > > + > > +static always_inline unsigned short __cmpxchg_case_2(volatile > > uint32_t *ptr, > > + uint32_t old, > > + uint32_t > > new); > > Don't you consistently mean uint16_t here (incl the return type) and > ... > > > +static always_inline unsigned short __cmpxchg_case_1(volatile > > uint32_t *ptr, > > + uint32_t old, > > + uint32_t > > new); > > ... uint8_t here? The idea was that we emulate __cmpxchg_case_1 and __cmpxchg_case_2 using 4 bytes cmpxchg and __cmpxchg_case_4 expects uint32_t. > > > +static inline unsigned long __xchg(volatile void *ptr, unsigned > > long x, int size) > > +{ > > + switch (size) { > > + case 1: > > + return __cmpxchg_case_1(ptr, (uint32_t)-1, x); > > + case 2: > > + return __cmpxchg_case_2(ptr, (uint32_t)-1, x); > > How are these going to work? You'll compare against ~0, and if the > value > in memory isn't ~0, memory won't be updated; you will only > (correctly) > return the value found in memory. > > Or wait - looking at __cmpxchg_case_{1,2}() far further down, you > ignore > "old" there. Which apparently means they'll work for the use here, > but > not for the use in __cmpxchg(). 
Yes, the trick is that old is ignored and is read in __emulate_cmpxchg_case1_2() before __cmpxchg_case_4 is called: do { read_val = read_func(aligned_ptr); swapped_new = read_val & ~mask; swapped_new |= masked_new; ret = cmpxchg_func(aligned_ptr, read_val, swapped_new); } while ( ret != read_val ); read_val it is 'old'. But now I am not 100% sure that it is correct for __cmpxchg... > > > + case 4: > > + return __xchg_case_4(ptr, x); > > + case 8: > > + return __xchg_case_8(ptr, x); > > + default: > > + ASSERT_UNREACHABLE(); > > + } > > + > > + return -1; > > +} > > + > > +#define xchg(ptr,x) \ > > +({ \ > > + __typeof__(*(ptr)) ret__; \ > > + ret__ = (__typeof__(*(ptr))) \ > > + __xchg((ptr), (unsigned long)(x), sizeof(*(ptr))); \ > > + ret__; \ > > +}) > > + > > +#define xchg32(ptr, x) \ > > +({ \ > > + BUILD_BUG_ON(sizeof(*(ptr)) != 4); \ > > + xchg((ptr), (x)); \ > > +}) > > + > > +#define xchg64(ptr, x) \ > > +({ \ > > + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ > > + xchg((ptr), (x)); \ > > +}) > > What are these two (and their cmpxchg counterparts) needed for? It was copied from Linux, but it looks like they can be really dropped, I can't find where it is used, so I'll drop them. > > > +/* > > + * Atomic compare and exchange. Compare OLD with MEM, if > > identical, > > + * store NEW in MEM. Return the initial value in MEM. Success is > > + * indicated by comparing RETURN with OLD. > > + */ > > +#define __cmpxchg_relaxed(ptr, old, new, size) \ > > +({ \ > > + __typeof__(ptr) ptr__ = (ptr); \ > > + __typeof__(*(ptr)) __old = (old); \ > > Leftover leading underscores? > > > + __typeof__(*(ptr)) new__ = (new); \ > > Related to my earlier comment on types needing to be compatible - see > how here you're using "ptr" throughout. > > > + __typeof__(*(ptr)) ret__; \ > > + register unsigned int __rc; \ > > More leftover leading underscores? Missed to drop underscores. Thanks. I'll drop them in next version. > > > +static always_inline unsigned short __cmpxchg_case_2(volatile > > uint32_t *ptr, > > + uint32_t old, > > + uint32_t new) > > +{ > > + (void) old; > > + > > + if (((unsigned long)ptr & 3) == 3) > > + { > > +#ifdef CONFIG_64BIT > > + return __emulate_cmpxchg_case1_2((uint64_t *)ptr, new, > > + readq, __cmpxchg_case_8, > > 0xffffU); > > What if ((unsigned long)ptr & 7) == 7 (which is a sub-case of what > the > if() above checks for? Isn't it more reasonable to require aligned > 16-bit quantities here? Or if mis-aligned addresses are okay, you > could > as well emulate using __cmpxchg_case_4(). Yes, it will be more reasonable. I'll use IS_ALIGNED instead. > > Also you shouldn't be casting away volatile (here and below). > Avoiding > the casts (by suitable using volatile void * parameter types) would > likely be best. Thanks. I'll update the function prototype. ~ Oleksii
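As a side note on the "old is ignored" point: a sub-word cmpxchg emulated on top of the word-sized primitive has to feed the caller's old value into the comparison, otherwise it degenerates into an unconditional store of new. A rough sketch of the 1-byte case, reusing the patch's __cmpxchg_case_4() but with a hypothetical helper name (illustration only):

    static always_inline uint8_t emulate_cmpxchg_1(volatile uint8_t *ptr,
                                                   uint8_t old, uint8_t new)
    {
        volatile uint32_t *aligned =
            (volatile uint32_t *)((unsigned long)ptr & ~3UL);
        unsigned int shift = ((unsigned long)ptr & 3) * 8;
        uint32_t mask = 0xffU << shift;
        uint32_t old_word, new_word;

        do {
            old_word = *aligned;

            /* Byte in memory differs from "old": no store, just report it. */
            if ( (uint8_t)((old_word & mask) >> shift) != old )
                return (old_word & mask) >> shift;

            new_word = (old_word & ~mask) | ((uint32_t)new << shift);
            /* Retry if the containing word changed under our feet. */
        } while ( __cmpxchg_case_4(aligned, old_word, new_word) != old_word );

        return old; /* the byte that was in memory when the store succeeded */
    }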
On 23.01.2024 11:15, Oleksii wrote: > On Mon, 2024-01-22 at 17:27 +0100, Jan Beulich wrote: >> On 22.12.2023 16:12, Oleksii Kurochko wrote: >>> + : "=r" (ret__), "+A" (*ptr__) \ >>> + : "r" (new__) \ >>> + : "memory" ); \ >>> + break; \ >>> + case 8: \ >>> + asm volatile( \ >>> + " amoswap.d %0, %2, %1\n" \ >>> + : "=r" (ret__), "+A" (*ptr__) \ >>> + : "r" (new__) \ >>> + : "memory" ); \ >>> + break; \ >>> + default: \ >>> + ASSERT_UNREACHABLE(); \ >> >> If at all possible this wants to trigger a build failure, not a >> runtime >> one. > I'll update that with BUILD_BUG_ON(size > 8); What about size accidentally being e.g. 5? I'm also not sure you'll be able to use BUILD_BUG_ON() here: That'll depend on what use sites there are. And if all present ones are okay in this regard, you'd still set out a trap for someone else to fall into later. We have a common approach for this, which currently is under re-work. See https://lists.xen.org/archives/html/xen-devel/2024-01/msg01115.html. >>> + } \ >>> + ret__; \ >>> +}) >>> + >>> +#define xchg_relaxed(ptr, x) \ >>> +({ \ >>> + __typeof__(*(ptr)) x_ = (x); \ >>> + (__typeof__(*(ptr))) __xchg_relaxed((ptr), x_, >>> sizeof(*(ptr))); \ >> >> Nit: Stray blank after cast. For readability I'd also suggest to >> drop parentheses in cases like the first argument passed to >> __xchg_relaxed() here. > Thanks. I'll take that into account. > >> >>> +}) >> >> For both: What does "relaxed" describe? I'm asking because it's not >> really clear whether the memory clobbers are actually needed. >> >>> +#define __xchg_acquire(ptr, new, size) \ >>> +({ \ >>> + __typeof__(ptr) ptr__ = (ptr); \ >>> + __typeof__(new) new__ = (new); \ >>> + __typeof__(*(ptr)) ret__; \ >>> + switch (size) \ >>> + { \ >>> + case 4: \ >>> + asm volatile( \ >>> + " amoswap.w %0, %2, %1\n" \ >>> + RISCV_ACQUIRE_BARRIER \ >>> + : "=r" (ret__), "+A" (*ptr__) \ >>> + : "r" (new__) \ >>> + : "memory" ); \ >>> + break; \ >>> + case 8: \ >>> + asm volatile( \ >>> + " amoswap.d %0, %2, %1\n" \ >>> + RISCV_ACQUIRE_BARRIER \ >>> + : "=r" (ret__), "+A" (*ptr__) \ >>> + : "r" (new__) \ >>> + : "memory" ); \ >>> + break; \ >>> + default: \ >>> + ASSERT_UNREACHABLE(); \ >>> + } \ >>> + ret__; \ >>> +}) >> >> If I'm not mistaken this differs from __xchg_relaxed() only in the >> use >> of RISCV_ACQUIRE_BARRIER, and ... >> >>> +#define xchg_acquire(ptr, x) \ >>> +({ \ >>> + __typeof__(*(ptr)) x_ = (x); \ >>> + (__typeof__(*(ptr))) __xchg_acquire((ptr), x_, >>> sizeof(*(ptr))); \ >>> +}) >>> + >>> +#define __xchg_release(ptr, new, size) \ >>> +({ \ >>> + __typeof__(ptr) ptr__ = (ptr); \ >>> + __typeof__(new) new__ = (new); \ >>> + __typeof__(*(ptr)) ret__; \ >>> + switch (size) \ >>> + { \ >>> + case 4: \ >>> + asm volatile ( \ >>> + RISCV_RELEASE_BARRIER \ >>> + " amoswap.w %0, %2, %1\n" \ >>> + : "=r" (ret__), "+A" (*ptr__) \ >>> + : "r" (new__) \ >>> + : "memory"); \ >>> + break; \ >>> + case 8: \ >>> + asm volatile ( \ >>> + RISCV_RELEASE_BARRIER \ >>> + " amoswap.d %0, %2, %1\n" \ >>> + : "=r" (ret__), "+A" (*ptr__) \ >>> + : "r" (new__) \ >>> + : "memory"); \ >>> + break; \ >>> + default: \ >>> + ASSERT_UNREACHABLE(); \ >>> + } \ >>> + ret__; \ >>> +}) >> >> this only in the use of RISCV_RELEASE_BARRIER. If so they likely want >> folding, to limit redundancy and make eventual updating easier. (Same >> for the cmpxchg helper further down, as it seems.) > Yes, you are right. 
The difference is only in RISCV_RELEASE_BARRIER and > it is an absence of RISCV_RELEASE_BARRIER is a reason why we have > "relaxed" versions. > > I am not sure that I understand what you mean by folding here. Do you > mean that there is no any sense to have to separate macros and it is > needed only one with RISCV_RELEASE_BARRIER? No. You should parameterize the folded common macro for the derived macros to simply pass in the barriers needed, with empty macro arguments indicating "this barrier not needed". >>> +#define xchg_release(ptr, x) \ >>> +({ \ >>> + __typeof__(*(ptr)) x_ = (x); \ >>> + (__typeof__(*(ptr))) __xchg_release((ptr), x_, >>> sizeof(*(ptr))); \ >>> +}) >>> + >>> +static always_inline uint32_t __xchg_case_4(volatile uint32_t >>> *ptr, >>> + uint32_t new) >>> +{ >>> + __typeof__(*(ptr)) ret; >>> + >>> + asm volatile ( >>> + " amoswap.w.aqrl %0, %2, %1\n" >>> + : "=r" (ret), "+A" (*ptr) >>> + : "r" (new) >>> + : "memory" ); >>> + >>> + return ret; >>> +} >>> + >>> +static always_inline uint64_t __xchg_case_8(volatile uint64_t >>> *ptr, >>> + uint64_t new) >>> +{ >>> + __typeof__(*(ptr)) ret; >>> + >>> + asm volatile( \ >>> + " amoswap.d.aqrl %0, %2, %1\n" \ >>> + : "=r" (ret), "+A" (*ptr) \ >>> + : "r" (new) \ >>> + : "memory" ); \ >>> + >>> + return ret; >>> +} >>> + >>> +static always_inline unsigned short __cmpxchg_case_2(volatile >>> uint32_t *ptr, >>> + uint32_t old, >>> + uint32_t >>> new); >> >> Don't you consistently mean uint16_t here (incl the return type) and >> ... >> >>> +static always_inline unsigned short __cmpxchg_case_1(volatile >>> uint32_t *ptr, >>> + uint32_t old, >>> + uint32_t >>> new); >> >> ... uint8_t here? > The idea was that we emulate __cmpxchg_case_1 and __cmpxchg_case_2 > using 4 bytes cmpxchg and __cmpxchg_case_4 expects uint32_t. I consider this wrong. The functions would better be type-correct. >>> +static inline unsigned long __xchg(volatile void *ptr, unsigned >>> long x, int size) >>> +{ >>> + switch (size) { >>> + case 1: >>> + return __cmpxchg_case_1(ptr, (uint32_t)-1, x); >>> + case 2: >>> + return __cmpxchg_case_2(ptr, (uint32_t)-1, x); >> >> How are these going to work? You'll compare against ~0, and if the >> value >> in memory isn't ~0, memory won't be updated; you will only >> (correctly) >> return the value found in memory. >> >> Or wait - looking at __cmpxchg_case_{1,2}() far further down, you >> ignore >> "old" there. Which apparently means they'll work for the use here, >> but >> not for the use in __cmpxchg(). > Yes, the trick is that old is ignored and is read in > __emulate_cmpxchg_case1_2() before __cmpxchg_case_4 is called: > do { > read_val = read_func(aligned_ptr); > swapped_new = read_val & ~mask; > swapped_new |= masked_new; > ret = cmpxchg_func(aligned_ptr, read_val, swapped_new); > } while ( ret != read_val ); > read_val it is 'old'. > > But now I am not 100% sure that it is correct for __cmpxchg... It just can't be correct - you can't ignore "old" there. I think you want simple cmpxchg primitives, which xchg then uses in a loop (while cmpxchg uses them plainly). >>> +static always_inline unsigned short __cmpxchg_case_2(volatile >>> uint32_t *ptr, >>> + uint32_t old, >>> + uint32_t new) >>> +{ >>> + (void) old; >>> + >>> + if (((unsigned long)ptr & 3) == 3) >>> + { >>> +#ifdef CONFIG_64BIT >>> + return __emulate_cmpxchg_case1_2((uint64_t *)ptr, new, >>> + readq, __cmpxchg_case_8, >>> 0xffffU); >> >> What if ((unsigned long)ptr & 7) == 7 (which is a sub-case of what >> the >> if() above checks for? 
Isn't it more reasonable to require aligned >> 16-bit quantities here? Or if mis-aligned addresses are okay, you >> could >> as well emulate using __cmpxchg_case_4(). > Yes, it will be more reasonable. I'll use IS_ALIGNED instead. Not sure I get your use of "instead" here correctly. There's more to change here than just the if() condition. Jan
On Tue, 2024-01-23 at 11:28 +0100, Jan Beulich wrote: > On 23.01.2024 11:15, Oleksii wrote: > > On Mon, 2024-01-22 at 17:27 +0100, Jan Beulich wrote: > > > On 22.12.2023 16:12, Oleksii Kurochko wrote: > > > > + : "=r" (ret__), "+A" (*ptr__) \ > > > > + : "r" (new__) \ > > > > + : "memory" ); \ > > > > + break; \ > > > > + case 8: \ > > > > + asm volatile( \ > > > > + " amoswap.d %0, %2, %1\n" \ > > > > + : "=r" (ret__), "+A" (*ptr__) \ > > > > + : "r" (new__) \ > > > > + : "memory" ); \ > > > > + break; \ > > > > + default: \ > > > > + ASSERT_UNREACHABLE(); \ > > > > > > If at all possible this wants to trigger a build failure, not a > > > runtime > > > one. > > I'll update that with BUILD_BUG_ON(size > 8); > > What about size accidentally being e.g. 5? I'm also not sure you'll > be > able to use BUILD_BUG_ON() here: That'll depend on what use sites > there > are. And if all present ones are okay in this regard, you'd still set > out a trap for someone else to fall into later. We have a common > approach for this, which currently is under re-work. See > https://lists.xen.org/archives/html/xen-devel/2024-01/msg01115.html. Thanks. I'll use that. > > > > > + } \ > > > > + ret__; \ > > > > +}) > > > > + > > > > +#define xchg_relaxed(ptr, x) \ > > > > +({ \ > > > > + __typeof__(*(ptr)) x_ = (x); \ > > > > + (__typeof__(*(ptr))) __xchg_relaxed((ptr), x_, > > > > sizeof(*(ptr))); \ > > > > > > Nit: Stray blank after cast. For readability I'd also suggest to > > > drop parentheses in cases like the first argument passed to > > > __xchg_relaxed() here. > > Thanks. I'll take that into account. > > > > > > > > > +}) > > > > > > For both: What does "relaxed" describe? I'm asking because it's > > > not > > > really clear whether the memory clobbers are actually needed. > > > > > > > +#define __xchg_acquire(ptr, new, size) \ > > > > +({ \ > > > > + __typeof__(ptr) ptr__ = (ptr); \ > > > > + __typeof__(new) new__ = (new); \ > > > > + __typeof__(*(ptr)) ret__; \ > > > > + switch (size) \ > > > > + { \ > > > > + case 4: \ > > > > + asm volatile( \ > > > > + " amoswap.w %0, %2, %1\n" \ > > > > + RISCV_ACQUIRE_BARRIER \ > > > > + : "=r" (ret__), "+A" (*ptr__) \ > > > > + : "r" (new__) \ > > > > + : "memory" ); \ > > > > + break; \ > > > > + case 8: \ > > > > + asm volatile( \ > > > > + " amoswap.d %0, %2, %1\n" \ > > > > + RISCV_ACQUIRE_BARRIER \ > > > > + : "=r" (ret__), "+A" (*ptr__) \ > > > > + : "r" (new__) \ > > > > + : "memory" ); \ > > > > + break; \ > > > > + default: \ > > > > + ASSERT_UNREACHABLE(); \ > > > > + } \ > > > > + ret__; \ > > > > +}) > > > > > > If I'm not mistaken this differs from __xchg_relaxed() only in > > > the > > > use > > > of RISCV_ACQUIRE_BARRIER, and ... 
> > > > > > > +#define xchg_acquire(ptr, x) \ > > > > +({ \ > > > > + __typeof__(*(ptr)) x_ = (x); \ > > > > + (__typeof__(*(ptr))) __xchg_acquire((ptr), x_, > > > > sizeof(*(ptr))); \ > > > > +}) > > > > + > > > > +#define __xchg_release(ptr, new, size) \ > > > > +({ \ > > > > + __typeof__(ptr) ptr__ = (ptr); \ > > > > + __typeof__(new) new__ = (new); \ > > > > + __typeof__(*(ptr)) ret__; \ > > > > + switch (size) \ > > > > + { \ > > > > + case 4: \ > > > > + asm volatile ( \ > > > > + RISCV_RELEASE_BARRIER \ > > > > + " amoswap.w %0, %2, %1\n" \ > > > > + : "=r" (ret__), "+A" (*ptr__) \ > > > > + : "r" (new__) \ > > > > + : "memory"); \ > > > > + break; \ > > > > + case 8: \ > > > > + asm volatile ( \ > > > > + RISCV_RELEASE_BARRIER \ > > > > + " amoswap.d %0, %2, %1\n" \ > > > > + : "=r" (ret__), "+A" (*ptr__) \ > > > > + : "r" (new__) \ > > > > + : "memory"); \ > > > > + break; \ > > > > + default: \ > > > > + ASSERT_UNREACHABLE(); \ > > > > + } \ > > > > + ret__; \ > > > > +}) > > > > > > this only in the use of RISCV_RELEASE_BARRIER. If so they likely > > > want > > > folding, to limit redundancy and make eventual updating easier. > > > (Same > > > for the cmpxchg helper further down, as it seems.) > > Yes, you are right. The difference is only in RISCV_RELEASE_BARRIER > > and > > it is an absence of RISCV_RELEASE_BARRIER is a reason why we have > > "relaxed" versions. > > > > I am not sure that I understand what you mean by folding here. Do > > you > > mean that there is no any sense to have to separate macros and it > > is > > needed only one with RISCV_RELEASE_BARRIER? > > No. You should parameterize the folded common macro for the derived > macros to simply pass in the barriers needed, with empty macro > arguments indicating "this barrier not needed". Now it makes sense to me. Thank you for explanation. > > > > > +#define xchg_release(ptr, x) \ > > > > +({ \ > > > > + __typeof__(*(ptr)) x_ = (x); \ > > > > + (__typeof__(*(ptr))) __xchg_release((ptr), x_, > > > > sizeof(*(ptr))); \ > > > > +}) > > > > + > > > > +static always_inline uint32_t __xchg_case_4(volatile uint32_t > > > > *ptr, > > > > + uint32_t new) > > > > +{ > > > > + __typeof__(*(ptr)) ret; > > > > + > > > > + asm volatile ( > > > > + " amoswap.w.aqrl %0, %2, %1\n" > > > > + : "=r" (ret), "+A" (*ptr) > > > > + : "r" (new) > > > > + : "memory" ); > > > > + > > > > + return ret; > > > > +} > > > > + > > > > +static always_inline uint64_t __xchg_case_8(volatile uint64_t > > > > *ptr, > > > > + uint64_t new) > > > > +{ > > > > + __typeof__(*(ptr)) ret; > > > > + > > > > + asm volatile( \ > > > > + " amoswap.d.aqrl %0, %2, %1\n" \ > > > > + : "=r" (ret), "+A" (*ptr) \ > > > > + : "r" (new) \ > > > > + : "memory" ); \ > > > > + > > > > + return ret; > > > > +} > > > > + > > > > +static always_inline unsigned short __cmpxchg_case_2(volatile > > > > uint32_t *ptr, > > > > + uint32_t > > > > old, > > > > + uint32_t > > > > new); > > > > > > Don't you consistently mean uint16_t here (incl the return type) > > > and > > > ... > > > > > > > +static always_inline unsigned short __cmpxchg_case_1(volatile > > > > uint32_t *ptr, > > > > + uint32_t > > > > old, > > > > + uint32_t > > > > new); > > > > > > ... uint8_t here? > > The idea was that we emulate __cmpxchg_case_1 and __cmpxchg_case_2 > > using 4 bytes cmpxchg and __cmpxchg_case_4 expects uint32_t. > > I consider this wrong. The functions would better be type-correct. 
> > > > > +static inline unsigned long __xchg(volatile void *ptr, > > > > unsigned > > > > long x, int size) > > > > +{ > > > > + switch (size) { > > > > + case 1: > > > > + return __cmpxchg_case_1(ptr, (uint32_t)-1, x); > > > > + case 2: > > > > + return __cmpxchg_case_2(ptr, (uint32_t)-1, x); > > > > > > How are these going to work? You'll compare against ~0, and if > > > the > > > value > > > in memory isn't ~0, memory won't be updated; you will only > > > (correctly) > > > return the value found in memory. > > > > > > Or wait - looking at __cmpxchg_case_{1,2}() far further down, you > > > ignore > > > "old" there. Which apparently means they'll work for the use > > > here, > > > but > > > not for the use in __cmpxchg(). > > Yes, the trick is that old is ignored and is read in > > __emulate_cmpxchg_case1_2() before __cmpxchg_case_4 is called: > > do > > { > > read_val = > > read_func(aligned_ptr); > > swapped_new = read_val & > > ~mask; > > swapped_new |= > > masked_new; > > ret = cmpxchg_func(aligned_ptr, read_val, > > swapped_new); > > } while ( ret != read_val > > ); > > read_val it is 'old'. > > > > But now I am not 100% sure that it is correct for __cmpxchg... > > It just can't be correct - you can't ignore "old" there. I think you > want simple cmpxchg primitives, which xchg then uses in a loop (while > cmpxchg uses them plainly). But xchg doesn't require 'old' value, so it should be ignored in some way by cmpxchg. > > > > > +static always_inline unsigned short __cmpxchg_case_2(volatile > > > > uint32_t *ptr, > > > > + uint32_t > > > > old, > > > > + uint32_t > > > > new) > > > > +{ > > > > + (void) old; > > > > + > > > > + if (((unsigned long)ptr & 3) == 3) > > > > + { > > > > +#ifdef CONFIG_64BIT > > > > + return __emulate_cmpxchg_case1_2((uint64_t *)ptr, new, > > > > + readq, > > > > __cmpxchg_case_8, > > > > 0xffffU); > > > > > > What if ((unsigned long)ptr & 7) == 7 (which is a sub-case of > > > what > > > the > > > if() above checks for? Isn't it more reasonable to require > > > aligned > > > 16-bit quantities here? Or if mis-aligned addresses are okay, you > > > could > > > as well emulate using __cmpxchg_case_4(). > > Yes, it will be more reasonable. I'll use IS_ALIGNED instead. > > Not sure I get your use of "instead" here correctly. There's more > to change here than just the if() condition. I meant something like: if ( IS_ALIGNED(ptr, 16) ) __emulate_cmpxchg_case1_2(...); else assert_failed("ptr isn't aligned\n"); I have to check if it is necessary to have an mis-aligned address for CAS intstuctions. If mis-aligned address is okay then it looks like we can use always __cmpxchng_case_4 without checking if a ptr is ALIGNED or not. Or I started to loose something... ~ Oleksii
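As background on the "build failure instead of a runtime assertion" point raised earlier in this message: one long-standing pattern (Linux uses a variant of it in similar size switches) is to have the default case call a function that is declared but intentionally never defined, so an unhandled size fails at link time while valid constant sizes have the call optimised away. A sketch with a hypothetical name, shown only to illustrate the idea (the Xen series linked above provides its own common mechanism):

    /*
     * Hypothetical sketch: intentionally never defined anywhere, so any
     * path that reaches the default: case with an unsupported size fails
     * at link time rather than at run time.
     */
    extern unsigned long __bad_xchg_size(volatile void *ptr, int size);

    /* ... and in the size switch:
     *
     *     default:
     *         ret__ = __bad_xchg_size(ptr__, size);
     *         break;
     */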
On 23.01.2024 13:18, Oleksii wrote: > On Tue, 2024-01-23 at 11:28 +0100, Jan Beulich wrote: >> On 23.01.2024 11:15, Oleksii wrote: >>> On Mon, 2024-01-22 at 17:27 +0100, Jan Beulich wrote: >>>> On 22.12.2023 16:12, Oleksii Kurochko wrote: >>>>> +static inline unsigned long __xchg(volatile void *ptr, >>>>> unsigned >>>>> long x, int size) >>>>> +{ >>>>> + switch (size) { >>>>> + case 1: >>>>> + return __cmpxchg_case_1(ptr, (uint32_t)-1, x); >>>>> + case 2: >>>>> + return __cmpxchg_case_2(ptr, (uint32_t)-1, x); >>>> >>>> How are these going to work? You'll compare against ~0, and if >>>> the >>>> value >>>> in memory isn't ~0, memory won't be updated; you will only >>>> (correctly) >>>> return the value found in memory. >>>> >>>> Or wait - looking at __cmpxchg_case_{1,2}() far further down, you >>>> ignore >>>> "old" there. Which apparently means they'll work for the use >>>> here, >>>> but >>>> not for the use in __cmpxchg(). >>> Yes, the trick is that old is ignored and is read in >>> __emulate_cmpxchg_case1_2() before __cmpxchg_case_4 is called: >>> do >>> { >>> read_val = >>> read_func(aligned_ptr); >>> swapped_new = read_val & >>> ~mask; >>> swapped_new |= >>> masked_new; >>> ret = cmpxchg_func(aligned_ptr, read_val, >>> swapped_new); >>> } while ( ret != read_val >>> ); >>> read_val it is 'old'. >>> >>> But now I am not 100% sure that it is correct for __cmpxchg... >> >> It just can't be correct - you can't ignore "old" there. I think you >> want simple cmpxchg primitives, which xchg then uses in a loop (while >> cmpxchg uses them plainly). > But xchg doesn't require 'old' value, so it should be ignored in some > way by cmpxchg. Well, no. If you have only cmpxchg, I think your only choice is - as said - to read the old value and then loop over cmpxchg until that succeeds. Not really different from other operations which need emulating using cmpxchg. >>>>> +static always_inline unsigned short __cmpxchg_case_2(volatile >>>>> uint32_t *ptr, >>>>> + uint32_t >>>>> old, >>>>> + uint32_t >>>>> new) >>>>> +{ >>>>> + (void) old; >>>>> + >>>>> + if (((unsigned long)ptr & 3) == 3) >>>>> + { >>>>> +#ifdef CONFIG_64BIT >>>>> + return __emulate_cmpxchg_case1_2((uint64_t *)ptr, new, >>>>> + readq, >>>>> __cmpxchg_case_8, >>>>> 0xffffU); >>>> >>>> What if ((unsigned long)ptr & 7) == 7 (which is a sub-case of >>>> what >>>> the >>>> if() above checks for? Isn't it more reasonable to require >>>> aligned >>>> 16-bit quantities here? Or if mis-aligned addresses are okay, you >>>> could >>>> as well emulate using __cmpxchg_case_4(). >>> Yes, it will be more reasonable. I'll use IS_ALIGNED instead. >> >> Not sure I get your use of "instead" here correctly. There's more >> to change here than just the if() condition. > I meant something like: > > if ( IS_ALIGNED(ptr, 16) ) > __emulate_cmpxchg_case1_2(...); > else > assert_failed("ptr isn't aligned\n"); Except that you'd better not use assert_failed() directly anywhere, and the above is easier as ASSERT(IS_ALIGNED(ptr, 16)); __emulate_cmpxchg_case1_2(...); anyway (leaving aside that I guess you mean 2, not 16). Jan
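Put as code, the suggestion amounts to something like the sketch below (hypothetical helper name; it assumes a type-correct, old-honouring 1-byte cmpxchg primitive along the lines discussed above, which the patch does not yet provide):

    /*
     * Hypothetical sketch: a 1-byte xchg built from a plain 1-byte cmpxchg.
     * Read the current value, then retry the cmpxchg with that value as the
     * expected "old" until it succeeds; the last value read is what xchg
     * returns.
     */
    static always_inline uint8_t xchg_via_cmpxchg_1(volatile uint8_t *ptr,
                                                    uint8_t new)
    {
        uint8_t old;

        do {
            old = *ptr;                     /* snapshot the current value */
        } while ( __cmpxchg_case_1(ptr, old, new) != old );

        return old;
    }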
On Tue, 2024-01-23 at 14:27 +0100, Jan Beulich wrote: > On 23.01.2024 13:18, Oleksii wrote: > > On Tue, 2024-01-23 at 11:28 +0100, Jan Beulich wrote: > > > On 23.01.2024 11:15, Oleksii wrote: > > > > On Mon, 2024-01-22 at 17:27 +0100, Jan Beulich wrote: > > > > > On 22.12.2023 16:12, Oleksii Kurochko wrote: > > > > > > +static inline unsigned long __xchg(volatile void *ptr, > > > > > > unsigned > > > > > > long x, int size) > > > > > > +{ > > > > > > + switch (size) { > > > > > > + case 1: > > > > > > + return __cmpxchg_case_1(ptr, (uint32_t)-1, x); > > > > > > + case 2: > > > > > > + return __cmpxchg_case_2(ptr, (uint32_t)-1, x); > > > > > > > > > > How are these going to work? You'll compare against ~0, and > > > > > if > > > > > the > > > > > value > > > > > in memory isn't ~0, memory won't be updated; you will only > > > > > (correctly) > > > > > return the value found in memory. > > > > > > > > > > Or wait - looking at __cmpxchg_case_{1,2}() far further down, > > > > > you > > > > > ignore > > > > > "old" there. Which apparently means they'll work for the use > > > > > here, > > > > > but > > > > > not for the use in __cmpxchg(). > > > > Yes, the trick is that old is ignored and is read in > > > > __emulate_cmpxchg_case1_2() before __cmpxchg_case_4 is called: > > > > do > > > > { > > > > read_val = > > > > read_func(aligned_ptr); > > > > swapped_new = read_val & > > > > ~mask; > > > > swapped_new |= > > > > masked_new; > > > > ret = cmpxchg_func(aligned_ptr, read_val, > > > > swapped_new); > > > > } while ( ret != read_val > > > > ); > > > > read_val it is 'old'. > > > > > > > > But now I am not 100% sure that it is correct for __cmpxchg... > > > > > > It just can't be correct - you can't ignore "old" there. I think > > > you > > > want simple cmpxchg primitives, which xchg then uses in a loop > > > (while > > > cmpxchg uses them plainly). > > But xchg doesn't require 'old' value, so it should be ignored in > > some > > way by cmpxchg. > > Well, no. If you have only cmpxchg, I think your only choice is - as > said - to read the old value and then loop over cmpxchg until that > succeeds. Not really different from other operations which need > emulating using cmpxchg. Then it looks like the main error in __emulate_cmpxchg_case1_2 is that I read the value each time, so read_val = read_func(aligned_ptr); should be before the do {...} while(). Also, it would be better to rename it to old_val or just old. > > > > > > > +static always_inline unsigned short > > > > > > __cmpxchg_case_2(volatile > > > > > > uint32_t *ptr, > > > > > > + > > > > > > uint32_t > > > > > > old, > > > > > > + > > > > > > uint32_t > > > > > > new) > > > > > > +{ > > > > > > + (void) old; > > > > > > + > > > > > > + if (((unsigned long)ptr & 3) == 3) > > > > > > + { > > > > > > +#ifdef CONFIG_64BIT > > > > > > + return __emulate_cmpxchg_case1_2((uint64_t *)ptr, > > > > > > new, > > > > > > + readq, > > > > > > __cmpxchg_case_8, > > > > > > 0xffffU); > > > > > > > > > > What if ((unsigned long)ptr & 7) == 7 (which is a sub-case of > > > > > what > > > > > the > > > > > if() above checks for? Isn't it more reasonable to require > > > > > aligned > > > > > 16-bit quantities here? Or if mis-aligned addresses are okay, > > > > > you > > > > > could > > > > > as well emulate using __cmpxchg_case_4(). > > > > Yes, it will be more reasonable. I'll use IS_ALIGNED instead. > > > > > > Not sure I get your use of "instead" here correctly. There's more > > > to change here than just the if() condition. 
> > I meant something like:
> >
> > if ( IS_ALIGNED(ptr, 16) )
> >     __emulate_cmpxchg_case1_2(...);
> > else
> >     assert_failed("ptr isn't aligned\n");
>
> Except that you'd better not use assert_failed() directly anywhere,
> and the above is easier as
>
>     ASSERT(IS_ALIGNED(ptr, 16));
>     __emulate_cmpxchg_case1_2(...);
>
> anyway (leaving aside that I guess you mean 2, not 16).
Yeah, it should be 2. Thanks.

~ Oleksii
On Mon, 2024-01-22 at 17:27 +0100, Jan Beulich wrote: > > +#define __xchg_acquire(ptr, new, size) \ > > +({ \ > > + __typeof__(ptr) ptr__ = (ptr); \ > > + __typeof__(new) new__ = (new); \ > > + __typeof__(*(ptr)) ret__; \ > > + switch (size) \ > > + { \ > > + case 4: \ > > + asm volatile( \ > > + " amoswap.w %0, %2, %1\n" \ > > + RISCV_ACQUIRE_BARRIER \ > > + : "=r" (ret__), "+A" (*ptr__) \ > > + : "r" (new__) \ > > + : "memory" ); \ > > + break; \ > > + case 8: \ > > + asm volatile( \ > > + " amoswap.d %0, %2, %1\n" \ > > + RISCV_ACQUIRE_BARRIER \ > > + : "=r" (ret__), "+A" (*ptr__) \ > > + : "r" (new__) \ > > + : "memory" ); \ > > + break; \ > > + default: \ > > + ASSERT_UNREACHABLE(); \ > > + } \ > > + ret__; \ > > +}) > > If I'm not mistaken this differs from __xchg_relaxed() only in the > use > of RISCV_ACQUIRE_BARRIER, and ... > > > +#define xchg_acquire(ptr, x) \ > > +({ \ > > + __typeof__(*(ptr)) x_ = (x); \ > > + (__typeof__(*(ptr))) __xchg_acquire((ptr), x_, > > sizeof(*(ptr))); \ > > +}) > > + > > +#define __xchg_release(ptr, new, size) \ > > +({ \ > > + __typeof__(ptr) ptr__ = (ptr); \ > > + __typeof__(new) new__ = (new); \ > > + __typeof__(*(ptr)) ret__; \ > > + switch (size) \ > > + { \ > > + case 4: \ > > + asm volatile ( \ > > + RISCV_RELEASE_BARRIER \ > > + " amoswap.w %0, %2, %1\n" \ > > + : "=r" (ret__), "+A" (*ptr__) \ > > + : "r" (new__) \ > > + : "memory"); \ > > + break; \ > > + case 8: \ > > + asm volatile ( \ > > + RISCV_RELEASE_BARRIER \ > > + " amoswap.d %0, %2, %1\n" \ > > + : "=r" (ret__), "+A" (*ptr__) \ > > + : "r" (new__) \ > > + : "memory"); \ > > + break; \ > > + default: \ > > + ASSERT_UNREACHABLE(); \ > > + } \ > > + ret__; \ > > +}) > > this only in the use of RISCV_RELEASE_BARRIER. If so they likely want > folding, to limit redundancy and make eventual updating easier. (Same > for the cmpxchg helper further down, as it seems.) Also the difference is in where to place barrier before or after atomic instruction. I am not sure that we can easily folded this macros. ~ Oleksii
On 30.01.2024 15:57, Oleksii wrote: > On Mon, 2024-01-22 at 17:27 +0100, Jan Beulich wrote: >>> +#define __xchg_acquire(ptr, new, size) \ >>> +({ \ >>> + __typeof__(ptr) ptr__ = (ptr); \ >>> + __typeof__(new) new__ = (new); \ >>> + __typeof__(*(ptr)) ret__; \ >>> + switch (size) \ >>> + { \ >>> + case 4: \ >>> + asm volatile( \ >>> + " amoswap.w %0, %2, %1\n" \ >>> + RISCV_ACQUIRE_BARRIER \ >>> + : "=r" (ret__), "+A" (*ptr__) \ >>> + : "r" (new__) \ >>> + : "memory" ); \ >>> + break; \ >>> + case 8: \ >>> + asm volatile( \ >>> + " amoswap.d %0, %2, %1\n" \ >>> + RISCV_ACQUIRE_BARRIER \ >>> + : "=r" (ret__), "+A" (*ptr__) \ >>> + : "r" (new__) \ >>> + : "memory" ); \ >>> + break; \ >>> + default: \ >>> + ASSERT_UNREACHABLE(); \ >>> + } \ >>> + ret__; \ >>> +}) >> >> If I'm not mistaken this differs from __xchg_relaxed() only in the >> use >> of RISCV_ACQUIRE_BARRIER, and ... >> >>> +#define xchg_acquire(ptr, x) \ >>> +({ \ >>> + __typeof__(*(ptr)) x_ = (x); \ >>> + (__typeof__(*(ptr))) __xchg_acquire((ptr), x_, >>> sizeof(*(ptr))); \ >>> +}) >>> + >>> +#define __xchg_release(ptr, new, size) \ >>> +({ \ >>> + __typeof__(ptr) ptr__ = (ptr); \ >>> + __typeof__(new) new__ = (new); \ >>> + __typeof__(*(ptr)) ret__; \ >>> + switch (size) \ >>> + { \ >>> + case 4: \ >>> + asm volatile ( \ >>> + RISCV_RELEASE_BARRIER \ >>> + " amoswap.w %0, %2, %1\n" \ >>> + : "=r" (ret__), "+A" (*ptr__) \ >>> + : "r" (new__) \ >>> + : "memory"); \ >>> + break; \ >>> + case 8: \ >>> + asm volatile ( \ >>> + RISCV_RELEASE_BARRIER \ >>> + " amoswap.d %0, %2, %1\n" \ >>> + : "=r" (ret__), "+A" (*ptr__) \ >>> + : "r" (new__) \ >>> + : "memory"); \ >>> + break; \ >>> + default: \ >>> + ASSERT_UNREACHABLE(); \ >>> + } \ >>> + ret__; \ >>> +}) >> >> this only in the use of RISCV_RELEASE_BARRIER. If so they likely want >> folding, to limit redundancy and make eventual updating easier. (Same >> for the cmpxchg helper further down, as it seems.) > Also the difference is in where to place barrier before or after atomic > instruction. I am not sure that we can easily folded this macros. The folded macro would have two barrier parameters - on for acquire, one for release. Jan
On Tue, 2024-01-30 at 16:05 +0100, Jan Beulich wrote: > On 30.01.2024 15:57, Oleksii wrote: > > On Mon, 2024-01-22 at 17:27 +0100, Jan Beulich wrote: > > > > +#define __xchg_acquire(ptr, new, size) \ > > > > +({ \ > > > > + __typeof__(ptr) ptr__ = (ptr); \ > > > > + __typeof__(new) new__ = (new); \ > > > > + __typeof__(*(ptr)) ret__; \ > > > > + switch (size) \ > > > > + { \ > > > > + case 4: \ > > > > + asm volatile( \ > > > > + " amoswap.w %0, %2, %1\n" \ > > > > + RISCV_ACQUIRE_BARRIER \ > > > > + : "=r" (ret__), "+A" (*ptr__) \ > > > > + : "r" (new__) \ > > > > + : "memory" ); \ > > > > + break; \ > > > > + case 8: \ > > > > + asm volatile( \ > > > > + " amoswap.d %0, %2, %1\n" \ > > > > + RISCV_ACQUIRE_BARRIER \ > > > > + : "=r" (ret__), "+A" (*ptr__) \ > > > > + : "r" (new__) \ > > > > + : "memory" ); \ > > > > + break; \ > > > > + default: \ > > > > + ASSERT_UNREACHABLE(); \ > > > > + } \ > > > > + ret__; \ > > > > +}) > > > > > > If I'm not mistaken this differs from __xchg_relaxed() only in > > > the > > > use > > > of RISCV_ACQUIRE_BARRIER, and ... > > > > > > > +#define xchg_acquire(ptr, x) \ > > > > +({ \ > > > > + __typeof__(*(ptr)) x_ = (x); \ > > > > + (__typeof__(*(ptr))) __xchg_acquire((ptr), x_, > > > > sizeof(*(ptr))); \ > > > > +}) > > > > + > > > > +#define __xchg_release(ptr, new, size) \ > > > > +({ \ > > > > + __typeof__(ptr) ptr__ = (ptr); \ > > > > + __typeof__(new) new__ = (new); \ > > > > + __typeof__(*(ptr)) ret__; \ > > > > + switch (size) \ > > > > + { \ > > > > + case 4: \ > > > > + asm volatile ( \ > > > > + RISCV_RELEASE_BARRIER \ > > > > + " amoswap.w %0, %2, %1\n" \ > > > > + : "=r" (ret__), "+A" (*ptr__) \ > > > > + : "r" (new__) \ > > > > + : "memory"); \ > > > > + break; \ > > > > + case 8: \ > > > > + asm volatile ( \ > > > > + RISCV_RELEASE_BARRIER \ > > > > + " amoswap.d %0, %2, %1\n" \ > > > > + : "=r" (ret__), "+A" (*ptr__) \ > > > > + : "r" (new__) \ > > > > + : "memory"); \ > > > > + break; \ > > > > + default: \ > > > > + ASSERT_UNREACHABLE(); \ > > > > + } \ > > > > + ret__; \ > > > > +}) > > > > > > this only in the use of RISCV_RELEASE_BARRIER. If so they likely > > > want > > > folding, to limit redundancy and make eventual updating easier. > > > (Same > > > for the cmpxchg helper further down, as it seems.) > > Also the difference is in where to place barrier before or after > > atomic > > instruction. I am not sure that we can easily folded this macros. > > The folded macro would have two barrier parameters - on for acquire, > one > for release. Yes, in such case it will work. Thanks. ~ Oleksii
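For illustration, the folded form being agreed on here could look roughly like the sketch below (the __xchg_generic name is hypothetical; RISCV_ACQUIRE_BARRIER / RISCV_RELEASE_BARRIER are as used in the patch, and "" stands for "no barrier needed"):

    #define __xchg_generic(ptr, new, size, acquire_barrier, release_barrier) \
    ({                                                                       \
        __typeof__(ptr) ptr__ = (ptr);                                       \
        __typeof__(*(ptr)) new__ = (new);                                    \
        __typeof__(*(ptr)) ret__;                                            \
        switch ( size )                                                      \
        {                                                                    \
        case 4:                                                              \
            asm volatile ( release_barrier                                   \
                           "amoswap.w %0, %2, %1\n"                          \
                           acquire_barrier                                   \
                           : "=r" (ret__), "+A" (*ptr__)                     \
                           : "r" (new__)                                     \
                           : "memory" );                                     \
            break;                                                           \
        case 8:                                                              \
            asm volatile ( release_barrier                                   \
                           "amoswap.d %0, %2, %1\n"                          \
                           acquire_barrier                                   \
                           : "=r" (ret__), "+A" (*ptr__)                     \
                           : "r" (new__)                                     \
                           : "memory" );                                     \
            break;                                                           \
        default:                                                             \
            ASSERT_UNREACHABLE(); /* or a build-time failure, as above */    \
        }                                                                    \
        ret__;                                                               \
    })

    /* The derived variants then only differ in which barriers they pass: */
    #define __xchg_relaxed(ptr, new, size) \
        __xchg_generic(ptr, new, size, "", "")
    #define __xchg_acquire(ptr, new, size) \
        __xchg_generic(ptr, new, size, RISCV_ACQUIRE_BARRIER, "")
    #define __xchg_release(ptr, new, size) \
        __xchg_generic(ptr, new, size, "", RISCV_RELEASE_BARRIER)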
diff --git a/xen/arch/riscv/include/asm/cmpxchg.h b/xen/arch/riscv/include/asm/cmpxchg.h new file mode 100644 index 0000000000..916776c403 --- /dev/null +++ b/xen/arch/riscv/include/asm/cmpxchg.h @@ -0,0 +1,496 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2014 Regents of the University of California */ + +#ifndef _ASM_RISCV_CMPXCHG_H +#define _ASM_RISCV_CMPXCHG_H + +#include <xen/compiler.h> +#include <xen/lib.h> + +#include <asm/fence.h> +#include <asm/io.h> +#include <asm/system.h> + +#define __xchg_relaxed(ptr, new, size) \ +({ \ + __typeof__(ptr) ptr__ = (ptr); \ + __typeof__(new) new__ = (new); \ + __typeof__(*(ptr)) ret__; \ + switch (size) \ + { \ + case 4: \ + asm volatile( \ + " amoswap.w %0, %2, %1\n" \ + : "=r" (ret__), "+A" (*ptr__) \ + : "r" (new__) \ + : "memory" ); \ + break; \ + case 8: \ + asm volatile( \ + " amoswap.d %0, %2, %1\n" \ + : "=r" (ret__), "+A" (*ptr__) \ + : "r" (new__) \ + : "memory" ); \ + break; \ + default: \ + ASSERT_UNREACHABLE(); \ + } \ + ret__; \ +}) + +#define xchg_relaxed(ptr, x) \ +({ \ + __typeof__(*(ptr)) x_ = (x); \ + (__typeof__(*(ptr))) __xchg_relaxed((ptr), x_, sizeof(*(ptr))); \ +}) + +#define __xchg_acquire(ptr, new, size) \ +({ \ + __typeof__(ptr) ptr__ = (ptr); \ + __typeof__(new) new__ = (new); \ + __typeof__(*(ptr)) ret__; \ + switch (size) \ + { \ + case 4: \ + asm volatile( \ + " amoswap.w %0, %2, %1\n" \ + RISCV_ACQUIRE_BARRIER \ + : "=r" (ret__), "+A" (*ptr__) \ + : "r" (new__) \ + : "memory" ); \ + break; \ + case 8: \ + asm volatile( \ + " amoswap.d %0, %2, %1\n" \ + RISCV_ACQUIRE_BARRIER \ + : "=r" (ret__), "+A" (*ptr__) \ + : "r" (new__) \ + : "memory" ); \ + break; \ + default: \ + ASSERT_UNREACHABLE(); \ + } \ + ret__; \ +}) + +#define xchg_acquire(ptr, x) \ +({ \ + __typeof__(*(ptr)) x_ = (x); \ + (__typeof__(*(ptr))) __xchg_acquire((ptr), x_, sizeof(*(ptr))); \ +}) + +#define __xchg_release(ptr, new, size) \ +({ \ + __typeof__(ptr) ptr__ = (ptr); \ + __typeof__(new) new__ = (new); \ + __typeof__(*(ptr)) ret__; \ + switch (size) \ + { \ + case 4: \ + asm volatile ( \ + RISCV_RELEASE_BARRIER \ + " amoswap.w %0, %2, %1\n" \ + : "=r" (ret__), "+A" (*ptr__) \ + : "r" (new__) \ + : "memory"); \ + break; \ + case 8: \ + asm volatile ( \ + RISCV_RELEASE_BARRIER \ + " amoswap.d %0, %2, %1\n" \ + : "=r" (ret__), "+A" (*ptr__) \ + : "r" (new__) \ + : "memory"); \ + break; \ + default: \ + ASSERT_UNREACHABLE(); \ + } \ + ret__; \ +}) + +#define xchg_release(ptr, x) \ +({ \ + __typeof__(*(ptr)) x_ = (x); \ + (__typeof__(*(ptr))) __xchg_release((ptr), x_, sizeof(*(ptr))); \ +}) + +static always_inline uint32_t __xchg_case_4(volatile uint32_t *ptr, + uint32_t new) +{ + __typeof__(*(ptr)) ret; + + asm volatile ( + " amoswap.w.aqrl %0, %2, %1\n" + : "=r" (ret), "+A" (*ptr) + : "r" (new) + : "memory" ); + + return ret; +} + +static always_inline uint64_t __xchg_case_8(volatile uint64_t *ptr, + uint64_t new) +{ + __typeof__(*(ptr)) ret; + + asm volatile( \ + " amoswap.d.aqrl %0, %2, %1\n" \ + : "=r" (ret), "+A" (*ptr) \ + : "r" (new) \ + : "memory" ); \ + + return ret; +} + +static always_inline unsigned short __cmpxchg_case_2(volatile uint32_t *ptr, + uint32_t old, + uint32_t new); + +static always_inline unsigned short __cmpxchg_case_1(volatile uint32_t *ptr, + uint32_t old, + uint32_t new); + +static inline unsigned long __xchg(volatile void *ptr, unsigned long x, int size) +{ + switch (size) { + case 1: + return __cmpxchg_case_1(ptr, (uint32_t)-1, x); + case 2: + return __cmpxchg_case_2(ptr, (uint32_t)-1, x); + case 
4: + return __xchg_case_4(ptr, x); + case 8: + return __xchg_case_8(ptr, x); + default: + ASSERT_UNREACHABLE(); + } + + return -1; +} + +#define xchg(ptr,x) \ +({ \ + __typeof__(*(ptr)) ret__; \ + ret__ = (__typeof__(*(ptr))) \ + __xchg((ptr), (unsigned long)(x), sizeof(*(ptr))); \ + ret__; \ +}) + +#define xchg32(ptr, x) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 4); \ + xchg((ptr), (x)); \ +}) + +#define xchg64(ptr, x) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + xchg((ptr), (x)); \ +}) + +/* + * Atomic compare and exchange. Compare OLD with MEM, if identical, + * store NEW in MEM. Return the initial value in MEM. Success is + * indicated by comparing RETURN with OLD. + */ +#define __cmpxchg_relaxed(ptr, old, new, size) \ +({ \ + __typeof__(ptr) ptr__ = (ptr); \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) new__ = (new); \ + __typeof__(*(ptr)) ret__; \ + register unsigned int __rc; \ + switch (size) \ + { \ + case 4: \ + asm volatile( \ + "0: lr.w %0, %2\n" \ + " bne %0, %z3, 1f\n" \ + " sc.w %1, %z4, %2\n" \ + " bnez %1, 0b\n" \ + "1:\n" \ + : "=&r" (ret__), "=&r" (__rc), "+A" (*ptr__) \ + : "rJ" (__old), "rJ" (new__) \ + : "memory"); \ + break; \ + case 8: \ + asm volatile( \ + "0: lr.d %0, %2\n" \ + " bne %0, %z3, 1f\n" \ + " sc.d %1, %z4, %2\n" \ + " bnez %1, 0b\n" \ + "1:\n" \ + : "=&r" (ret__), "=&r" (__rc), "+A" (*ptr__) \ + : "rJ" (__old), "rJ" (new__) \ + : "memory"); \ + break; \ + default: \ + ASSERT_UNREACHABLE(); \ + } \ + ret__; \ +}) + +#define cmpxchg_relaxed(ptr, o, n) \ +({ \ + __typeof__(*(ptr)) o_ = (o); \ + __typeof__(*(ptr)) n_ = (n); \ + (__typeof__(*(ptr))) __cmpxchg_relaxed((ptr), \ + o_, n_, sizeof(*(ptr))); \ +}) + +#define __cmpxchg_acquire(ptr, old, new, size) \ +({ \ + __typeof__(ptr) ptr__ = (ptr); \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) new__ = (new); \ + __typeof__(*(ptr)) ret__; \ + register unsigned int __rc; \ + switch (size) \ + { \ + case 4: \ + asm volatile( \ + "0: lr.w %0, %2\n" \ + " bne %0, %z3, 1f\n" \ + " sc.w %1, %z4, %2\n" \ + " bnez %1, 0b\n" \ + RISCV_ACQUIRE_BARRIER \ + "1:\n" \ + : "=&r" (ret__), "=&r" (__rc), "+A" (*ptr__) \ + : "rJ" (__old), "rJ" (new__) \ + : "memory"); \ + break; \ + case 8: \ + asm volatile( \ + "0: lr.d %0, %2\n" \ + " bne %0, %z3, 1f\n" \ + " sc.d %1, %z4, %2\n" \ + " bnez %1, 0b\n" \ + RISCV_ACQUIRE_BARRIER \ + "1:\n" \ + : "=&r" (ret__), "=&r" (__rc), "+A" (*ptr__) \ + : "rJ" (__old), "rJ" (new__) \ + : "memory"); \ + break; \ + default: \ + ASSERT_UNREACHABLE(); \ + } \ + ret__; \ +}) + +#define cmpxchg_acquire(ptr, o, n) \ +({ \ + __typeof__(*(ptr)) o_ = (o); \ + __typeof__(*(ptr)) n_ = (n); \ + (__typeof__(*(ptr))) __cmpxchg_acquire((ptr), o_, n_, sizeof(*(ptr))); \ +}) + +#define __cmpxchg_release(ptr, old, new, size) \ +({ \ + __typeof__(ptr) ptr__ = (ptr); \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) new__ = (new); \ + __typeof__(*(ptr)) ret__; \ + register unsigned int __rc; \ + switch (size) \ + { \ + case 4: \ + asm volatile ( \ + RISCV_RELEASE_BARRIER \ + "0: lr.w %0, %2\n" \ + " bne %0, %z3, 1f\n" \ + " sc.w %1, %z4, %2\n" \ + " bnez %1, 0b\n" \ + "1:\n" \ + : "=&r" (ret__), "=&r" (__rc), "+A" (*ptr__) \ + : "rJ" (__old), "rJ" (new__) \ + : "memory" ); \ + break; \ + case 8: \ + asm volatile ( \ + RISCV_RELEASE_BARRIER \ + "0: lr.d %0, %2\n" \ + " bne %0, %z3, 1f\n" \ + " sc.d %1, %z4, %2\n" \ + " bnez %1, 0b\n" \ + "1:\n" \ + : "=&r" (ret__), "=&r" (__rc), "+A" (*ptr__) \ + : "rJ" (__old), "rJ" (new__) \ + : "memory" ); \ + break; \ + default: \ + 
ASSERT_UNREACHABLE(); \ + } \ + ret__; \ +}) + +#define cmpxchg_release(ptr, o, n) \ +({ \ + __typeof__(*(ptr)) _o_ = (o); \ + __typeof__(*(ptr)) _n_ = (n); \ + (__typeof__(*(ptr))) __cmpxchg_release((ptr), _o_, _n_, sizeof(*(ptr))); \ +}) + +static always_inline uint32_t __cmpxchg_case_4(volatile uint32_t *ptr, + uint32_t old, + uint32_t new) +{ + uint32_t ret; + register uint32_t rc; + + asm volatile ( + "0: lr.w %0, %2\n" + " bne %0, %z3, 1f\n" + " sc.w.rl %1, %z4, %2\n" + " bnez %1, 0b\n" + " fence rw, rw\n" + "1:\n" + : "=&r" (ret), "=&r" (rc), "+A" (*ptr) + : "rJ" (old), "rJ" (new) + : "memory" ); + + return ret; +} + +static always_inline uint64_t __cmpxchg_case_8(volatile uint64_t *ptr, + uint64_t old, + uint64_t new) +{ + uint64_t ret; + register uint32_t rc; + + asm volatile( + "0: lr.d %0, %2\n" + " bne %0, %z3, 1f\n" + " sc.d.rl %1, %z4, %2\n" + " bnez %1, 0b\n" + " fence rw, rw\n" + "1:\n" + : "=&r" (ret), "=&r" (rc), "+A" (*ptr) + : "rJ" (old), "rJ" (new) + : "memory"); + + return ret; +} + +#define __emulate_cmpxchg_case1_2(ptr, new, read_func, cmpxchg_func, swap_byte_mask_base)\ +({ \ + __typeof__(*(ptr)) read_val; \ + __typeof__(*(ptr)) swapped_new; \ + __typeof__(*(ptr)) ret; \ + __typeof__(*(ptr)) new_ = (__typeof__(*(ptr)))new; \ + \ + __typeof__(ptr) aligned_ptr = (__typeof__(ptr))((unsigned long)ptr & ~3); \ + __typeof__(*(ptr)) mask_off = ((unsigned long)ptr & 3) * 8; \ + __typeof__(*(ptr)) mask = \ + (__typeof__(*(ptr)))swap_byte_mask_base << mask_off; \ + __typeof__(*(ptr)) masked_new = (new_ << mask_off) & mask; \ + \ + do { \ + read_val = read_func(aligned_ptr); \ + swapped_new = read_val & ~mask; \ + swapped_new |= masked_new; \ + ret = cmpxchg_func(aligned_ptr, read_val, swapped_new); \ + } while ( ret != read_val ); \ + \ + ret = MASK_EXTR(swapped_new, mask); \ + ret; \ +}) + +static always_inline unsigned short __cmpxchg_case_2(volatile uint32_t *ptr, + uint32_t old, + uint32_t new) +{ + (void) old; + + if (((unsigned long)ptr & 3) == 3) + { +#ifdef CONFIG_64BIT + return __emulate_cmpxchg_case1_2((uint64_t *)ptr, new, + readq, __cmpxchg_case_8, 0xffffU); +#else + #error "add emulation support of cmpxchg for CONFIG_32BIT" +#endif + } + else + return __emulate_cmpxchg_case1_2((uint32_t *)ptr, new, + readl, __cmpxchg_case_4, 0xffffU); +} + +static always_inline unsigned short __cmpxchg_case_1(volatile uint32_t *ptr, + uint32_t old, + uint32_t new) +{ + (void) old; + + return __emulate_cmpxchg_case1_2((uint32_t *)ptr, new, + readl, __cmpxchg_case_4, 0xffU); +} + +static always_inline unsigned long __cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, + int size) +{ + switch (size) + { + case 1: + return __cmpxchg_case_1(ptr, old, new); + case 2: + return __cmpxchg_case_2(ptr, old, new); + case 4: + return __cmpxchg_case_4(ptr, old, new); + case 8: + return __cmpxchg_case_8(ptr, old, new); + default: + ASSERT_UNREACHABLE(); + } + + return old; +} + +#define cmpxchg(ptr, o, n) \ +({ \ + __typeof__(*(ptr)) ret__; \ + ret__ = (__typeof__(*(ptr))) \ + __cmpxchg((ptr), (unsigned long)(o), (unsigned long)(n), \ + sizeof(*(ptr))); \ + ret__; \ +}) + +#define cmpxchg_local(ptr, o, n) \ + (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr)))) + +#define cmpxchg32(ptr, o, n) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 4); \ + cmpxchg((ptr), (o), (n)); \ +}) + +#define cmpxchg32_local(ptr, o, n) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 4); \ + cmpxchg_relaxed((ptr), (o), (n)) \ +}) + +#define cmpxchg64(ptr, o, n) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + 
cmpxchg((ptr), (o), (n)); \ +}) + +#define cmpxchg64_local(ptr, o, n) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + cmpxchg_relaxed((ptr), (o), (n)); \ +}) + +#endif /* _ASM_RISCV_CMPXCHG_H */ + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */
The header was taken from Linux kernel 6.4.0-rc1. Additionally, the following were updated:
* add emulation of {cmp}xchg for 1/2 byte types
* replace tabs with spaces
* replace __* variables with *__

Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
---
Changes in V3:
 - update the commit message
 - add emulation of {cmp}xchg_... for 1- and 2-byte types
---
Changes in V2:
 - update the comment at the top of the header.
 - change xen/lib.h to xen/bug.h.
 - sort inclusion of headers properly.
---
 xen/arch/riscv/include/asm/cmpxchg.h | 496 +++++++++++++++++++++++++++
 1 file changed, 496 insertions(+)
 create mode 100644 xen/arch/riscv/include/asm/cmpxchg.h