[v3,13/34] xen/riscv: introduce cmpxchg.h

Message ID 214bfd61c8ccf2a5b2c640b815ebfa6a705f6234.1703255175.git.oleksii.kurochko@gmail.com (mailing list archive)
State Superseded
Headers show
Series Enable build of full Xen for RISC-V

Commit Message

Oleksii Kurochko Dec. 22, 2023, 3:12 p.m. UTC
The header was taken from Linux kernel 6.4.0-rc1.

Additionally, the following changes were made:
* add emulation of {cmp}xchg for 1/2 byte types
* replace tabs with spaces
* replace __* variables with *__

Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
---
Changes in V3:
 - update the commit message
 - add emulation of {cmp}xchg_... for 1 and 2 bytes types
---
Changes in V2:
	- update the comment at the top of the header.
	- change xen/lib.h to xen/bug.h.
	- sort inclusion of headers properly.
---
 xen/arch/riscv/include/asm/cmpxchg.h | 496 +++++++++++++++++++++++++++
 1 file changed, 496 insertions(+)
 create mode 100644 xen/arch/riscv/include/asm/cmpxchg.h

Comments

Jan Beulich Jan. 22, 2024, 4:27 p.m. UTC | #1
On 22.12.2023 16:12, Oleksii Kurochko wrote:
> --- /dev/null
> +++ b/xen/arch/riscv/include/asm/cmpxchg.h
> @@ -0,0 +1,496 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/* Copyright (C) 2014 Regents of the University of California */
> +
> +#ifndef _ASM_RISCV_CMPXCHG_H
> +#define _ASM_RISCV_CMPXCHG_H
> +
> +#include <xen/compiler.h>
> +#include <xen/lib.h>
> +
> +#include <asm/fence.h>
> +#include <asm/io.h>
> +#include <asm/system.h>
> +
> +#define __xchg_relaxed(ptr, new, size) \
> +({ \
> +    __typeof__(ptr) ptr__ = (ptr); \
> +    __typeof__(new) new__ = (new); \
> +    __typeof__(*(ptr)) ret__; \

I expect the types of new and *ptr want to actually match. Which
you then want to enforce, so that issues at use sites would either
be reported by the compiler, or be permitted by a type conversion
of new.
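For instance (a sketch, where only the declaration of new__ differs from
the patch):

    __typeof__(*(ptr)) new__ = (new); /* converts, or lets the compiler complain */

rather than deriving the type from new itself.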

> +    switch (size) \
> +	{ \

Nit: Hard tab left here. (Also again you want to either stick to
Linux style or fully switch to Xen style.)

> +    case 4: \
> +        asm volatile( \
> +            "	amoswap.w %0, %2, %1\n" \

I don't think a leading tab (or leading whitespace in general) is
needed in single-line-output asm()s. The trailing \n also isn't
needed if I'm not mistaken.

> +            : "=r" (ret__), "+A" (*ptr__) \
> +            : "r" (new__) \
> +            : "memory" ); \
> +        break; \
> +    case 8: \
> +        asm volatile( \
> +            "	amoswap.d %0, %2, %1\n" \
> +            : "=r" (ret__), "+A" (*ptr__) \
> +            : "r" (new__) \
> +            : "memory" ); \
> +        break; \
> +    default: \
> +        ASSERT_UNREACHABLE(); \

If at all possible this wants to trigger a build failure, not a runtime
one.

> +    } \
> +    ret__; \
> +})
> +
> +#define xchg_relaxed(ptr, x) \
> +({ \
> +    __typeof__(*(ptr)) x_ = (x); \
> +    (__typeof__(*(ptr))) __xchg_relaxed((ptr), x_, sizeof(*(ptr))); \

Nit: Stray blank after cast. For readability I'd also suggest to
drop parentheses in cases like the first argument passed to
__xchg_relaxed() here.

> +})

For both: What does "relaxed" describe? I'm asking because it's not
really clear whether the memory clobbers are actually needed.

> +#define __xchg_acquire(ptr, new, size) \
> +({ \
> +    __typeof__(ptr) ptr__ = (ptr); \
> +    __typeof__(new) new__ = (new); \
> +    __typeof__(*(ptr)) ret__; \
> +    switch (size) \
> +	{ \
> +    case 4: \
> +        asm volatile( \
> +            "	amoswap.w %0, %2, %1\n" \
> +            RISCV_ACQUIRE_BARRIER \
> +            : "=r" (ret__), "+A" (*ptr__) \
> +            : "r" (new__) \
> +            : "memory" ); \
> +        break; \
> +    case 8: \
> +        asm volatile( \
> +            "	amoswap.d %0, %2, %1\n" \
> +            RISCV_ACQUIRE_BARRIER \
> +            : "=r" (ret__), "+A" (*ptr__) \
> +            : "r" (new__) \
> +            : "memory" ); \
> +        break; \
> +    default: \
> +        ASSERT_UNREACHABLE(); \
> +    } \
> +    ret__; \
> +})

If I'm not mistaken this differs from __xchg_relaxed() only in the use
of RISCV_ACQUIRE_BARRIER, and ...

> +#define xchg_acquire(ptr, x) \
> +({ \
> +    __typeof__(*(ptr)) x_ = (x); \
> +    (__typeof__(*(ptr))) __xchg_acquire((ptr), x_, sizeof(*(ptr))); \
> +})
> +
> +#define __xchg_release(ptr, new, size) \
> +({ \
> +    __typeof__(ptr) ptr__ = (ptr); \
> +    __typeof__(new) new__ = (new); \
> +    __typeof__(*(ptr)) ret__; \
> +    switch (size) \
> +	{ \
> +    case 4: \
> +        asm volatile ( \
> +            RISCV_RELEASE_BARRIER \
> +            "	amoswap.w %0, %2, %1\n" \
> +            : "=r" (ret__), "+A" (*ptr__) \
> +            : "r" (new__) \
> +            : "memory"); \
> +        break; \
> +    case 8: \
> +        asm volatile ( \
> +            RISCV_RELEASE_BARRIER \
> +            "	amoswap.d %0, %2, %1\n" \
> +            : "=r" (ret__), "+A" (*ptr__) \
> +            : "r" (new__) \
> +            : "memory"); \
> +        break; \
> +    default: \
> +        ASSERT_UNREACHABLE(); \
> +    } \
> +    ret__; \
> +})

this only in the use of RISCV_RELEASE_BARRIER. If so they likely want
folding, to limit redundancy and make eventual updating easier. (Same
for the cmpxchg helper further down, as it seems.)

> +#define xchg_release(ptr, x) \
> +({ \
> +    __typeof__(*(ptr)) x_ = (x); \
> +    (__typeof__(*(ptr))) __xchg_release((ptr), x_, sizeof(*(ptr))); \
> +})
> +
> +static always_inline uint32_t __xchg_case_4(volatile uint32_t *ptr,
> +                                            uint32_t new)
> +{
> +    __typeof__(*(ptr)) ret;
> +
> +    asm volatile (
> +        "   amoswap.w.aqrl %0, %2, %1\n"
> +        : "=r" (ret), "+A" (*ptr)
> +        : "r" (new)
> +        : "memory" );
> +
> +    return ret;
> +}
> +
> +static always_inline uint64_t __xchg_case_8(volatile uint64_t *ptr,
> +                                            uint64_t new)
> +{
> +    __typeof__(*(ptr)) ret;
> +
> +    asm volatile( \
> +        "   amoswap.d.aqrl %0, %2, %1\n" \
> +        : "=r" (ret), "+A" (*ptr) \
> +        : "r" (new) \
> +        : "memory" ); \
> +
> +    return ret;
> +}
> +
> +static always_inline unsigned short __cmpxchg_case_2(volatile uint32_t *ptr,
> +                                                     uint32_t old,
> +                                                     uint32_t new);

Don't you consistently mean uint16_t here (incl the return type) and ...

> +static always_inline unsigned short __cmpxchg_case_1(volatile uint32_t *ptr,
> +                                                     uint32_t old,
> +                                                     uint32_t new);

... uint8_t here?

> +static inline unsigned long __xchg(volatile void *ptr, unsigned long x, int size)
> +{
> +    switch (size) {
> +    case 1:
> +        return __cmpxchg_case_1(ptr, (uint32_t)-1, x);
> +    case 2:
> +        return __cmpxchg_case_2(ptr, (uint32_t)-1, x);

How are these going to work? You'll compare against ~0, and if the value
in memory isn't ~0, memory won't be updated; you will only (correctly)
return the value found in memory.

Or wait - looking at __cmpxchg_case_{1,2}() far further down, you ignore
"old" there. Which apparently means they'll work for the use here, but
not for the use in __cmpxchg().

> +    case 4:
> +        return __xchg_case_4(ptr, x);
> +    case 8:
> +        return __xchg_case_8(ptr, x);
> +    default:
> +        ASSERT_UNREACHABLE();
> +    }
> +
> +    return -1;
> +}
> +
> +#define xchg(ptr,x) \
> +({ \
> +    __typeof__(*(ptr)) ret__; \
> +    ret__ = (__typeof__(*(ptr))) \
> +            __xchg((ptr), (unsigned long)(x), sizeof(*(ptr))); \
> +    ret__; \
> +})
> +
> +#define xchg32(ptr, x) \
> +({ \
> +    BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
> +    xchg((ptr), (x)); \
> +})
> +
> +#define xchg64(ptr, x) \
> +({ \
> +    BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
> +    xchg((ptr), (x)); \
> +})

What are these two (and their cmpxchg counterparts) needed for?

> +/*
> + * Atomic compare and exchange.  Compare OLD with MEM, if identical,
> + * store NEW in MEM.  Return the initial value in MEM.  Success is
> + * indicated by comparing RETURN with OLD.
> + */
> +#define __cmpxchg_relaxed(ptr, old, new, size) \
> +({ \
> +    __typeof__(ptr) ptr__ = (ptr); \
> +    __typeof__(*(ptr)) __old = (old); \

Leftover leading underscores?

> +    __typeof__(*(ptr)) new__ = (new); \

Related to my earlier comment on types needing to be compatible - see
how here you're using "ptr" throughout.

> +    __typeof__(*(ptr)) ret__; \
> +    register unsigned int __rc; \

More leftover leading underscores?

> +static always_inline unsigned short __cmpxchg_case_2(volatile uint32_t *ptr,
> +                                                     uint32_t old,
> +                                                     uint32_t new)
> +{
> +    (void) old;
> +
> +    if (((unsigned long)ptr & 3) == 3)
> +    {
> +#ifdef CONFIG_64BIT
> +        return __emulate_cmpxchg_case1_2((uint64_t *)ptr, new,
> +                                         readq, __cmpxchg_case_8, 0xffffU);

What if ((unsigned long)ptr & 7) == 7 (which is a sub-case of what the
if() above checks for? Isn't it more reasonable to require aligned
16-bit quantities here? Or if mis-aligned addresses are okay, you could
as well emulate using __cmpxchg_case_4().

Also you shouldn't be casting away volatile (here and below). Avoiding
the casts (by suitable using volatile void * parameter types) would
likely be best.
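Taken together with the type remark above, prototypes along these lines
would avoid both issues (just a sketch, not necessarily the final shape):

    static always_inline uint8_t __cmpxchg_case_1(volatile void *ptr,
                                                  uint8_t old, uint8_t new);
    static always_inline uint16_t __cmpxchg_case_2(volatile void *ptr,
                                                   uint16_t old, uint16_t new);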

Jan
Oleksii Kurochko Jan. 23, 2024, 10:15 a.m. UTC | #2
On Mon, 2024-01-22 at 17:27 +0100, Jan Beulich wrote:
> On 22.12.2023 16:12, Oleksii Kurochko wrote:
> > --- /dev/null
> > +++ b/xen/arch/riscv/include/asm/cmpxchg.h
> > @@ -0,0 +1,496 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/* Copyright (C) 2014 Regents of the University of California */
> > +
> > +#ifndef _ASM_RISCV_CMPXCHG_H
> > +#define _ASM_RISCV_CMPXCHG_H
> > +
> > +#include <xen/compiler.h>
> > +#include <xen/lib.h>
> > +
> > +#include <asm/fence.h>
> > +#include <asm/io.h>
> > +#include <asm/system.h>
> > +
> > +#define __xchg_relaxed(ptr, new, size) \
> > +({ \
> > +    __typeof__(ptr) ptr__ = (ptr); \
> > +    __typeof__(new) new__ = (new); \
> > +    __typeof__(*(ptr)) ret__; \
> 
> I expect the types of new and *ptr want to actually match. Which
> you then want to enforce, so that issues at use sites would either
> be reported by the compiler, or be permitted by a type conversion
> of new.
I am OK with making new and ret__ the same type, but it looks like
__xchg_relaxed() is used only inside xchg_relaxed(), where 'new' is
declared with the same type as *ptr.

> 
> > +    switch (size) \
> > +	{ \
> 
> Nit: Hard tab left here. (Also again you want to either stick to
> Linux style or fully switch to Xen style.)
Thanks. I'll update that.

> 
> > +    case 4: \
> > +        asm volatile( \
> > +            "	amoswap.w %0, %2, %1\n" \
> 
> I don't think a leading tab (or leading whitespace in general) is
> needed in single-line-output asm()s. The trailing \n also isn't
> needed if I'm not mistaken.
I just wanted to align it with "=r", but for sure a leading tab or
whitespace can be dropped.

> 
> > +            : "=r" (ret__), "+A" (*ptr__) \
> > +            : "r" (new__) \
> > +            : "memory" ); \
> > +        break; \
> > +    case 8: \
> > +        asm volatile( \
> > +            "	amoswap.d %0, %2, %1\n" \
> > +            : "=r" (ret__), "+A" (*ptr__) \
> > +            : "r" (new__) \
> > +            : "memory" ); \
> > +        break; \
> > +    default: \
> > +        ASSERT_UNREACHABLE(); \
> 
> If at all possible this wants to trigger a build failure, not a
> runtime
> one.
I'll update that with BUILD_BUG_ON(size > 8);
> 
> > +    } \
> > +    ret__; \
> > +})
> > +
> > +#define xchg_relaxed(ptr, x) \
> > +({ \
> > +    __typeof__(*(ptr)) x_ = (x); \
> > +    (__typeof__(*(ptr))) __xchg_relaxed((ptr), x_,
> > sizeof(*(ptr))); \
> 
> Nit: Stray blank after cast. For readability I'd also suggest to
> drop parentheses in cases like the first argument passed to
> __xchg_relaxed() here.
Thanks. I'll take that into account.

> 
> > +})
> 
> For both: What does "relaxed" describe? I'm asking because it's not
> really clear whether the memory clobbers are actually needed.
> 
> > +#define __xchg_acquire(ptr, new, size) \
> > +({ \
> > +    __typeof__(ptr) ptr__ = (ptr); \
> > +    __typeof__(new) new__ = (new); \
> > +    __typeof__(*(ptr)) ret__; \
> > +    switch (size) \
> > +	{ \
> > +    case 4: \
> > +        asm volatile( \
> > +            "	amoswap.w %0, %2, %1\n" \
> > +            RISCV_ACQUIRE_BARRIER \
> > +            : "=r" (ret__), "+A" (*ptr__) \
> > +            : "r" (new__) \
> > +            : "memory" ); \
> > +        break; \
> > +    case 8: \
> > +        asm volatile( \
> > +            "	amoswap.d %0, %2, %1\n" \
> > +            RISCV_ACQUIRE_BARRIER \
> > +            : "=r" (ret__), "+A" (*ptr__) \
> > +            : "r" (new__) \
> > +            : "memory" ); \
> > +        break; \
> > +    default: \
> > +        ASSERT_UNREACHABLE(); \
> > +    } \
> > +    ret__; \
> > +})
> 
> If I'm not mistaken this differs from __xchg_relaxed() only in the
> use
> of RISCV_ACQUIRE_BARRIER, and ...
> 
> > +#define xchg_acquire(ptr, x) \
> > +({ \
> > +    __typeof__(*(ptr)) x_ = (x); \
> > +    (__typeof__(*(ptr))) __xchg_acquire((ptr), x_,
> > sizeof(*(ptr))); \
> > +})
> > +
> > +#define __xchg_release(ptr, new, size) \
> > +({ \
> > +    __typeof__(ptr) ptr__ = (ptr); \
> > +    __typeof__(new) new__ = (new); \
> > +    __typeof__(*(ptr)) ret__; \
> > +    switch (size) \
> > +	{ \
> > +    case 4: \
> > +        asm volatile ( \
> > +            RISCV_RELEASE_BARRIER \
> > +            "	amoswap.w %0, %2, %1\n" \
> > +            : "=r" (ret__), "+A" (*ptr__) \
> > +            : "r" (new__) \
> > +            : "memory"); \
> > +        break; \
> > +    case 8: \
> > +        asm volatile ( \
> > +            RISCV_RELEASE_BARRIER \
> > +            "	amoswap.d %0, %2, %1\n" \
> > +            : "=r" (ret__), "+A" (*ptr__) \
> > +            : "r" (new__) \
> > +            : "memory"); \
> > +        break; \
> > +    default: \
> > +        ASSERT_UNREACHABLE(); \
> > +    } \
> > +    ret__; \
> > +})
> 
> this only in the use of RISCV_RELEASE_BARRIER. If so they likely want
> folding, to limit redundancy and make eventual updating easier. (Same
> for the cmpxchg helper further down, as it seems.)
Yes, you are right. The difference is only in RISCV_RELEASE_BARRIER, and
the absence of RISCV_RELEASE_BARRIER is the reason why we have the
"relaxed" versions.

I am not sure that I understand what you mean by folding here. Do you
mean that there is no point in having two separate macros, and that only
the one with RISCV_RELEASE_BARRIER is needed?

> 
> > +#define xchg_release(ptr, x) \
> > +({ \
> > +    __typeof__(*(ptr)) x_ = (x); \
> > +    (__typeof__(*(ptr))) __xchg_release((ptr), x_,
> > sizeof(*(ptr))); \
> > +})
> > +
> > +static always_inline uint32_t __xchg_case_4(volatile uint32_t
> > *ptr,
> > +                                            uint32_t new)
> > +{
> > +    __typeof__(*(ptr)) ret;
> > +
> > +    asm volatile (
> > +        "   amoswap.w.aqrl %0, %2, %1\n"
> > +        : "=r" (ret), "+A" (*ptr)
> > +        : "r" (new)
> > +        : "memory" );
> > +
> > +    return ret;
> > +}
> > +
> > +static always_inline uint64_t __xchg_case_8(volatile uint64_t
> > *ptr,
> > +                                            uint64_t new)
> > +{
> > +    __typeof__(*(ptr)) ret;
> > +
> > +    asm volatile( \
> > +        "   amoswap.d.aqrl %0, %2, %1\n" \
> > +        : "=r" (ret), "+A" (*ptr) \
> > +        : "r" (new) \
> > +        : "memory" ); \
> > +
> > +    return ret;
> > +}
> > +
> > +static always_inline unsigned short __cmpxchg_case_2(volatile
> > uint32_t *ptr,
> > +                                                     uint32_t old,
> > +                                                     uint32_t
> > new);
> 
> Don't you consistently mean uint16_t here (incl the return type) and
> ...
> 
> > +static always_inline unsigned short __cmpxchg_case_1(volatile
> > uint32_t *ptr,
> > +                                                     uint32_t old,
> > +                                                     uint32_t
> > new);
> 
> ... uint8_t here?
The idea was that we emulate __cmpxchg_case_1 and __cmpxchg_case_2
using the 4-byte cmpxchg, and __cmpxchg_case_4 expects uint32_t.

> 
> > +static inline unsigned long __xchg(volatile void *ptr, unsigned
> > long x, int size)
> > +{
> > +    switch (size) {
> > +    case 1:
> > +        return __cmpxchg_case_1(ptr, (uint32_t)-1, x);
> > +    case 2:
> > +        return __cmpxchg_case_2(ptr, (uint32_t)-1, x);
> 
> How are these going to work? You'll compare against ~0, and if the
> value
> in memory isn't ~0, memory won't be updated; you will only
> (correctly)
> return the value found in memory.
> 
> Or wait - looking at __cmpxchg_case_{1,2}() far further down, you
> ignore
> "old" there. Which apparently means they'll work for the use here,
> but
> not for the use in __cmpxchg().
Yes, the trick is that old is ignored and is read in
__emulate_cmpxchg_case1_2() before __cmpxchg_case_4 is called:
    do {                                                              
        read_val = read_func(aligned_ptr);                            
        swapped_new = read_val & ~mask;                               
        swapped_new |= masked_new;                                    
        ret = cmpxchg_func(aligned_ptr, read_val, swapped_new);       
    } while ( ret != read_val );                                      
read_val serves as 'old' here.

But now I am not 100% sure that it is correct for __cmpxchg...

> 
> > +    case 4:
> > +        return __xchg_case_4(ptr, x);
> > +    case 8:
> > +        return __xchg_case_8(ptr, x);
> > +    default:
> > +        ASSERT_UNREACHABLE();
> > +    }
> > +
> > +    return -1;
> > +}
> > +
> > +#define xchg(ptr,x) \
> > +({ \
> > +    __typeof__(*(ptr)) ret__; \
> > +    ret__ = (__typeof__(*(ptr))) \
> > +            __xchg((ptr), (unsigned long)(x), sizeof(*(ptr))); \
> > +    ret__; \
> > +})
> > +
> > +#define xchg32(ptr, x) \
> > +({ \
> > +    BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
> > +    xchg((ptr), (x)); \
> > +})
> > +
> > +#define xchg64(ptr, x) \
> > +({ \
> > +    BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
> > +    xchg((ptr), (x)); \
> > +})
> 
> What are these two (and their cmpxchg counterparts) needed for?
It was copied from Linux, but it looks like they can really be dropped;
I can't find where they are used, so I'll drop them.

> 
> > +/*
> > + * Atomic compare and exchange.  Compare OLD with MEM, if
> > identical,
> > + * store NEW in MEM.  Return the initial value in MEM.  Success is
> > + * indicated by comparing RETURN with OLD.
> > + */
> > +#define __cmpxchg_relaxed(ptr, old, new, size) \
> > +({ \
> > +    __typeof__(ptr) ptr__ = (ptr); \
> > +    __typeof__(*(ptr)) __old = (old); \
> 
> Leftover leading underscores?
> 
> > +    __typeof__(*(ptr)) new__ = (new); \
> 
> Related to my earlier comment on types needing to be compatible - see
> how here you're using "ptr" throughout.
> 
> > +    __typeof__(*(ptr)) ret__; \
> > +    register unsigned int __rc; \
> 
> More leftover leading underscores?
I missed dropping the underscores. Thanks. I'll drop them in the next version.
> 
> > +static always_inline unsigned short __cmpxchg_case_2(volatile
> > uint32_t *ptr,
> > +                                                     uint32_t old,
> > +                                                     uint32_t new)
> > +{
> > +    (void) old;
> > +
> > +    if (((unsigned long)ptr & 3) == 3)
> > +    {
> > +#ifdef CONFIG_64BIT
> > +        return __emulate_cmpxchg_case1_2((uint64_t *)ptr, new,
> > +                                         readq, __cmpxchg_case_8,
> > 0xffffU);
> 
> What if ((unsigned long)ptr & 7) == 7 (which is a sub-case of what
> the
> if() above checks for? Isn't it more reasonable to require aligned
> 16-bit quantities here? Or if mis-aligned addresses are okay, you
> could
> as well emulate using __cmpxchg_case_4().
Yes, it will be more reasonable. I'll use IS_ALIGNED instead.

> 
> Also you shouldn't be casting away volatile (here and below).
> Avoiding
> the casts (by suitable using volatile void * parameter types) would
> likely be best.
Thanks. I'll update the function prototype.

~ Oleksii
Jan Beulich Jan. 23, 2024, 10:28 a.m. UTC | #3
On 23.01.2024 11:15, Oleksii wrote:
> On Mon, 2024-01-22 at 17:27 +0100, Jan Beulich wrote:
>> On 22.12.2023 16:12, Oleksii Kurochko wrote:
>>> +            : "=r" (ret__), "+A" (*ptr__) \
>>> +            : "r" (new__) \
>>> +            : "memory" ); \
>>> +        break; \
>>> +    case 8: \
>>> +        asm volatile( \
>>> +            "	amoswap.d %0, %2, %1\n" \
>>> +            : "=r" (ret__), "+A" (*ptr__) \
>>> +            : "r" (new__) \
>>> +            : "memory" ); \
>>> +        break; \
>>> +    default: \
>>> +        ASSERT_UNREACHABLE(); \
>>
>> If at all possible this wants to trigger a build failure, not a
>> runtime
>> one.
> I'll update that with BUILD_BUG_ON(size > 8);

What about size accidentally being e.g. 5? I'm also not sure you'll be
able to use BUILD_BUG_ON() here: That'll depend on what use sites there
are. And if all present ones are okay in this regard, you'd still set
out a trap for someone else to fall into later. We have a common
approach for this, which currently is under re-work. See
https://lists.xen.org/archives/html/xen-devel/2024-01/msg01115.html.
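For reference, one long-standing pattern for turning such a default case
into a build-time failure (which may or may not match the reworked common
approach referred to above) is to call a helper that is declared but
deliberately never defined, so any reachable use fails at link time; the
__bad_xchg name below is purely illustrative:

    extern unsigned long __bad_xchg(volatile void *ptr, int size);

    static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
                                       int size)
    {
        switch ( size )
        {
        case 4:
            return __xchg_case_4(ptr, x);
        case 8:
            return __xchg_case_8(ptr, x);
        default:
            return __bad_xchg(ptr, size); /* link error if ever reachable */
        }
    }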

>>> +    } \
>>> +    ret__; \
>>> +})
>>> +
>>> +#define xchg_relaxed(ptr, x) \
>>> +({ \
>>> +    __typeof__(*(ptr)) x_ = (x); \
>>> +    (__typeof__(*(ptr))) __xchg_relaxed((ptr), x_,
>>> sizeof(*(ptr))); \
>>
>> Nit: Stray blank after cast. For readability I'd also suggest to
>> drop parentheses in cases like the first argument passed to
>> __xchg_relaxed() here.
> Thanks. I'll take that into account.
> 
>>
>>> +})
>>
>> For both: What does "relaxed" describe? I'm asking because it's not
>> really clear whether the memory clobbers are actually needed.
>>
>>> +#define __xchg_acquire(ptr, new, size) \
>>> +({ \
>>> +    __typeof__(ptr) ptr__ = (ptr); \
>>> +    __typeof__(new) new__ = (new); \
>>> +    __typeof__(*(ptr)) ret__; \
>>> +    switch (size) \
>>> +	{ \
>>> +    case 4: \
>>> +        asm volatile( \
>>> +            "	amoswap.w %0, %2, %1\n" \
>>> +            RISCV_ACQUIRE_BARRIER \
>>> +            : "=r" (ret__), "+A" (*ptr__) \
>>> +            : "r" (new__) \
>>> +            : "memory" ); \
>>> +        break; \
>>> +    case 8: \
>>> +        asm volatile( \
>>> +            "	amoswap.d %0, %2, %1\n" \
>>> +            RISCV_ACQUIRE_BARRIER \
>>> +            : "=r" (ret__), "+A" (*ptr__) \
>>> +            : "r" (new__) \
>>> +            : "memory" ); \
>>> +        break; \
>>> +    default: \
>>> +        ASSERT_UNREACHABLE(); \
>>> +    } \
>>> +    ret__; \
>>> +})
>>
>> If I'm not mistaken this differs from __xchg_relaxed() only in the
>> use
>> of RISCV_ACQUIRE_BARRIER, and ...
>>
>>> +#define xchg_acquire(ptr, x) \
>>> +({ \
>>> +    __typeof__(*(ptr)) x_ = (x); \
>>> +    (__typeof__(*(ptr))) __xchg_acquire((ptr), x_,
>>> sizeof(*(ptr))); \
>>> +})
>>> +
>>> +#define __xchg_release(ptr, new, size) \
>>> +({ \
>>> +    __typeof__(ptr) ptr__ = (ptr); \
>>> +    __typeof__(new) new__ = (new); \
>>> +    __typeof__(*(ptr)) ret__; \
>>> +    switch (size) \
>>> +	{ \
>>> +    case 4: \
>>> +        asm volatile ( \
>>> +            RISCV_RELEASE_BARRIER \
>>> +            "	amoswap.w %0, %2, %1\n" \
>>> +            : "=r" (ret__), "+A" (*ptr__) \
>>> +            : "r" (new__) \
>>> +            : "memory"); \
>>> +        break; \
>>> +    case 8: \
>>> +        asm volatile ( \
>>> +            RISCV_RELEASE_BARRIER \
>>> +            "	amoswap.d %0, %2, %1\n" \
>>> +            : "=r" (ret__), "+A" (*ptr__) \
>>> +            : "r" (new__) \
>>> +            : "memory"); \
>>> +        break; \
>>> +    default: \
>>> +        ASSERT_UNREACHABLE(); \
>>> +    } \
>>> +    ret__; \
>>> +})
>>
>> this only in the use of RISCV_RELEASE_BARRIER. If so they likely want
>> folding, to limit redundancy and make eventual updating easier. (Same
>> for the cmpxchg helper further down, as it seems.)
> Yes, you are right. The difference is only in RISCV_RELEASE_BARRIER and
> it is an absence of RISCV_RELEASE_BARRIER is a reason why we have
> "relaxed" versions.
> 
> I am not sure that I understand what you mean by folding here. Do you
> mean that there is no any sense to have to separate macros and it is
> needed only one with RISCV_RELEASE_BARRIER?

No. You should parameterize the folded common macro for the derived
macros to simply pass in the barriers needed, with empty macro
arguments indicating "this barrier not needed".
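A minimal sketch of such a folded macro (the __xchg_generic name and the
parameter order are only illustrative; this assumes RISCV_ACQUIRE_BARRIER
and RISCV_RELEASE_BARRIER expand to string literals as in asm/fence.h, so
"" means "no barrier"):

    #define __xchg_generic(ptr, new, size, barrier_before, barrier_after) \
    ({ \
        __typeof__(ptr) ptr__ = (ptr); \
        __typeof__(*(ptr)) new__ = (new); \
        __typeof__(*(ptr)) ret__; \
        switch ( size ) \
        { \
        case 4: \
            asm volatile ( barrier_before \
                           "amoswap.w %0, %2, %1\n" \
                           barrier_after \
                           : "=r" (ret__), "+A" (*ptr__) \
                           : "r" (new__) \
                           : "memory" ); \
            break; \
        case 8: \
            asm volatile ( barrier_before \
                           "amoswap.d %0, %2, %1\n" \
                           barrier_after \
                           : "=r" (ret__), "+A" (*ptr__) \
                           : "r" (new__) \
                           : "memory" ); \
            break; \
        default: \
            ASSERT_UNREACHABLE(); \
        } \
        ret__; \
    })

    #define __xchg_relaxed(ptr, new, size) \
        __xchg_generic(ptr, new, size, "", "")
    #define __xchg_acquire(ptr, new, size) \
        __xchg_generic(ptr, new, size, "", RISCV_ACQUIRE_BARRIER)
    #define __xchg_release(ptr, new, size) \
        __xchg_generic(ptr, new, size, RISCV_RELEASE_BARRIER, "")

Declaring new__ as __typeof__(*(ptr)) also addresses the earlier remark
about the types of new and *ptr having to match.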

>>> +#define xchg_release(ptr, x) \
>>> +({ \
>>> +    __typeof__(*(ptr)) x_ = (x); \
>>> +    (__typeof__(*(ptr))) __xchg_release((ptr), x_,
>>> sizeof(*(ptr))); \
>>> +})
>>> +
>>> +static always_inline uint32_t __xchg_case_4(volatile uint32_t
>>> *ptr,
>>> +                                            uint32_t new)
>>> +{
>>> +    __typeof__(*(ptr)) ret;
>>> +
>>> +    asm volatile (
>>> +        "   amoswap.w.aqrl %0, %2, %1\n"
>>> +        : "=r" (ret), "+A" (*ptr)
>>> +        : "r" (new)
>>> +        : "memory" );
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static always_inline uint64_t __xchg_case_8(volatile uint64_t
>>> *ptr,
>>> +                                            uint64_t new)
>>> +{
>>> +    __typeof__(*(ptr)) ret;
>>> +
>>> +    asm volatile( \
>>> +        "   amoswap.d.aqrl %0, %2, %1\n" \
>>> +        : "=r" (ret), "+A" (*ptr) \
>>> +        : "r" (new) \
>>> +        : "memory" ); \
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static always_inline unsigned short __cmpxchg_case_2(volatile
>>> uint32_t *ptr,
>>> +                                                     uint32_t old,
>>> +                                                     uint32_t
>>> new);
>>
>> Don't you consistently mean uint16_t here (incl the return type) and
>> ...
>>
>>> +static always_inline unsigned short __cmpxchg_case_1(volatile
>>> uint32_t *ptr,
>>> +                                                     uint32_t old,
>>> +                                                     uint32_t
>>> new);
>>
>> ... uint8_t here?
> The idea was that we emulate __cmpxchg_case_1 and __cmpxchg_case_2
> using 4 bytes cmpxchg and __cmpxchg_case_4 expects uint32_t.

I consider this wrong. The functions would better be type-correct.

>>> +static inline unsigned long __xchg(volatile void *ptr, unsigned
>>> long x, int size)
>>> +{
>>> +    switch (size) {
>>> +    case 1:
>>> +        return __cmpxchg_case_1(ptr, (uint32_t)-1, x);
>>> +    case 2:
>>> +        return __cmpxchg_case_2(ptr, (uint32_t)-1, x);
>>
>> How are these going to work? You'll compare against ~0, and if the
>> value
>> in memory isn't ~0, memory won't be updated; you will only
>> (correctly)
>> return the value found in memory.
>>
>> Or wait - looking at __cmpxchg_case_{1,2}() far further down, you
>> ignore
>> "old" there. Which apparently means they'll work for the use here,
>> but
>> not for the use in __cmpxchg().
> Yes, the trick is that old is ignored and is read in
> __emulate_cmpxchg_case1_2() before __cmpxchg_case_4 is called:
>     do {                                                              
>         read_val = read_func(aligned_ptr);                            
>         swapped_new = read_val & ~mask;                               
>         swapped_new |= masked_new;                                    
>         ret = cmpxchg_func(aligned_ptr, read_val, swapped_new);       
>     } while ( ret != read_val );                                      
> read_val it is 'old'.
> 
> But now I am not 100% sure that it is correct for __cmpxchg...

It just can't be correct - you can't ignore "old" there. I think you
want simple cmpxchg primitives, which xchg then uses in a loop (while
cmpxchg uses them plainly).
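For the 1-byte case such a loop might look roughly like this
(__cmpxchg_case_4 as in the patch; the emulate_xchg_1 name and the
little-endian shift math are just for illustration, and alignment/barrier
considerations are left out):

    static always_inline uint8_t emulate_xchg_1(volatile void *ptr, uint8_t new)
    {
        volatile uint32_t *aligned = (volatile uint32_t *)
                                     ((unsigned long)ptr & ~3UL);
        unsigned int shift = ((unsigned long)ptr & 3) * 8;
        uint32_t mask = 0xffU << shift;
        uint32_t old, ret;

        do {
            old = *aligned;                    /* current value of the full word */
            ret = __cmpxchg_case_4(aligned, old,
                                   (old & ~mask) | ((uint32_t)new << shift));
        } while ( ret != old );                /* retry if the word changed meanwhile */

        return (old & mask) >> shift;          /* previous value of the byte */
    }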

>>> +static always_inline unsigned short __cmpxchg_case_2(volatile
>>> uint32_t *ptr,
>>> +                                                     uint32_t old,
>>> +                                                     uint32_t new)
>>> +{
>>> +    (void) old;
>>> +
>>> +    if (((unsigned long)ptr & 3) == 3)
>>> +    {
>>> +#ifdef CONFIG_64BIT
>>> +        return __emulate_cmpxchg_case1_2((uint64_t *)ptr, new,
>>> +                                         readq, __cmpxchg_case_8,
>>> 0xffffU);
>>
>> What if ((unsigned long)ptr & 7) == 7 (which is a sub-case of what
>> the
>> if() above checks for? Isn't it more reasonable to require aligned
>> 16-bit quantities here? Or if mis-aligned addresses are okay, you
>> could
>> as well emulate using __cmpxchg_case_4().
> Yes, it will be more reasonable. I'll use IS_ALIGNED instead.

Not sure I get your use of "instead" here correctly. There's more
to change here than just the if() condition.

Jan
Oleksii Kurochko Jan. 23, 2024, 12:18 p.m. UTC | #4
On Tue, 2024-01-23 at 11:28 +0100, Jan Beulich wrote:
> On 23.01.2024 11:15, Oleksii wrote:
> > On Mon, 2024-01-22 at 17:27 +0100, Jan Beulich wrote:
> > > On 22.12.2023 16:12, Oleksii Kurochko wrote:
> > > > +            : "=r" (ret__), "+A" (*ptr__) \
> > > > +            : "r" (new__) \
> > > > +            : "memory" ); \
> > > > +        break; \
> > > > +    case 8: \
> > > > +        asm volatile( \
> > > > +            "	amoswap.d %0, %2, %1\n" \
> > > > +            : "=r" (ret__), "+A" (*ptr__) \
> > > > +            : "r" (new__) \
> > > > +            : "memory" ); \
> > > > +        break; \
> > > > +    default: \
> > > > +        ASSERT_UNREACHABLE(); \
> > > 
> > > If at all possible this wants to trigger a build failure, not a
> > > runtime
> > > one.
> > I'll update that with BUILD_BUG_ON(size > 8);
> 
> What about size accidentally being e.g. 5? I'm also not sure you'll
> be
> able to use BUILD_BUG_ON() here: That'll depend on what use sites
> there
> are. And if all present ones are okay in this regard, you'd still set
> out a trap for someone else to fall into later. We have a common
> approach for this, which currently is under re-work. See
> https://lists.xen.org/archives/html/xen-devel/2024-01/msg01115.html.
Thanks. I'll use that.

> 
> > > > +    } \
> > > > +    ret__; \
> > > > +})
> > > > +
> > > > +#define xchg_relaxed(ptr, x) \
> > > > +({ \
> > > > +    __typeof__(*(ptr)) x_ = (x); \
> > > > +    (__typeof__(*(ptr))) __xchg_relaxed((ptr), x_,
> > > > sizeof(*(ptr))); \
> > > 
> > > Nit: Stray blank after cast. For readability I'd also suggest to
> > > drop parentheses in cases like the first argument passed to
> > > __xchg_relaxed() here.
> > Thanks. I'll take that into account.
> > 
> > > 
> > > > +})
> > > 
> > > For both: What does "relaxed" describe? I'm asking because it's
> > > not
> > > really clear whether the memory clobbers are actually needed.
> > > 
> > > > +#define __xchg_acquire(ptr, new, size) \
> > > > +({ \
> > > > +    __typeof__(ptr) ptr__ = (ptr); \
> > > > +    __typeof__(new) new__ = (new); \
> > > > +    __typeof__(*(ptr)) ret__; \
> > > > +    switch (size) \
> > > > +	{ \
> > > > +    case 4: \
> > > > +        asm volatile( \
> > > > +            "	amoswap.w %0, %2, %1\n" \
> > > > +            RISCV_ACQUIRE_BARRIER \
> > > > +            : "=r" (ret__), "+A" (*ptr__) \
> > > > +            : "r" (new__) \
> > > > +            : "memory" ); \
> > > > +        break; \
> > > > +    case 8: \
> > > > +        asm volatile( \
> > > > +            "	amoswap.d %0, %2, %1\n" \
> > > > +            RISCV_ACQUIRE_BARRIER \
> > > > +            : "=r" (ret__), "+A" (*ptr__) \
> > > > +            : "r" (new__) \
> > > > +            : "memory" ); \
> > > > +        break; \
> > > > +    default: \
> > > > +        ASSERT_UNREACHABLE(); \
> > > > +    } \
> > > > +    ret__; \
> > > > +})
> > > 
> > > If I'm not mistaken this differs from __xchg_relaxed() only in
> > > the
> > > use
> > > of RISCV_ACQUIRE_BARRIER, and ...
> > > 
> > > > +#define xchg_acquire(ptr, x) \
> > > > +({ \
> > > > +    __typeof__(*(ptr)) x_ = (x); \
> > > > +    (__typeof__(*(ptr))) __xchg_acquire((ptr), x_,
> > > > sizeof(*(ptr))); \
> > > > +})
> > > > +
> > > > +#define __xchg_release(ptr, new, size) \
> > > > +({ \
> > > > +    __typeof__(ptr) ptr__ = (ptr); \
> > > > +    __typeof__(new) new__ = (new); \
> > > > +    __typeof__(*(ptr)) ret__; \
> > > > +    switch (size) \
> > > > +	{ \
> > > > +    case 4: \
> > > > +        asm volatile ( \
> > > > +            RISCV_RELEASE_BARRIER \
> > > > +            "	amoswap.w %0, %2, %1\n" \
> > > > +            : "=r" (ret__), "+A" (*ptr__) \
> > > > +            : "r" (new__) \
> > > > +            : "memory"); \
> > > > +        break; \
> > > > +    case 8: \
> > > > +        asm volatile ( \
> > > > +            RISCV_RELEASE_BARRIER \
> > > > +            "	amoswap.d %0, %2, %1\n" \
> > > > +            : "=r" (ret__), "+A" (*ptr__) \
> > > > +            : "r" (new__) \
> > > > +            : "memory"); \
> > > > +        break; \
> > > > +    default: \
> > > > +        ASSERT_UNREACHABLE(); \
> > > > +    } \
> > > > +    ret__; \
> > > > +})
> > > 
> > > this only in the use of RISCV_RELEASE_BARRIER. If so they likely
> > > want
> > > folding, to limit redundancy and make eventual updating easier.
> > > (Same
> > > for the cmpxchg helper further down, as it seems.)
> > Yes, you are right. The difference is only in RISCV_RELEASE_BARRIER
> > and
> > it is an absence of RISCV_RELEASE_BARRIER is a reason why we have
> > "relaxed" versions.
> > 
> > I am not sure that I understand what you mean by folding here. Do
> > you
> > mean that there is no any sense to have to separate macros and it
> > is
> > needed only one with RISCV_RELEASE_BARRIER?
> 
> No. You should parameterize the folded common macro for the derived
> macros to simply pass in the barriers needed, with empty macro
> arguments indicating "this barrier not needed".
Now it makes sense to me. Thank you for the explanation.

> 
> > > > +#define xchg_release(ptr, x) \
> > > > +({ \
> > > > +    __typeof__(*(ptr)) x_ = (x); \
> > > > +    (__typeof__(*(ptr))) __xchg_release((ptr), x_,
> > > > sizeof(*(ptr))); \
> > > > +})
> > > > +
> > > > +static always_inline uint32_t __xchg_case_4(volatile uint32_t
> > > > *ptr,
> > > > +                                            uint32_t new)
> > > > +{
> > > > +    __typeof__(*(ptr)) ret;
> > > > +
> > > > +    asm volatile (
> > > > +        "   amoswap.w.aqrl %0, %2, %1\n"
> > > > +        : "=r" (ret), "+A" (*ptr)
> > > > +        : "r" (new)
> > > > +        : "memory" );
> > > > +
> > > > +    return ret;
> > > > +}
> > > > +
> > > > +static always_inline uint64_t __xchg_case_8(volatile uint64_t
> > > > *ptr,
> > > > +                                            uint64_t new)
> > > > +{
> > > > +    __typeof__(*(ptr)) ret;
> > > > +
> > > > +    asm volatile( \
> > > > +        "   amoswap.d.aqrl %0, %2, %1\n" \
> > > > +        : "=r" (ret), "+A" (*ptr) \
> > > > +        : "r" (new) \
> > > > +        : "memory" ); \
> > > > +
> > > > +    return ret;
> > > > +}
> > > > +
> > > > +static always_inline unsigned short __cmpxchg_case_2(volatile
> > > > uint32_t *ptr,
> > > > +                                                     uint32_t
> > > > old,
> > > > +                                                     uint32_t
> > > > new);
> > > 
> > > Don't you consistently mean uint16_t here (incl the return type)
> > > and
> > > ...
> > > 
> > > > +static always_inline unsigned short __cmpxchg_case_1(volatile
> > > > uint32_t *ptr,
> > > > +                                                     uint32_t
> > > > old,
> > > > +                                                     uint32_t
> > > > new);
> > > 
> > > ... uint8_t here?
> > The idea was that we emulate __cmpxchg_case_1 and __cmpxchg_case_2
> > using 4 bytes cmpxchg and __cmpxchg_case_4 expects uint32_t.
> 
> I consider this wrong. The functions would better be type-correct.
> 
> > > > +static inline unsigned long __xchg(volatile void *ptr,
> > > > unsigned
> > > > long x, int size)
> > > > +{
> > > > +    switch (size) {
> > > > +    case 1:
> > > > +        return __cmpxchg_case_1(ptr, (uint32_t)-1, x);
> > > > +    case 2:
> > > > +        return __cmpxchg_case_2(ptr, (uint32_t)-1, x);
> > > 
> > > How are these going to work? You'll compare against ~0, and if
> > > the
> > > value
> > > in memory isn't ~0, memory won't be updated; you will only
> > > (correctly)
> > > return the value found in memory.
> > > 
> > > Or wait - looking at __cmpxchg_case_{1,2}() far further down, you
> > > ignore
> > > "old" there. Which apparently means they'll work for the use
> > > here,
> > > but
> > > not for the use in __cmpxchg().
> > Yes, the trick is that old is ignored and is read in
> > __emulate_cmpxchg_case1_2() before __cmpxchg_case_4 is called:
> >     do {
> >         read_val = read_func(aligned_ptr);
> >         swapped_new = read_val & ~mask;
> >         swapped_new |= masked_new;
> >         ret = cmpxchg_func(aligned_ptr, read_val, swapped_new);
> >     } while ( ret != read_val );
> > read_val it is 'old'.
> > 
> > But now I am not 100% sure that it is correct for __cmpxchg...
> 
> It just can't be correct - you can't ignore "old" there. I think you
> want simple cmpxchg primitives, which xchg then uses in a loop (while
> cmpxchg uses them plainly).
But xchg doesn't require 'old' value, so it should be ignored in some
way by cmpxchg.

> 
> > > > +static always_inline unsigned short __cmpxchg_case_2(volatile
> > > > uint32_t *ptr,
> > > > +                                                     uint32_t
> > > > old,
> > > > +                                                     uint32_t
> > > > new)
> > > > +{
> > > > +    (void) old;
> > > > +
> > > > +    if (((unsigned long)ptr & 3) == 3)
> > > > +    {
> > > > +#ifdef CONFIG_64BIT
> > > > +        return __emulate_cmpxchg_case1_2((uint64_t *)ptr, new,
> > > > +                                         readq,
> > > > __cmpxchg_case_8,
> > > > 0xffffU);
> > > 
> > > What if ((unsigned long)ptr & 7) == 7 (which is a sub-case of
> > > what
> > > the
> > > if() above checks for? Isn't it more reasonable to require
> > > aligned
> > > 16-bit quantities here? Or if mis-aligned addresses are okay, you
> > > could
> > > as well emulate using __cmpxchg_case_4().
> > Yes, it will be more reasonable. I'll use IS_ALIGNED instead.
> 
> Not sure I get your use of "instead" here correctly. There's more
> to change here than just the if() condition.
I meant something like:

if ( IS_ALIGNED(ptr, 16) )
    __emulate_cmpxchg_case1_2(...);
else
    assert_failed("ptr isn't aligned\n");

I have to check whether it is necessary to support mis-aligned addresses
for CAS instructions.

If a mis-aligned address is okay, then it looks like we can always use
__cmpxchg_case_4 without checking whether ptr is aligned or not. Or maybe
I am missing something...

~ Oleksii
Jan Beulich Jan. 23, 2024, 1:27 p.m. UTC | #5
On 23.01.2024 13:18, Oleksii wrote:
> On Tue, 2024-01-23 at 11:28 +0100, Jan Beulich wrote:
>> On 23.01.2024 11:15, Oleksii wrote:
>>> On Mon, 2024-01-22 at 17:27 +0100, Jan Beulich wrote:
>>>> On 22.12.2023 16:12, Oleksii Kurochko wrote:
>>>>> +static inline unsigned long __xchg(volatile void *ptr,
>>>>> unsigned
>>>>> long x, int size)
>>>>> +{
>>>>> +    switch (size) {
>>>>> +    case 1:
>>>>> +        return __cmpxchg_case_1(ptr, (uint32_t)-1, x);
>>>>> +    case 2:
>>>>> +        return __cmpxchg_case_2(ptr, (uint32_t)-1, x);
>>>>
>>>> How are these going to work? You'll compare against ~0, and if
>>>> the
>>>> value
>>>> in memory isn't ~0, memory won't be updated; you will only
>>>> (correctly)
>>>> return the value found in memory.
>>>>
>>>> Or wait - looking at __cmpxchg_case_{1,2}() far further down, you
>>>> ignore
>>>> "old" there. Which apparently means they'll work for the use
>>>> here,
>>>> but
>>>> not for the use in __cmpxchg().
>>> Yes, the trick is that old is ignored and is read in
>>> __emulate_cmpxchg_case1_2() before __cmpxchg_case_4 is called:
>>>     do {
>>>         read_val = read_func(aligned_ptr);
>>>         swapped_new = read_val & ~mask;
>>>         swapped_new |= masked_new;
>>>         ret = cmpxchg_func(aligned_ptr, read_val, swapped_new);
>>>     } while ( ret != read_val );
>>> read_val it is 'old'.
>>>
>>> But now I am not 100% sure that it is correct for __cmpxchg...
>>
>> It just can't be correct - you can't ignore "old" there. I think you
>> want simple cmpxchg primitives, which xchg then uses in a loop (while
>> cmpxchg uses them plainly).
> But xchg doesn't require 'old' value, so it should be ignored in some
> way by cmpxchg.

Well, no. If you have only cmpxchg, I think your only choice is - as
said - to read the old value and then loop over cmpxchg until that
succeeds. Not really different from other operations which need
emulating using cmpxchg.

>>>>> +static always_inline unsigned short __cmpxchg_case_2(volatile
>>>>> uint32_t *ptr,
>>>>> +                                                     uint32_t
>>>>> old,
>>>>> +                                                     uint32_t
>>>>> new)
>>>>> +{
>>>>> +    (void) old;
>>>>> +
>>>>> +    if (((unsigned long)ptr & 3) == 3)
>>>>> +    {
>>>>> +#ifdef CONFIG_64BIT
>>>>> +        return __emulate_cmpxchg_case1_2((uint64_t *)ptr, new,
>>>>> +                                         readq,
>>>>> __cmpxchg_case_8,
>>>>> 0xffffU);
>>>>
>>>> What if ((unsigned long)ptr & 7) == 7 (which is a sub-case of
>>>> what
>>>> the
>>>> if() above checks for? Isn't it more reasonable to require
>>>> aligned
>>>> 16-bit quantities here? Or if mis-aligned addresses are okay, you
>>>> could
>>>> as well emulate using __cmpxchg_case_4().
>>> Yes, it will be more reasonable. I'll use IS_ALIGNED instead.
>>
>> Not sure I get your use of "instead" here correctly. There's more
>> to change here than just the if() condition.
> I meant something like:
> 
> if ( IS_ALIGNED(ptr, 16) )
>     __emulate_cmpxchg_case1_2(...);
> else
>     assert_failed("ptr isn't aligned\n");

Except that you'd better not use assert_failed() directly anywhere,
and the above is easier as

    ASSERT(IS_ALIGNED(ptr, 16));
    __emulate_cmpxchg_case1_2(...);

anyway (leaving aside that I guess you mean 2, not 16).

Jan
Oleksii Kurochko Jan. 24, 2024, 9:15 a.m. UTC | #6
On Tue, 2024-01-23 at 14:27 +0100, Jan Beulich wrote:
> On 23.01.2024 13:18, Oleksii wrote:
> > On Tue, 2024-01-23 at 11:28 +0100, Jan Beulich wrote:
> > > On 23.01.2024 11:15, Oleksii wrote:
> > > > On Mon, 2024-01-22 at 17:27 +0100, Jan Beulich wrote:
> > > > > On 22.12.2023 16:12, Oleksii Kurochko wrote:
> > > > > > +static inline unsigned long __xchg(volatile void *ptr,
> > > > > > unsigned
> > > > > > long x, int size)
> > > > > > +{
> > > > > > +    switch (size) {
> > > > > > +    case 1:
> > > > > > +        return __cmpxchg_case_1(ptr, (uint32_t)-1, x);
> > > > > > +    case 2:
> > > > > > +        return __cmpxchg_case_2(ptr, (uint32_t)-1, x);
> > > > > 
> > > > > How are these going to work? You'll compare against ~0, and
> > > > > if
> > > > > the
> > > > > value
> > > > > in memory isn't ~0, memory won't be updated; you will only
> > > > > (correctly)
> > > > > return the value found in memory.
> > > > > 
> > > > > Or wait - looking at __cmpxchg_case_{1,2}() far further down,
> > > > > you
> > > > > ignore
> > > > > "old" there. Which apparently means they'll work for the use
> > > > > here,
> > > > > but
> > > > > not for the use in __cmpxchg().
> > > > Yes, the trick is that old is ignored and is read in
> > > > __emulate_cmpxchg_case1_2() before __cmpxchg_case_4 is called:
> > > >     do {
> > > >         read_val = read_func(aligned_ptr);
> > > >         swapped_new = read_val & ~mask;
> > > >         swapped_new |= masked_new;
> > > >         ret = cmpxchg_func(aligned_ptr, read_val, swapped_new);
> > > >     } while ( ret != read_val );
> > > > read_val it is 'old'.
> > > > 
> > > > But now I am not 100% sure that it is correct for __cmpxchg...
> > > 
> > > It just can't be correct - you can't ignore "old" there. I think
> > > you
> > > want simple cmpxchg primitives, which xchg then uses in a loop
> > > (while
> > > cmpxchg uses them plainly).
> > But xchg doesn't require 'old' value, so it should be ignored in
> > some
> > way by cmpxchg.
> 
> Well, no. If you have only cmpxchg, I think your only choice is - as
> said - to read the old value and then loop over cmpxchg until that
> succeeds. Not really different from other operations which need
> emulating using cmpxchg.
Then it looks like the main error in __emulate_cmpxchg_case1_2 is that
I read the value each time, so read_val = read_func(aligned_ptr); 
should be before the do {...} while(). Also, it would be better to
rename it to old_val or just old.
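Reading once before the loop and actually honouring the caller's 'old', a
2-byte variant built on the 4-byte primitive could then look roughly like
this (only a sketch with illustrative names; __cmpxchg_case_4 as in the
patch, and the halfword is assumed not to straddle the aligned 32-bit
word):

    static always_inline uint16_t emulate_cmpxchg_2(volatile void *ptr,
                                                    uint16_t old, uint16_t new)
    {
        volatile uint32_t *aligned = (volatile uint32_t *)
                                     ((unsigned long)ptr & ~3UL);
        unsigned int shift = ((unsigned long)ptr & 3) * 8;
        uint32_t mask = 0xffffU << shift;
        uint32_t word = *aligned;              /* read once, before the loop */
        uint32_t prev;

        for ( ; ; )
        {
            if ( ((word & mask) >> shift) != old )
                return (word & mask) >> shift; /* comparison failed */
            prev = __cmpxchg_case_4(aligned, word,
                                    (word & ~mask) | ((uint32_t)new << shift));
            if ( prev == word )
                return old;                    /* swap happened */
            word = prev;                       /* another byte changed; retry */
        }
    }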

> 
> > > > > > +static always_inline unsigned short
> > > > > > __cmpxchg_case_2(volatile
> > > > > > uint32_t *ptr,
> > > > > > +                                                    
> > > > > > uint32_t
> > > > > > old,
> > > > > > +                                                    
> > > > > > uint32_t
> > > > > > new)
> > > > > > +{
> > > > > > +    (void) old;
> > > > > > +
> > > > > > +    if (((unsigned long)ptr & 3) == 3)
> > > > > > +    {
> > > > > > +#ifdef CONFIG_64BIT
> > > > > > +        return __emulate_cmpxchg_case1_2((uint64_t *)ptr,
> > > > > > new,
> > > > > > +                                         readq,
> > > > > > __cmpxchg_case_8,
> > > > > > 0xffffU);
> > > > > 
> > > > > What if ((unsigned long)ptr & 7) == 7 (which is a sub-case of
> > > > > what
> > > > > the
> > > > > if() above checks for? Isn't it more reasonable to require
> > > > > aligned
> > > > > 16-bit quantities here? Or if mis-aligned addresses are okay,
> > > > > you
> > > > > could
> > > > > as well emulate using __cmpxchg_case_4().
> > > > Yes, it will be more reasonable. I'll use IS_ALIGNED instead.
> > > 
> > > Not sure I get your use of "instead" here correctly. There's more
> > > to change here than just the if() condition.
> > I meant something like:
> > 
> > if ( IS_ALIGNED(ptr, 16) )
> >     __emulate_cmpxchg_case1_2(...);
> > else
> >     assert_failed("ptr isn't aligned\n");
> 
> Except that you'd better not use assert_failed() directly anywhere,
> and the above is easier as
> 
>     ASSERT(IS_ALIGNED(ptr, 16));
>     __emulate_cmpxchg_case1_2(...);
> 
> anyway (leaving aside that I guess you mean 2, not 16).
Yeah, it should be 2. Thanks.

~ Oleksii
Oleksii Kurochko Jan. 30, 2024, 2:57 p.m. UTC | #7
On Mon, 2024-01-22 at 17:27 +0100, Jan Beulich wrote:
> > +#define __xchg_acquire(ptr, new, size) \
> > +({ \
> > +    __typeof__(ptr) ptr__ = (ptr); \
> > +    __typeof__(new) new__ = (new); \
> > +    __typeof__(*(ptr)) ret__; \
> > +    switch (size) \
> > +	{ \
> > +    case 4: \
> > +        asm volatile( \
> > +            "	amoswap.w %0, %2, %1\n" \
> > +            RISCV_ACQUIRE_BARRIER \
> > +            : "=r" (ret__), "+A" (*ptr__) \
> > +            : "r" (new__) \
> > +            : "memory" ); \
> > +        break; \
> > +    case 8: \
> > +        asm volatile( \
> > +            "	amoswap.d %0, %2, %1\n" \
> > +            RISCV_ACQUIRE_BARRIER \
> > +            : "=r" (ret__), "+A" (*ptr__) \
> > +            : "r" (new__) \
> > +            : "memory" ); \
> > +        break; \
> > +    default: \
> > +        ASSERT_UNREACHABLE(); \
> > +    } \
> > +    ret__; \
> > +})
> 
> If I'm not mistaken this differs from __xchg_relaxed() only in the
> use
> of RISCV_ACQUIRE_BARRIER, and ...
> 
> > +#define xchg_acquire(ptr, x) \
> > +({ \
> > +    __typeof__(*(ptr)) x_ = (x); \
> > +    (__typeof__(*(ptr))) __xchg_acquire((ptr), x_,
> > sizeof(*(ptr))); \
> > +})
> > +
> > +#define __xchg_release(ptr, new, size) \
> > +({ \
> > +    __typeof__(ptr) ptr__ = (ptr); \
> > +    __typeof__(new) new__ = (new); \
> > +    __typeof__(*(ptr)) ret__; \
> > +    switch (size) \
> > +	{ \
> > +    case 4: \
> > +        asm volatile ( \
> > +            RISCV_RELEASE_BARRIER \
> > +            "	amoswap.w %0, %2, %1\n" \
> > +            : "=r" (ret__), "+A" (*ptr__) \
> > +            : "r" (new__) \
> > +            : "memory"); \
> > +        break; \
> > +    case 8: \
> > +        asm volatile ( \
> > +            RISCV_RELEASE_BARRIER \
> > +            "	amoswap.d %0, %2, %1\n" \
> > +            : "=r" (ret__), "+A" (*ptr__) \
> > +            : "r" (new__) \
> > +            : "memory"); \
> > +        break; \
> > +    default: \
> > +        ASSERT_UNREACHABLE(); \
> > +    } \
> > +    ret__; \
> > +})
> 
> this only in the use of RISCV_RELEASE_BARRIER. If so they likely want
> folding, to limit redundancy and make eventual updating easier. (Same
> for the cmpxchg helper further down, as it seems.)
Also the difference is in where to place the barrier, before or after the
atomic instruction. I am not sure that we can easily fold these macros.

~ Oleksii
Jan Beulich Jan. 30, 2024, 3:05 p.m. UTC | #8
On 30.01.2024 15:57, Oleksii wrote:
> On Mon, 2024-01-22 at 17:27 +0100, Jan Beulich wrote:
>>> +#define __xchg_acquire(ptr, new, size) \
>>> +({ \
>>> +    __typeof__(ptr) ptr__ = (ptr); \
>>> +    __typeof__(new) new__ = (new); \
>>> +    __typeof__(*(ptr)) ret__; \
>>> +    switch (size) \
>>> +	{ \
>>> +    case 4: \
>>> +        asm volatile( \
>>> +            "	amoswap.w %0, %2, %1\n" \
>>> +            RISCV_ACQUIRE_BARRIER \
>>> +            : "=r" (ret__), "+A" (*ptr__) \
>>> +            : "r" (new__) \
>>> +            : "memory" ); \
>>> +        break; \
>>> +    case 8: \
>>> +        asm volatile( \
>>> +            "	amoswap.d %0, %2, %1\n" \
>>> +            RISCV_ACQUIRE_BARRIER \
>>> +            : "=r" (ret__), "+A" (*ptr__) \
>>> +            : "r" (new__) \
>>> +            : "memory" ); \
>>> +        break; \
>>> +    default: \
>>> +        ASSERT_UNREACHABLE(); \
>>> +    } \
>>> +    ret__; \
>>> +})
>>
>> If I'm not mistaken this differs from __xchg_relaxed() only in the
>> use
>> of RISCV_ACQUIRE_BARRIER, and ...
>>
>>> +#define xchg_acquire(ptr, x) \
>>> +({ \
>>> +    __typeof__(*(ptr)) x_ = (x); \
>>> +    (__typeof__(*(ptr))) __xchg_acquire((ptr), x_,
>>> sizeof(*(ptr))); \
>>> +})
>>> +
>>> +#define __xchg_release(ptr, new, size) \
>>> +({ \
>>> +    __typeof__(ptr) ptr__ = (ptr); \
>>> +    __typeof__(new) new__ = (new); \
>>> +    __typeof__(*(ptr)) ret__; \
>>> +    switch (size) \
>>> +	{ \
>>> +    case 4: \
>>> +        asm volatile ( \
>>> +            RISCV_RELEASE_BARRIER \
>>> +            "	amoswap.w %0, %2, %1\n" \
>>> +            : "=r" (ret__), "+A" (*ptr__) \
>>> +            : "r" (new__) \
>>> +            : "memory"); \
>>> +        break; \
>>> +    case 8: \
>>> +        asm volatile ( \
>>> +            RISCV_RELEASE_BARRIER \
>>> +            "	amoswap.d %0, %2, %1\n" \
>>> +            : "=r" (ret__), "+A" (*ptr__) \
>>> +            : "r" (new__) \
>>> +            : "memory"); \
>>> +        break; \
>>> +    default: \
>>> +        ASSERT_UNREACHABLE(); \
>>> +    } \
>>> +    ret__; \
>>> +})
>>
>> this only in the use of RISCV_RELEASE_BARRIER. If so they likely want
>> folding, to limit redundancy and make eventual updating easier. (Same
>> for the cmpxchg helper further down, as it seems.)
> Also the difference is in where to place the barrier, before or after the
> atomic instruction. I am not sure that we can easily fold these macros.

The folded macro would have two barrier parameters - one for acquire, one
for release.

Jan
Oleksii Kurochko Jan. 30, 2024, 3:16 p.m. UTC | #9
On Tue, 2024-01-30 at 16:05 +0100, Jan Beulich wrote:
> On 30.01.2024 15:57, Oleksii wrote:
> > On Mon, 2024-01-22 at 17:27 +0100, Jan Beulich wrote:
> > > > +#define __xchg_acquire(ptr, new, size) \
> > > > +({ \
> > > > +    __typeof__(ptr) ptr__ = (ptr); \
> > > > +    __typeof__(new) new__ = (new); \
> > > > +    __typeof__(*(ptr)) ret__; \
> > > > +    switch (size) \
> > > > +	{ \
> > > > +    case 4: \
> > > > +        asm volatile( \
> > > > +            "	amoswap.w %0, %2, %1\n" \
> > > > +            RISCV_ACQUIRE_BARRIER \
> > > > +            : "=r" (ret__), "+A" (*ptr__) \
> > > > +            : "r" (new__) \
> > > > +            : "memory" ); \
> > > > +        break; \
> > > > +    case 8: \
> > > > +        asm volatile( \
> > > > +            "	amoswap.d %0, %2, %1\n" \
> > > > +            RISCV_ACQUIRE_BARRIER \
> > > > +            : "=r" (ret__), "+A" (*ptr__) \
> > > > +            : "r" (new__) \
> > > > +            : "memory" ); \
> > > > +        break; \
> > > > +    default: \
> > > > +        ASSERT_UNREACHABLE(); \
> > > > +    } \
> > > > +    ret__; \
> > > > +})
> > > 
> > > If I'm not mistaken this differs from __xchg_relaxed() only in
> > > the
> > > use
> > > of RISCV_ACQUIRE_BARRIER, and ...
> > > 
> > > > +#define xchg_acquire(ptr, x) \
> > > > +({ \
> > > > +    __typeof__(*(ptr)) x_ = (x); \
> > > > +    (__typeof__(*(ptr))) __xchg_acquire((ptr), x_,
> > > > sizeof(*(ptr))); \
> > > > +})
> > > > +
> > > > +#define __xchg_release(ptr, new, size) \
> > > > +({ \
> > > > +    __typeof__(ptr) ptr__ = (ptr); \
> > > > +    __typeof__(new) new__ = (new); \
> > > > +    __typeof__(*(ptr)) ret__; \
> > > > +    switch (size) \
> > > > +	{ \
> > > > +    case 4: \
> > > > +        asm volatile ( \
> > > > +            RISCV_RELEASE_BARRIER \
> > > > +            "	amoswap.w %0, %2, %1\n" \
> > > > +            : "=r" (ret__), "+A" (*ptr__) \
> > > > +            : "r" (new__) \
> > > > +            : "memory"); \
> > > > +        break; \
> > > > +    case 8: \
> > > > +        asm volatile ( \
> > > > +            RISCV_RELEASE_BARRIER \
> > > > +            "	amoswap.d %0, %2, %1\n" \
> > > > +            : "=r" (ret__), "+A" (*ptr__) \
> > > > +            : "r" (new__) \
> > > > +            : "memory"); \
> > > > +        break; \
> > > > +    default: \
> > > > +        ASSERT_UNREACHABLE(); \
> > > > +    } \
> > > > +    ret__; \
> > > > +})
> > > 
> > > this only in the use of RISCV_RELEASE_BARRIER. If so they likely
> > > want
> > > folding, to limit redundancy and make eventual updating easier.
> > > (Same
> > > for the cmpxchg helper further down, as it seems.)
> > Also the difference is in where to place the barrier, before or after
> > the atomic instruction. I am not sure that we can easily fold these
> > macros.
> 
> The folded macro would have two barrier parameters - one for acquire,
> one for release.
Yes, in such a case it will work.

Thanks.

~ Oleksii

Patch

diff --git a/xen/arch/riscv/include/asm/cmpxchg.h b/xen/arch/riscv/include/asm/cmpxchg.h
new file mode 100644
index 0000000000..916776c403
--- /dev/null
+++ b/xen/arch/riscv/include/asm/cmpxchg.h
@@ -0,0 +1,496 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2014 Regents of the University of California */
+
+#ifndef _ASM_RISCV_CMPXCHG_H
+#define _ASM_RISCV_CMPXCHG_H
+
+#include <xen/compiler.h>
+#include <xen/lib.h>
+
+#include <asm/fence.h>
+#include <asm/io.h>
+#include <asm/system.h>
+
+#define __xchg_relaxed(ptr, new, size) \
+({ \
+    __typeof__(ptr) ptr__ = (ptr); \
+    __typeof__(new) new__ = (new); \
+    __typeof__(*(ptr)) ret__; \
+    switch (size) \
+	{ \
+    case 4: \
+        asm volatile( \
+            "	amoswap.w %0, %2, %1\n" \
+            : "=r" (ret__), "+A" (*ptr__) \
+            : "r" (new__) \
+            : "memory" ); \
+        break; \
+    case 8: \
+        asm volatile( \
+            "	amoswap.d %0, %2, %1\n" \
+            : "=r" (ret__), "+A" (*ptr__) \
+            : "r" (new__) \
+            : "memory" ); \
+        break; \
+    default: \
+        ASSERT_UNREACHABLE(); \
+    } \
+    ret__; \
+})
+
+#define xchg_relaxed(ptr, x) \
+({ \
+    __typeof__(*(ptr)) x_ = (x); \
+    (__typeof__(*(ptr))) __xchg_relaxed((ptr), x_, sizeof(*(ptr))); \
+})
+
+#define __xchg_acquire(ptr, new, size) \
+({ \
+    __typeof__(ptr) ptr__ = (ptr); \
+    __typeof__(new) new__ = (new); \
+    __typeof__(*(ptr)) ret__; \
+    switch (size) \
+	{ \
+    case 4: \
+        asm volatile( \
+            "	amoswap.w %0, %2, %1\n" \
+            RISCV_ACQUIRE_BARRIER \
+            : "=r" (ret__), "+A" (*ptr__) \
+            : "r" (new__) \
+            : "memory" ); \
+        break; \
+    case 8: \
+        asm volatile( \
+            "	amoswap.d %0, %2, %1\n" \
+            RISCV_ACQUIRE_BARRIER \
+            : "=r" (ret__), "+A" (*ptr__) \
+            : "r" (new__) \
+            : "memory" ); \
+        break; \
+    default: \
+        ASSERT_UNREACHABLE(); \
+    } \
+    ret__; \
+})
+
+#define xchg_acquire(ptr, x) \
+({ \
+    __typeof__(*(ptr)) x_ = (x); \
+    (__typeof__(*(ptr))) __xchg_acquire((ptr), x_, sizeof(*(ptr))); \
+})
+
+#define __xchg_release(ptr, new, size) \
+({ \
+    __typeof__(ptr) ptr__ = (ptr); \
+    __typeof__(new) new__ = (new); \
+    __typeof__(*(ptr)) ret__; \
+    switch (size) \
+	{ \
+    case 4: \
+        asm volatile ( \
+            RISCV_RELEASE_BARRIER \
+            "	amoswap.w %0, %2, %1\n" \
+            : "=r" (ret__), "+A" (*ptr__) \
+            : "r" (new__) \
+            : "memory"); \
+        break; \
+    case 8: \
+        asm volatile ( \
+            RISCV_RELEASE_BARRIER \
+            "	amoswap.d %0, %2, %1\n" \
+            : "=r" (ret__), "+A" (*ptr__) \
+            : "r" (new__) \
+            : "memory"); \
+        break; \
+    default: \
+        ASSERT_UNREACHABLE(); \
+    } \
+    ret__; \
+})
+
+#define xchg_release(ptr, x) \
+({ \
+    __typeof__(*(ptr)) x_ = (x); \
+    (__typeof__(*(ptr))) __xchg_release((ptr), x_, sizeof(*(ptr))); \
+})
+
+static always_inline uint32_t __xchg_case_4(volatile uint32_t *ptr,
+                                            uint32_t new)
+{
+    __typeof__(*(ptr)) ret;
+
+    asm volatile (
+        "   amoswap.w.aqrl %0, %2, %1\n"
+        : "=r" (ret), "+A" (*ptr)
+        : "r" (new)
+        : "memory" );
+
+    return ret;
+}
+
+static always_inline uint64_t __xchg_case_8(volatile uint64_t *ptr,
+                                            uint64_t new)
+{
+    __typeof__(*(ptr)) ret;
+
+    asm volatile (
+        "   amoswap.d.aqrl %0, %2, %1\n"
+        : "=r" (ret), "+A" (*ptr)
+        : "r" (new)
+        : "memory" );
+
+    return ret;
+}
+
+static always_inline unsigned short __cmpxchg_case_2(volatile uint32_t *ptr,
+                                                     uint32_t old,
+                                                     uint32_t new);
+
+static always_inline unsigned short __cmpxchg_case_1(volatile uint32_t *ptr,
+                                                     uint32_t old,
+                                                     uint32_t new);
+
+static inline unsigned long __xchg(volatile void *ptr, unsigned long x, int size)
+{
+    switch (size) {
+    case 1:
+        return __cmpxchg_case_1(ptr, (uint32_t)-1, x);
+    case 2:
+        return __cmpxchg_case_2(ptr, (uint32_t)-1, x);
+    case 4:
+        return __xchg_case_4(ptr, x);
+    case 8:
+        return __xchg_case_8(ptr, x);
+    default:
+        ASSERT_UNREACHABLE();
+    }
+
+    return -1;
+}
+
+#define xchg(ptr,x) \
+({ \
+    __typeof__(*(ptr)) ret__; \
+    ret__ = (__typeof__(*(ptr))) \
+            __xchg((ptr), (unsigned long)(x), sizeof(*(ptr))); \
+    ret__; \
+})
+
+#define xchg32(ptr, x) \
+({ \
+    BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
+    xchg((ptr), (x)); \
+})
+
+#define xchg64(ptr, x) \
+({ \
+    BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+    xchg((ptr), (x)); \
+})
+
+/*
+ * Atomic compare and exchange.  Compare OLD with MEM, if identical,
+ * store NEW in MEM.  Return the initial value in MEM.  Success is
+ * indicated by comparing RETURN with OLD.
+ */
+#define __cmpxchg_relaxed(ptr, old, new, size) \
+({ \
+    __typeof__(ptr) ptr__ = (ptr); \
+    __typeof__(*(ptr)) __old = (old); \
+    __typeof__(*(ptr)) new__ = (new); \
+    __typeof__(*(ptr)) ret__; \
+    register unsigned int __rc; \
+    switch (size) \
+	{ \
+    case 4: \
+        asm volatile( \
+            "0:	lr.w %0, %2\n" \
+            "	bne  %0, %z3, 1f\n" \
+            "	sc.w %1, %z4, %2\n" \
+            "	bnez %1, 0b\n" \
+            "1:\n" \
+            : "=&r" (ret__), "=&r" (__rc), "+A" (*ptr__) \
+            : "rJ" (__old), "rJ" (new__) \
+            : "memory"); \
+        break; \
+    case 8: \
+        asm volatile( \
+            "0:	lr.d %0, %2\n" \
+            "	bne %0, %z3, 1f\n" \
+            "	sc.d %1, %z4, %2\n" \
+            "	bnez %1, 0b\n" \
+            "1:\n" \
+            : "=&r" (ret__), "=&r" (__rc), "+A" (*ptr__) \
+            : "rJ" (__old), "rJ" (new__) \
+            : "memory"); \
+        break; \
+    default: \
+        ASSERT_UNREACHABLE(); \
+    } \
+    ret__; \
+})
+
+#define cmpxchg_relaxed(ptr, o, n) \
+({ \
+    __typeof__(*(ptr)) o_ = (o); \
+    __typeof__(*(ptr)) n_ = (n); \
+    (__typeof__(*(ptr))) __cmpxchg_relaxed((ptr), \
+                    o_, n_, sizeof(*(ptr))); \
+})
+
+#define __cmpxchg_acquire(ptr, old, new, size) \
+({ \
+    __typeof__(ptr) ptr__ = (ptr); \
+    __typeof__(*(ptr)) __old = (old); \
+    __typeof__(*(ptr)) new__ = (new); \
+    __typeof__(*(ptr)) ret__; \
+    register unsigned int __rc; \
+    switch (size) \
+	{ \
+    case 4: \
+        asm volatile( \
+            "0:	lr.w %0, %2\n" \
+            "	bne  %0, %z3, 1f\n" \
+            "	sc.w %1, %z4, %2\n" \
+            "	bnez %1, 0b\n" \
+            RISCV_ACQUIRE_BARRIER \
+            "1:\n"	 \
+            : "=&r" (ret__), "=&r" (__rc), "+A" (*ptr__) \
+            : "rJ" (__old), "rJ" (new__) \
+            : "memory"); \
+        break; \
+    case 8: \
+        asm volatile( \
+            "0:	lr.d %0, %2\n" \
+            "	bne %0, %z3, 1f\n" \
+            "	sc.d %1, %z4, %2\n" \
+            "	bnez %1, 0b\n" \
+            RISCV_ACQUIRE_BARRIER \
+            "1:\n" \
+            : "=&r" (ret__), "=&r" (__rc), "+A" (*ptr__) \
+            : "rJ" (__old), "rJ" (new__) \
+            : "memory"); \
+        break; \
+    default: \
+        ASSERT_UNREACHABLE(); \
+    } \
+    ret__; \
+})
+
+#define cmpxchg_acquire(ptr, o, n) \
+({ \
+    __typeof__(*(ptr)) o_ = (o); \
+    __typeof__(*(ptr)) n_ = (n); \
+    (__typeof__(*(ptr))) __cmpxchg_acquire((ptr), o_, n_, sizeof(*(ptr))); \
+})
+
+#define __cmpxchg_release(ptr, old, new, size) \
+({									\
+    __typeof__(ptr) ptr__ = (ptr); \
+    __typeof__(*(ptr)) __old = (old); \
+    __typeof__(*(ptr)) new__ = (new); \
+    __typeof__(*(ptr)) ret__; \
+    register unsigned int __rc; \
+    switch (size) \
+	{ \
+    case 4: \
+        asm volatile ( \
+            RISCV_RELEASE_BARRIER \
+            "0:	lr.w %0, %2\n" \
+            "	bne  %0, %z3, 1f\n" \
+            "	sc.w %1, %z4, %2\n" \
+            "	bnez %1, 0b\n" \
+            "1:\n" \
+            : "=&r" (ret__), "=&r" (__rc), "+A" (*ptr__)	\
+            : "rJ" (__old), "rJ" (new__) \
+            : "memory" ); \
+        break; \
+    case 8: \
+        asm volatile ( \
+            RISCV_RELEASE_BARRIER \
+            "0:	lr.d %0, %2\n" \
+            "	bne %0, %z3, 1f\n" \
+            "	sc.d %1, %z4, %2\n" \
+            "	bnez %1, 0b\n" \
+            "1:\n" \
+            : "=&r" (ret__), "=&r" (__rc), "+A" (*ptr__) \
+            : "rJ" (__old), "rJ" (new__) \
+            : "memory" ); \
+        break; \
+    default: \
+        ASSERT_UNREACHABLE(); \
+    } \
+    ret__; \
+})
+
+#define cmpxchg_release(ptr, o, n) \
+({ \
+    __typeof__(*(ptr)) _o_ = (o); \
+    __typeof__(*(ptr)) _n_ = (n); \
+    (__typeof__(*(ptr))) __cmpxchg_release((ptr), _o_, _n_, sizeof(*(ptr))); \
+})
+
+static always_inline uint32_t __cmpxchg_case_4(volatile uint32_t *ptr,
+                                               uint32_t old,
+                                               uint32_t new)
+{
+    uint32_t ret;
+    register uint32_t rc;
+
+    asm volatile (
+        "0: lr.w %0, %2\n"
+        "   bne  %0, %z3, 1f\n"
+        "   sc.w.rl %1, %z4, %2\n"
+        "   bnez %1, 0b\n"
+        "   fence rw, rw\n"
+        "1:\n"
+        : "=&r" (ret), "=&r" (rc), "+A" (*ptr)
+        : "rJ" (old), "rJ" (new)
+        : "memory" );
+
+    return ret;
+}
+
+static always_inline uint64_t __cmpxchg_case_8(volatile uint64_t *ptr,
+                                               uint64_t old,
+                                               uint64_t new)
+{
+    uint64_t ret;
+    register uint32_t rc;
+
+    asm volatile(
+        "0: lr.d %0, %2\n"
+        "   bne %0, %z3, 1f\n"
+        "   sc.d.rl %1, %z4, %2\n"
+        "   bnez %1, 0b\n"
+        "   fence rw, rw\n"
+        "1:\n"
+        : "=&r" (ret), "=&r" (rc), "+A" (*ptr)
+        : "rJ" (old), "rJ" (new)
+        : "memory");
+
+    return ret;
+}
+
+#define __emulate_cmpxchg_case1_2(ptr, new, read_func, cmpxchg_func, swap_byte_mask_base)\
+({                                                                              \
+    __typeof__(*(ptr)) read_val;                                                \
+    __typeof__(*(ptr)) swapped_new;                                             \
+    __typeof__(*(ptr)) ret;                                                     \
+    __typeof__(*(ptr)) new_ = (__typeof__(*(ptr)))new;                          \
+                                                                                \
+    __typeof__(ptr) aligned_ptr = (__typeof__(ptr))((unsigned long)ptr & ~3);   \
+    __typeof__(*(ptr)) mask_off = ((unsigned long)ptr & 3) * 8;                 \
+    __typeof__(*(ptr)) mask =                                                   \
+      (__typeof__(*(ptr)))swap_byte_mask_base << mask_off;                      \
+    __typeof__(*(ptr)) masked_new = (new_ << mask_off) & mask;                  \
+                                                                                \
+    do {                                                                        \
+        read_val = read_func(aligned_ptr);                                      \
+        swapped_new = read_val & ~mask;                                         \
+        swapped_new |= masked_new;                                              \
+        ret = cmpxchg_func(aligned_ptr, read_val, swapped_new);                 \
+    } while ( ret != read_val );                                                \
+                                                                                \
+    ret = MASK_EXTR(swapped_new, mask);                                         \
+    ret;                                                                        \
+})
+
+static always_inline unsigned short __cmpxchg_case_2(volatile uint32_t *ptr,
+                                                     uint32_t old,
+                                                     uint32_t new)
+{
+    (void) old;
+
+    if (((unsigned long)ptr & 3) == 3)
+    {
+#ifdef CONFIG_64BIT
+        return __emulate_cmpxchg_case1_2((uint64_t *)ptr, new,
+                                         readq, __cmpxchg_case_8, 0xffffU);
+#else
+        #error "add emulation support of cmpxchg for CONFIG_32BIT"
+#endif
+    }
+    else
+        return __emulate_cmpxchg_case1_2((uint32_t *)ptr, new,
+                                         readl, __cmpxchg_case_4, 0xffffU);
+}
+
+static always_inline unsigned short __cmpxchg_case_1(volatile uint32_t *ptr,
+                                                     uint32_t old,
+                                                     uint32_t new)
+{
+    (void) old;
+
+    return __emulate_cmpxchg_case1_2((uint32_t *)ptr, new,
+                                     readl, __cmpxchg_case_4, 0xffU);
+}
+
+static always_inline unsigned long __cmpxchg(volatile void *ptr,
+                                             unsigned long old,
+                                             unsigned long new,
+                                             int size)
+{
+    switch (size)
+    {
+    case 1:
+        return __cmpxchg_case_1(ptr, old, new);
+    case 2:
+        return __cmpxchg_case_2(ptr, old, new);
+    case 4:
+        return __cmpxchg_case_4(ptr, old, new);
+    case 8:
+        return __cmpxchg_case_8(ptr, old, new);
+    default:
+        ASSERT_UNREACHABLE();
+    }
+
+    return old;
+}
+
+#define cmpxchg(ptr, o, n) \
+({ \
+    __typeof__(*(ptr)) ret__; \
+    ret__ = (__typeof__(*(ptr))) \
+            __cmpxchg((ptr), (unsigned long)(o), (unsigned long)(n), \
+                      sizeof(*(ptr))); \
+    ret__; \
+})
+
+#define cmpxchg_local(ptr, o, n) \
+    (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
+
+#define cmpxchg32(ptr, o, n) \
+({ \
+    BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
+    cmpxchg((ptr), (o), (n)); \
+})
+
+#define cmpxchg32_local(ptr, o, n) \
+({ \
+    BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
+    cmpxchg_relaxed((ptr), (o), (n)); \
+})
+
+#define cmpxchg64(ptr, o, n) \
+({ \
+    BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+    cmpxchg((ptr), (o), (n)); \
+})
+
+#define cmpxchg64_local(ptr, o, n) \
+({ \
+    BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+    cmpxchg_relaxed((ptr), (o), (n)); \
+})
+
+#endif /* _ASM_RISCV_CMPXCHG_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */