Message ID | 1616580892-80815-1-git-send-email-guoren@kernel.org (mailing list archive)
---|---
State | New, archived
Series | riscv: locks: introduce ticket-based spinlock implementation
On Wed, Mar 24, 2021 at 10:14:52AM +0000, guoren@kernel.org wrote:
> +static inline void arch_spin_lock(arch_spinlock_t *lock)
> +{
> +	arch_spinlock_t lockval;
> +	u32 tmp;
> +
> +	asm volatile (
> +		"1:	lr.w	%0, %2		\n"
> +		"	mv	%1, %0		\n"
> +		"	addw	%0, %0, %3	\n"
> +		"	sc.w	%0, %0, %2	\n"
> +		"	bnez	%0, 1b		\n"
> +		: "=&r" (tmp), "=&r" (lockval), "+A" (lock->lock)
> +		: "r" (1 << TICKET_NEXT)
> +		: "memory");
>
> +	while (lockval.tickets.next != lockval.tickets.owner) {
> +		/*
> +		 * FIXME - we need wfi/wfe here to prevent:
> +		 *	- cache line bouncing
> +		 *	- saving cpu pipeline in multi-harts-per-core
> +		 *	  processor
> +		 */
> +		lockval.tickets.owner = READ_ONCE(lock->tickets.owner);
> +	}
>
> +	__atomic_acquire_fence();
> }
> +static inline void arch_spin_unlock(arch_spinlock_t *lock)
> {
> +	smp_store_release(&lock->tickets.owner, lock->tickets.owner + 1);
> +	/* FIXME - we need ipi/sev here to notify above */
> }

Urgh, are you saying your WFE requires an explicit SEV like on ARM ? The
ARM64 model is far superior IMO, and then you can use
smp_cond_load_acquire() in arch_spin_lock() and call it a day.
Thx Peter, On Wed, Mar 24, 2021 at 7:09 PM Peter Zijlstra <peterz@infradead.org> wrote: > > On Wed, Mar 24, 2021 at 10:14:52AM +0000, guoren@kernel.org wrote: > > +static inline void arch_spin_lock(arch_spinlock_t *lock) > > +{ > > + arch_spinlock_t lockval; > > + u32 tmp; > > + > > + asm volatile ( > > + "1: lr.w %0, %2 \n" > > + " mv %1, %0 \n" > > + " addw %0, %0, %3 \n" > > + " sc.w %0, %0, %2 \n" > > + " bnez %0, 1b \n" > > + : "=&r" (tmp), "=&r" (lockval), "+A" (lock->lock) > > + : "r" (1 << TICKET_NEXT) > > + : "memory"); > > > > + while (lockval.tickets.next != lockval.tickets.owner) { > > + /* > > + * FIXME - we need wfi/wfe here to prevent: > > + * - cache line bouncing > > + * - saving cpu pipeline in multi-harts-per-core > > + * processor > > + */ > > + lockval.tickets.owner = READ_ONCE(lock->tickets.owner); > > + } > > > > + __atomic_acquire_fence(); > > } > > > +static inline void arch_spin_unlock(arch_spinlock_t *lock) > > { > > + smp_store_release(&lock->tickets.owner, lock->tickets.owner + 1); > > + /* FIXME - we need ipi/sev here to notify above */ > > } > > Urgh, are you saying your WFE requires an explicit SEV like on ARM ? The Yes, I'm considering that kind of code. > ARM64 model is far superious IMO, and then you can use > smp_cond_load_acquire() in arch_spin_lock() and call it a day. Great tip, thx. I'll follow that.
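For reference, a minimal sketch of what the smp_cond_load_acquire() suggestion could look like against the ticket layout in this patch. It is untested; the atomic_fetch_add() ticket grab and the atomic_t view of the lock word are assumptions (borrowed from how the discussion continues below), not something either mail spells out.

static inline void arch_spin_lock(arch_spinlock_t *lock)
{
	/* Take a ticket: atomically bump 'next' and keep the old value. */
	u32 old = atomic_fetch_add(1 << TICKET_NEXT, (atomic_t *)&lock->lock);
	u16 ticket = old >> TICKET_NEXT;

	/* Uncontended fast path: our ticket already equals the owner. */
	if (ticket == (u16)old)
		return;

	/*
	 * smp_cond_load_acquire() re-reads the owner field until the
	 * condition on VAL holds, lets an architecture plug in a
	 * WFE-style wait if it has one, and provides the acquire
	 * ordering that the explicit __atomic_acquire_fence() gave.
	 */
	smp_cond_load_acquire(&lock->tickets.owner, VAL == ticket);
}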
On Wed, Mar 24, 2021 at 12:15:47PM +0100, Vitaly Wool wrote: > On Wed, Mar 24, 2021, 11:16 AM <guoren@kernel.org> wrote: > > > From: Guo Ren <guoren@linux.alibaba.com> > > > > This patch introduces a ticket lock implementation for riscv, along the > > same lines as the implementation for arch/arm & arch/csky. > > > > Could you please provide a rationale for this? Like, what is wrong with the > current implementation. test-and-set spinlocks have terrible worst case behaviour.
On Wed, Mar 24, 2021 at 7:16 PM Vitaly Wool <vitaly.wool@konsulko.com> wrote: > > > > On Wed, Mar 24, 2021, 11:16 AM <guoren@kernel.org> wrote: >> >> From: Guo Ren <guoren@linux.alibaba.com> >> >> This patch introduces a ticket lock implementation for riscv, along the >> same lines as the implementation for arch/arm & arch/csky. > > > Could you please provide a rationale for this? Like, what is wrong with the current implementation. Ticket based spinlock's principle is here: https://lwn.net/Articles/267968/ Current implementation will cause cache line bouncing when many harts are acquiring the same spinlock. I'm seeking a solution, maybe not fitting the current RISC-V base ISA. I'll add more comments in the next version of patch. > > Thanks in advance, > > Best regards, > Vitaly >> >> >> Signed-off-by: Guo Ren <guoren@linux.alibaba.com> >> Cc: Catalin Marinas <catalin.marinas@arm.com> >> Cc: Will Deacon <will.deacon@arm.com> >> Cc: Peter Zijlstra <peterz@infradead.org> >> Cc: Palmer Dabbelt <palmerdabbelt@google.com> >> Cc: Anup Patel <anup@brainfault.org> >> Cc: Arnd Bergmann <arnd@arndb.de> >> --- >> arch/riscv/Kconfig | 1 + >> arch/riscv/include/asm/Kbuild | 1 + >> arch/riscv/include/asm/spinlock.h | 158 ++++++++++++-------------------- >> arch/riscv/include/asm/spinlock_types.h | 19 ++-- >> 4 files changed, 74 insertions(+), 105 deletions(-) >> >> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig >> index 87d7b52..7c56a20 100644 >> --- a/arch/riscv/Kconfig >> +++ b/arch/riscv/Kconfig >> @@ -30,6 +30,7 @@ config RISCV >> select ARCH_HAS_STRICT_KERNEL_RWX if MMU >> select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX >> select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT >> + select ARCH_USE_QUEUED_RWLOCKS >> select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU >> select ARCH_WANT_FRAME_POINTERS >> select ARCH_WANT_HUGE_PMD_SHARE if 64BIT >> diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild >> index 445ccc9..e57ef80 100644 >> --- a/arch/riscv/include/asm/Kbuild >> +++ b/arch/riscv/include/asm/Kbuild >> @@ -3,5 +3,6 @@ generic-y += early_ioremap.h >> generic-y += extable.h >> generic-y += flat.h >> generic-y += kvm_para.h >> +generic-y += qrwlock.h >> generic-y += user.h >> generic-y += vmlinux.lds.h >> diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h >> index f4f7fa1..2c81764 100644 >> --- a/arch/riscv/include/asm/spinlock.h >> +++ b/arch/riscv/include/asm/spinlock.h >> @@ -7,129 +7,91 @@ >> #ifndef _ASM_RISCV_SPINLOCK_H >> #define _ASM_RISCV_SPINLOCK_H >> >> -#include <linux/kernel.h> >> -#include <asm/current.h> >> -#include <asm/fence.h> >> - >> /* >> - * Simple spin lock operations. These provide no fairness guarantees. >> + * Ticket-based spin-locking. >> */ >> +static inline void arch_spin_lock(arch_spinlock_t *lock) >> +{ >> + arch_spinlock_t lockval; >> + u32 tmp; >> + >> + asm volatile ( >> + "1: lr.w %0, %2 \n" >> + " mv %1, %0 \n" >> + " addw %0, %0, %3 \n" >> + " sc.w %0, %0, %2 \n" >> + " bnez %0, 1b \n" >> + : "=&r" (tmp), "=&r" (lockval), "+A" (lock->lock) >> + : "r" (1 << TICKET_NEXT) >> + : "memory"); >> >> -/* FIXME: Replace this with a ticket lock, like MIPS. 
*/ >> - >> -#define arch_spin_is_locked(x) (READ_ONCE((x)->lock) != 0) >> + while (lockval.tickets.next != lockval.tickets.owner) { >> + /* >> + * FIXME - we need wfi/wfe here to prevent: >> + * - cache line bouncing >> + * - saving cpu pipeline in multi-harts-per-core >> + * processor >> + */ >> + lockval.tickets.owner = READ_ONCE(lock->tickets.owner); >> + } >> >> -static inline void arch_spin_unlock(arch_spinlock_t *lock) >> -{ >> - smp_store_release(&lock->lock, 0); >> + __atomic_acquire_fence(); >> } >> >> static inline int arch_spin_trylock(arch_spinlock_t *lock) >> { >> - int tmp = 1, busy; >> - >> - __asm__ __volatile__ ( >> - " amoswap.w %0, %2, %1\n" >> - RISCV_ACQUIRE_BARRIER >> - : "=r" (busy), "+A" (lock->lock) >> - : "r" (tmp) >> + u32 tmp, contended, res; >> + >> + do { >> + asm volatile ( >> + " lr.w %0, %3 \n" >> + " srliw %1, %0, %5 \n" >> + " slliw %2, %0, %5 \n" >> + " or %1, %2, %1 \n" >> + " li %2, 0 \n" >> + " sub %1, %1, %0 \n" >> + " bnez %1, 1f \n" >> + " addw %0, %0, %4 \n" >> + " sc.w %2, %0, %3 \n" >> + "1: \n" >> + : "=&r" (tmp), "=&r" (contended), "=&r" (res), >> + "+A" (lock->lock) >> + : "r" (1 << TICKET_NEXT), "I" (TICKET_NEXT) >> : "memory"); >> + } while (res); >> >> - return !busy; >> -} >> - >> -static inline void arch_spin_lock(arch_spinlock_t *lock) >> -{ >> - while (1) { >> - if (arch_spin_is_locked(lock)) >> - continue; >> - >> - if (arch_spin_trylock(lock)) >> - break; >> + if (!contended) { >> + __atomic_acquire_fence(); >> + return 1; >> + } else { >> + return 0; >> } >> } >> >> -/***********************************************************/ >> - >> -static inline void arch_read_lock(arch_rwlock_t *lock) >> +static inline void arch_spin_unlock(arch_spinlock_t *lock) >> { >> - int tmp; >> - >> - __asm__ __volatile__( >> - "1: lr.w %1, %0\n" >> - " bltz %1, 1b\n" >> - " addi %1, %1, 1\n" >> - " sc.w %1, %1, %0\n" >> - " bnez %1, 1b\n" >> - RISCV_ACQUIRE_BARRIER >> - : "+A" (lock->lock), "=&r" (tmp) >> - :: "memory"); >> + smp_store_release(&lock->tickets.owner, lock->tickets.owner + 1); >> + /* FIXME - we need ipi/sev here to notify above */ >> } >> >> -static inline void arch_write_lock(arch_rwlock_t *lock) >> +static inline int arch_spin_value_unlocked(arch_spinlock_t lock) >> { >> - int tmp; >> - >> - __asm__ __volatile__( >> - "1: lr.w %1, %0\n" >> - " bnez %1, 1b\n" >> - " li %1, -1\n" >> - " sc.w %1, %1, %0\n" >> - " bnez %1, 1b\n" >> - RISCV_ACQUIRE_BARRIER >> - : "+A" (lock->lock), "=&r" (tmp) >> - :: "memory"); >> + return lock.tickets.owner == lock.tickets.next; >> } >> >> -static inline int arch_read_trylock(arch_rwlock_t *lock) >> +static inline int arch_spin_is_locked(arch_spinlock_t *lock) >> { >> - int busy; >> - >> - __asm__ __volatile__( >> - "1: lr.w %1, %0\n" >> - " bltz %1, 1f\n" >> - " addi %1, %1, 1\n" >> - " sc.w %1, %1, %0\n" >> - " bnez %1, 1b\n" >> - RISCV_ACQUIRE_BARRIER >> - "1:\n" >> - : "+A" (lock->lock), "=&r" (busy) >> - :: "memory"); >> - >> - return !busy; >> + return !arch_spin_value_unlocked(READ_ONCE(*lock)); >> } >> >> -static inline int arch_write_trylock(arch_rwlock_t *lock) >> +static inline int arch_spin_is_contended(arch_spinlock_t *lock) >> { >> - int busy; >> - >> - __asm__ __volatile__( >> - "1: lr.w %1, %0\n" >> - " bnez %1, 1f\n" >> - " li %1, -1\n" >> - " sc.w %1, %1, %0\n" >> - " bnez %1, 1b\n" >> - RISCV_ACQUIRE_BARRIER >> - "1:\n" >> - : "+A" (lock->lock), "=&r" (busy) >> - :: "memory"); >> + struct __raw_tickets tickets = READ_ONCE(lock->tickets); >> >> - return !busy; >> + return (tickets.next - 
tickets.owner) > 1; >> } >> +#define arch_spin_is_contended arch_spin_is_contended >> >> -static inline void arch_read_unlock(arch_rwlock_t *lock) >> -{ >> - __asm__ __volatile__( >> - RISCV_RELEASE_BARRIER >> - " amoadd.w x0, %1, %0\n" >> - : "+A" (lock->lock) >> - : "r" (-1) >> - : "memory"); >> -} >> - >> -static inline void arch_write_unlock(arch_rwlock_t *lock) >> -{ >> - smp_store_release(&lock->lock, 0); >> -} >> +#include <asm/qrwlock.h> >> >> #endif /* _ASM_RISCV_SPINLOCK_H */ >> diff --git a/arch/riscv/include/asm/spinlock_types.h b/arch/riscv/include/asm/spinlock_types.h >> index f398e76..d7b38bf 100644 >> --- a/arch/riscv/include/asm/spinlock_types.h >> +++ b/arch/riscv/include/asm/spinlock_types.h >> @@ -10,16 +10,21 @@ >> # error "please don't include this file directly" >> #endif >> >> +#define TICKET_NEXT 16 >> + >> typedef struct { >> - volatile unsigned int lock; >> + union { >> + u32 lock; >> + struct __raw_tickets { >> + /* little endian */ >> + u16 owner; >> + u16 next; >> + } tickets; >> + }; >> } arch_spinlock_t; >> >> -#define __ARCH_SPIN_LOCK_UNLOCKED { 0 } >> - >> -typedef struct { >> - volatile unsigned int lock; >> -} arch_rwlock_t; >> +#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } >> >> -#define __ARCH_RW_LOCK_UNLOCKED { 0 } >> +#include <asm-generic/qrwlock_types.h> >> >> #endif /* _ASM_RISCV_SPINLOCK_TYPES_H */ >> -- >> 2.7.4 >> >> >> _______________________________________________ >> linux-riscv mailing list >> linux-riscv@lists.infradead.org >> http://lists.infradead.org/mailman/listinfo/linux-riscv
On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote: > > From: Guo Ren <guoren@linux.alibaba.com> > > This patch introduces a ticket lock implementation for riscv, along the > same lines as the implementation for arch/arm & arch/csky. > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com> > Cc: Catalin Marinas <catalin.marinas@arm.com> > Cc: Will Deacon <will.deacon@arm.com> > Cc: Peter Zijlstra <peterz@infradead.org> > Cc: Palmer Dabbelt <palmerdabbelt@google.com> > Cc: Anup Patel <anup@brainfault.org> > Cc: Arnd Bergmann <arnd@arndb.de> > --- > arch/riscv/Kconfig | 1 + > arch/riscv/include/asm/Kbuild | 1 + > arch/riscv/include/asm/spinlock.h | 158 ++++++++++++-------------------- > arch/riscv/include/asm/spinlock_types.h | 19 ++-- NACK from myside. Linux ARM64 has moved away from ticket spinlock to qspinlock. We should directly go for qspinlock. Regards, Anup > 4 files changed, 74 insertions(+), 105 deletions(-) > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > index 87d7b52..7c56a20 100644 > --- a/arch/riscv/Kconfig > +++ b/arch/riscv/Kconfig > @@ -30,6 +30,7 @@ config RISCV > select ARCH_HAS_STRICT_KERNEL_RWX if MMU > select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX > select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT > + select ARCH_USE_QUEUED_RWLOCKS > select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU > select ARCH_WANT_FRAME_POINTERS > select ARCH_WANT_HUGE_PMD_SHARE if 64BIT > diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild > index 445ccc9..e57ef80 100644 > --- a/arch/riscv/include/asm/Kbuild > +++ b/arch/riscv/include/asm/Kbuild > @@ -3,5 +3,6 @@ generic-y += early_ioremap.h > generic-y += extable.h > generic-y += flat.h > generic-y += kvm_para.h > +generic-y += qrwlock.h > generic-y += user.h > generic-y += vmlinux.lds.h > diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h > index f4f7fa1..2c81764 100644 > --- a/arch/riscv/include/asm/spinlock.h > +++ b/arch/riscv/include/asm/spinlock.h > @@ -7,129 +7,91 @@ > #ifndef _ASM_RISCV_SPINLOCK_H > #define _ASM_RISCV_SPINLOCK_H > > -#include <linux/kernel.h> > -#include <asm/current.h> > -#include <asm/fence.h> > - > /* > - * Simple spin lock operations. These provide no fairness guarantees. > + * Ticket-based spin-locking. > */ > +static inline void arch_spin_lock(arch_spinlock_t *lock) > +{ > + arch_spinlock_t lockval; > + u32 tmp; > + > + asm volatile ( > + "1: lr.w %0, %2 \n" > + " mv %1, %0 \n" > + " addw %0, %0, %3 \n" > + " sc.w %0, %0, %2 \n" > + " bnez %0, 1b \n" > + : "=&r" (tmp), "=&r" (lockval), "+A" (lock->lock) > + : "r" (1 << TICKET_NEXT) > + : "memory"); > > -/* FIXME: Replace this with a ticket lock, like MIPS. 
*/ > - > -#define arch_spin_is_locked(x) (READ_ONCE((x)->lock) != 0) > + while (lockval.tickets.next != lockval.tickets.owner) { > + /* > + * FIXME - we need wfi/wfe here to prevent: > + * - cache line bouncing > + * - saving cpu pipeline in multi-harts-per-core > + * processor > + */ > + lockval.tickets.owner = READ_ONCE(lock->tickets.owner); > + } > > -static inline void arch_spin_unlock(arch_spinlock_t *lock) > -{ > - smp_store_release(&lock->lock, 0); > + __atomic_acquire_fence(); > } > > static inline int arch_spin_trylock(arch_spinlock_t *lock) > { > - int tmp = 1, busy; > - > - __asm__ __volatile__ ( > - " amoswap.w %0, %2, %1\n" > - RISCV_ACQUIRE_BARRIER > - : "=r" (busy), "+A" (lock->lock) > - : "r" (tmp) > + u32 tmp, contended, res; > + > + do { > + asm volatile ( > + " lr.w %0, %3 \n" > + " srliw %1, %0, %5 \n" > + " slliw %2, %0, %5 \n" > + " or %1, %2, %1 \n" > + " li %2, 0 \n" > + " sub %1, %1, %0 \n" > + " bnez %1, 1f \n" > + " addw %0, %0, %4 \n" > + " sc.w %2, %0, %3 \n" > + "1: \n" > + : "=&r" (tmp), "=&r" (contended), "=&r" (res), > + "+A" (lock->lock) > + : "r" (1 << TICKET_NEXT), "I" (TICKET_NEXT) > : "memory"); > + } while (res); > > - return !busy; > -} > - > -static inline void arch_spin_lock(arch_spinlock_t *lock) > -{ > - while (1) { > - if (arch_spin_is_locked(lock)) > - continue; > - > - if (arch_spin_trylock(lock)) > - break; > + if (!contended) { > + __atomic_acquire_fence(); > + return 1; > + } else { > + return 0; > } > } > > -/***********************************************************/ > - > -static inline void arch_read_lock(arch_rwlock_t *lock) > +static inline void arch_spin_unlock(arch_spinlock_t *lock) > { > - int tmp; > - > - __asm__ __volatile__( > - "1: lr.w %1, %0\n" > - " bltz %1, 1b\n" > - " addi %1, %1, 1\n" > - " sc.w %1, %1, %0\n" > - " bnez %1, 1b\n" > - RISCV_ACQUIRE_BARRIER > - : "+A" (lock->lock), "=&r" (tmp) > - :: "memory"); > + smp_store_release(&lock->tickets.owner, lock->tickets.owner + 1); > + /* FIXME - we need ipi/sev here to notify above */ > } > > -static inline void arch_write_lock(arch_rwlock_t *lock) > +static inline int arch_spin_value_unlocked(arch_spinlock_t lock) > { > - int tmp; > - > - __asm__ __volatile__( > - "1: lr.w %1, %0\n" > - " bnez %1, 1b\n" > - " li %1, -1\n" > - " sc.w %1, %1, %0\n" > - " bnez %1, 1b\n" > - RISCV_ACQUIRE_BARRIER > - : "+A" (lock->lock), "=&r" (tmp) > - :: "memory"); > + return lock.tickets.owner == lock.tickets.next; > } > > -static inline int arch_read_trylock(arch_rwlock_t *lock) > +static inline int arch_spin_is_locked(arch_spinlock_t *lock) > { > - int busy; > - > - __asm__ __volatile__( > - "1: lr.w %1, %0\n" > - " bltz %1, 1f\n" > - " addi %1, %1, 1\n" > - " sc.w %1, %1, %0\n" > - " bnez %1, 1b\n" > - RISCV_ACQUIRE_BARRIER > - "1:\n" > - : "+A" (lock->lock), "=&r" (busy) > - :: "memory"); > - > - return !busy; > + return !arch_spin_value_unlocked(READ_ONCE(*lock)); > } > > -static inline int arch_write_trylock(arch_rwlock_t *lock) > +static inline int arch_spin_is_contended(arch_spinlock_t *lock) > { > - int busy; > - > - __asm__ __volatile__( > - "1: lr.w %1, %0\n" > - " bnez %1, 1f\n" > - " li %1, -1\n" > - " sc.w %1, %1, %0\n" > - " bnez %1, 1b\n" > - RISCV_ACQUIRE_BARRIER > - "1:\n" > - : "+A" (lock->lock), "=&r" (busy) > - :: "memory"); > + struct __raw_tickets tickets = READ_ONCE(lock->tickets); > > - return !busy; > + return (tickets.next - tickets.owner) > 1; > } > +#define arch_spin_is_contended arch_spin_is_contended > > -static inline void arch_read_unlock(arch_rwlock_t 
*lock) > -{ > - __asm__ __volatile__( > - RISCV_RELEASE_BARRIER > - " amoadd.w x0, %1, %0\n" > - : "+A" (lock->lock) > - : "r" (-1) > - : "memory"); > -} > - > -static inline void arch_write_unlock(arch_rwlock_t *lock) > -{ > - smp_store_release(&lock->lock, 0); > -} > +#include <asm/qrwlock.h> > > #endif /* _ASM_RISCV_SPINLOCK_H */ > diff --git a/arch/riscv/include/asm/spinlock_types.h b/arch/riscv/include/asm/spinlock_types.h > index f398e76..d7b38bf 100644 > --- a/arch/riscv/include/asm/spinlock_types.h > +++ b/arch/riscv/include/asm/spinlock_types.h > @@ -10,16 +10,21 @@ > # error "please don't include this file directly" > #endif > > +#define TICKET_NEXT 16 > + > typedef struct { > - volatile unsigned int lock; > + union { > + u32 lock; > + struct __raw_tickets { > + /* little endian */ > + u16 owner; > + u16 next; > + } tickets; > + }; > } arch_spinlock_t; > > -#define __ARCH_SPIN_LOCK_UNLOCKED { 0 } > - > -typedef struct { > - volatile unsigned int lock; > -} arch_rwlock_t; > +#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } > > -#define __ARCH_RW_LOCK_UNLOCKED { 0 } > +#include <asm-generic/qrwlock_types.h> > > #endif /* _ASM_RISCV_SPINLOCK_TYPES_H */ > -- > 2.7.4 >
On Wed, Mar 24, 2021 at 08:24:34PM +0800, Guo Ren wrote: > On Wed, Mar 24, 2021 at 7:16 PM Vitaly Wool <vitaly.wool@konsulko.com> wrote: > > > > > > > > On Wed, Mar 24, 2021, 11:16 AM <guoren@kernel.org> wrote: > >> > >> From: Guo Ren <guoren@linux.alibaba.com> > >> > >> This patch introduces a ticket lock implementation for riscv, along the > >> same lines as the implementation for arch/arm & arch/csky. > > > > > > Could you please provide a rationale for this? Like, what is wrong with the current implementation. > Ticket based spinlock's principle is here: > https://lwn.net/Articles/267968/ > > Current implementation will cause cache line bouncing when many harts > are acquiring the same spinlock. > I'm seeking a solution, maybe not fitting the current RISC-V base ISA. Ticket locks as such don't solve the cacheline bouncing part, since they're all still spinning on the same line. The big improvement ticket locks bring is that the lock acquisition time becomes a function of the longest hold time, instead of being unbounded. However, combine it with the WFE (preferably the ARM64 variant) and you can avoid the worst of the bouncing. If you really want to get rid of the bouncing, go with qspinlock, which will spin on a cpu local line. That said, qspinlock is quite gnarly code, and only really wins from ticket when you have NUMA.
On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote: > On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote: > > > > From: Guo Ren <guoren@linux.alibaba.com> > > > > This patch introduces a ticket lock implementation for riscv, along the > > same lines as the implementation for arch/arm & arch/csky. > > > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com> > > Cc: Catalin Marinas <catalin.marinas@arm.com> > > Cc: Will Deacon <will.deacon@arm.com> > > Cc: Peter Zijlstra <peterz@infradead.org> > > Cc: Palmer Dabbelt <palmerdabbelt@google.com> > > Cc: Anup Patel <anup@brainfault.org> > > Cc: Arnd Bergmann <arnd@arndb.de> > > --- > > arch/riscv/Kconfig | 1 + > > arch/riscv/include/asm/Kbuild | 1 + > > arch/riscv/include/asm/spinlock.h | 158 ++++++++++++-------------------- > > arch/riscv/include/asm/spinlock_types.h | 19 ++-- > > NACK from myside. > > Linux ARM64 has moved away from ticket spinlock to qspinlock. > > We should directly go for qspinlock. I think it is a sensible intermediate step, even if you want to go qspinlock. Ticket locks are more or less trivial and get you fairness and all that goodness without the mind bending complexity of qspinlock. Once you have the ticket lock implementation solid (and qrwlock) and everything, *then* start to carefully look at qspinlock. Now, arguably arm64 did the heavy lifting of making qspinlock good on weak architectures, but if you want to do it right, you still have to analyze the whole thing for your own architecture.
On Wed, Mar 24, 2021 at 6:08 PM Peter Zijlstra <peterz@infradead.org> wrote: > > On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote: > > On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote: > > > > > > From: Guo Ren <guoren@linux.alibaba.com> > > > > > > This patch introduces a ticket lock implementation for riscv, along the > > > same lines as the implementation for arch/arm & arch/csky. > > > > > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com> > > > Cc: Catalin Marinas <catalin.marinas@arm.com> > > > Cc: Will Deacon <will.deacon@arm.com> > > > Cc: Peter Zijlstra <peterz@infradead.org> > > > Cc: Palmer Dabbelt <palmerdabbelt@google.com> > > > Cc: Anup Patel <anup@brainfault.org> > > > Cc: Arnd Bergmann <arnd@arndb.de> > > > --- > > > arch/riscv/Kconfig | 1 + > > > arch/riscv/include/asm/Kbuild | 1 + > > > arch/riscv/include/asm/spinlock.h | 158 ++++++++++++-------------------- > > > arch/riscv/include/asm/spinlock_types.h | 19 ++-- > > > > NACK from myside. > > > > Linux ARM64 has moved away from ticket spinlock to qspinlock. > > > > We should directly go for qspinlock. > > I think it is a sensible intermediate step, even if you want to go > qspinlock. Ticket locks are more or less trivial and get you fairness > and all that goodness without the mind bending complexity of qspinlock. > > Once you have the ticket lock implementation solid (and qrwlock) and > everything, *then* start to carefully look at qspinlock. I do understand qspinlock are relatively complex but the best thing about qspinlock is it tries to ensure each CPU spins on it's own location. Instead of adding ticket spinlock now and later replacing it with qspinlock, it is better to straight away explore qspinlock hence my NACK. > > Now, arguably arm64 did the heavy lifting of making qspinlock good on > weak architectures, but if you want to do it right, you still have to > analyze the whole thing for your own architecture. Most of the RISC-V implementations are weak memory ordering so it makes more sense to explore qspinlock first. Regards, Anup
On Wed, 24 Mar 2021 05:53:51 PDT (-0700), anup@brainfault.org wrote: > On Wed, Mar 24, 2021 at 6:08 PM Peter Zijlstra <peterz@infradead.org> wrote: >> >> On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote: >> > On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote: >> > > >> > > From: Guo Ren <guoren@linux.alibaba.com> >> > > >> > > This patch introduces a ticket lock implementation for riscv, along the >> > > same lines as the implementation for arch/arm & arch/csky. >> > > >> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com> >> > > Cc: Catalin Marinas <catalin.marinas@arm.com> >> > > Cc: Will Deacon <will.deacon@arm.com> >> > > Cc: Peter Zijlstra <peterz@infradead.org> >> > > Cc: Palmer Dabbelt <palmerdabbelt@google.com> >> > > Cc: Anup Patel <anup@brainfault.org> >> > > Cc: Arnd Bergmann <arnd@arndb.de> >> > > --- >> > > arch/riscv/Kconfig | 1 + >> > > arch/riscv/include/asm/Kbuild | 1 + >> > > arch/riscv/include/asm/spinlock.h | 158 ++++++++++++-------------------- >> > > arch/riscv/include/asm/spinlock_types.h | 19 ++-- >> > >> > NACK from myside. >> > >> > Linux ARM64 has moved away from ticket spinlock to qspinlock. >> > >> > We should directly go for qspinlock. >> >> I think it is a sensible intermediate step, even if you want to go >> qspinlock. Ticket locks are more or less trivial and get you fairness >> and all that goodness without the mind bending complexity of qspinlock. >> >> Once you have the ticket lock implementation solid (and qrwlock) and >> everything, *then* start to carefully look at qspinlock. > > I do understand qspinlock are relatively complex but the best thing > about qspinlock is it tries to ensure each CPU spins on it's own location. > > Instead of adding ticket spinlock now and later replacing it with qspinlock, > it is better to straight away explore qspinlock hence my NACK. > >> >> Now, arguably arm64 did the heavy lifting of making qspinlock good on >> weak architectures, but if you want to do it right, you still have to >> analyze the whole thing for your own architecture. > > Most of the RISC-V implementations are weak memory ordering so it > makes more sense to explore qspinlock first. I know I'm somewhat late to the party here. I talked with Will (and to a lesser extent Peter) about this a week or two ago and it seems the best way to go here is to start with ticket locks. They're simpler, and in Arm land they performed better until we got to the larger systems. Given that we don't have any high performance implementations of the RISC-V memory model (and likely won't any time soon) it's hard to reason about the performance of anything like this, but at a bare minimum having fair locks is a pretty big positive and ticket locks should have very little overhead while providing fairness. IMO the decision between ticket and queueing locks is really more of a property of the hardware/workload than the ISA, though there are of course some pretty deep ISA dependencies than can make one saner than the other. It seems best to me to just allow users to pick their own flavor of locks, and at least PPC is already doing that. I threw together a quick asm-generic ticket lock that can be selected at compile time, but I want to spend some more time playing with the other architectures before sending anything out.
On Sun, Apr 11, 2021 at 11:11 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > On Wed, 24 Mar 2021 05:53:51 PDT (-0700), anup@brainfault.org wrote: > > On Wed, Mar 24, 2021 at 6:08 PM Peter Zijlstra <peterz@infradead.org> wrote: > >> > >> On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote: > >> > On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote: > >> > > > >> > > From: Guo Ren <guoren@linux.alibaba.com> > >> > > > >> > > This patch introduces a ticket lock implementation for riscv, along the > >> > > same lines as the implementation for arch/arm & arch/csky. > >> > > > >> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com> > >> > > Cc: Catalin Marinas <catalin.marinas@arm.com> > >> > > Cc: Will Deacon <will.deacon@arm.com> > >> > > Cc: Peter Zijlstra <peterz@infradead.org> > >> > > Cc: Palmer Dabbelt <palmerdabbelt@google.com> > >> > > Cc: Anup Patel <anup@brainfault.org> > >> > > Cc: Arnd Bergmann <arnd@arndb.de> > >> > > --- > >> > > arch/riscv/Kconfig | 1 + > >> > > arch/riscv/include/asm/Kbuild | 1 + > >> > > arch/riscv/include/asm/spinlock.h | 158 ++++++++++++-------------------- > >> > > arch/riscv/include/asm/spinlock_types.h | 19 ++-- > >> > > >> > NACK from myside. > >> > > >> > Linux ARM64 has moved away from ticket spinlock to qspinlock. > >> > > >> > We should directly go for qspinlock. > >> > >> I think it is a sensible intermediate step, even if you want to go > >> qspinlock. Ticket locks are more or less trivial and get you fairness > >> and all that goodness without the mind bending complexity of qspinlock. > >> > >> Once you have the ticket lock implementation solid (and qrwlock) and > >> everything, *then* start to carefully look at qspinlock. > > > > I do understand qspinlock are relatively complex but the best thing > > about qspinlock is it tries to ensure each CPU spins on it's own location. > > > > Instead of adding ticket spinlock now and later replacing it with qspinlock, > > it is better to straight away explore qspinlock hence my NACK. > > > >> > >> Now, arguably arm64 did the heavy lifting of making qspinlock good on > >> weak architectures, but if you want to do it right, you still have to > >> analyze the whole thing for your own architecture. > > > > Most of the RISC-V implementations are weak memory ordering so it > > makes more sense to explore qspinlock first. > > I know I'm somewhat late to the party here. I talked with Will (and > to a lesser extent Peter) about this a week or two ago and it seems the > best way to go here is to start with ticket locks. They're simpler, and > in Arm land they performed better until we got to the larger systems. > Given that we don't have any high performance implementations of the > RISC-V memory model (and likely won't any time soon) it's hard to reason > about the performance of anything like this, but at a bare minimum > having fair locks is a pretty big positive and ticket locks should have > very little overhead while providing fairness. > > IMO the decision between ticket and queueing locks is really more of a > property of the hardware/workload than the ISA, though there are of > course some pretty deep ISA dependencies than can make one saner than > the other. It seems best to me to just allow users to pick their own > flavor of locks, and at least PPC is already doing that. I threw > together a quick asm-generic ticket lock that can be selected at compile > time, but I want to spend some more time playing with the other > architectures before sending anything out. 
This discussion came up again a few weeks ago because I've been stumbling over the test-and-set implementation and was wondering if nobody cared to improve that yet. Then I saw, that there have been a few attempts so far, but they did not land. So I brought this up in RVI's platform group meeting and the attendees showed big interest to get at least fairness. I assume Guo sent out his new patchset as a reaction to this call (1 or 2 days later). We have the same situation on OpenSBI, where we've agreed (with Anup) to go for a ticket lock implementation. A series for that can be found here (the implementation was tested in the kernel): http://lists.infradead.org/pipermail/opensbi/2021-April/000789.html In the mentioned RVI call, I've asked the question if ticket locks or MCF locks are preferred. And the feedback was to go for qspinlock/qrwlock. One good argument to do so would be, to not have to maintain an RV-specific implementation, but to use a well-tested in-kernel mechanism. The feedback in the call is also aligned with the previous attempts to enable MCF-locks on RV. However, the kernel's implementation requires sub-word atomics. And here is, where we are. The discussion last week was about changing the generic kernel code to loosen its requirements (not accepted because of performance regressions on e.g. x86) and if RV actually can provide sub-word atomics in form of LL/SC loops (yes, that's possible). Providing sub-word xchg() can be done within a couple of hours (some solutions are already on the list), but that was not enough from the initial patchset from Michael on (e.g. Christoph Hellwig asked back then for moving arch-independent parts into lib, which is a good idea given other archs do the same). So I expect this might require some more time until there is a solution, that's accepted by a broad range of maintainers. I've been working on a new MCF-lock series last week. It is working, but I did not publish it yet, because I wanted to go through the 130+ emails on the riscv-linux list and check for overseen review comments and validate the patch authors. You can find the current state here: https://github.com/cmuellner/linux/pull/new/riscv-spinlocks So, if you insist on ticket locks, then let's coordinate who will do that and how it will be tested (RV32/RV64, qemu vs real hw).
Please fix your mailer to properly flow text. Reflowed it for you. On Mon, Apr 12, 2021 at 03:32:27PM +0200, Christoph Müllner wrote: > This discussion came up again a few weeks ago because I've been > stumbling over the test-and-set implementation and was wondering if > nobody cared to improve that yet. > Then I saw, that there have been a few attempts so far, but they did > not land. So I brought this up in RVI's platform group meeting and > the attendees showed big interest to get at least fairness. I assume > Guo sent out his new patchset as a reaction to this call (1 or 2 days > later). > > We have the same situation on OpenSBI, where we've agreed (with Anup) > to go for a ticket lock implementation. A series for that can be > found here (the implementation was tested in the kernel): > http://lists.infradead.org/pipermail/opensbi/2021-April/000789.html > > In the mentioned RVI call, I've asked the question if ticket locks or > MCF locks are preferred. And the feedback was to go for > qspinlock/qrwlock. One good argument to do so would be, to not have to > maintain an RV-specific implementation, but to use a well-tested > in-kernel mechanism. qrwlock does not depend on qspinlock; any fair spinlock implementation works, including ticket. > The feedback in the call is also aligned with the previous attempts to > enable MCF-locks on RV. However, the kernel's implementation requires > sub-word atomics. And here is, where we are. The discussion last week > was about changing the generic kernel code to loosen its requirements > (not accepted because of performance regressions on e.g. x86) and if > RV actually can provide sub-word atomics in form of LL/SC loops (yes, > that's possible). So qspinlock is a complex and fickle beast. Yes it works on x86 and arm64 (Will and Catalin put a _lot_ of effort into that), but IMO using such a complex thing needs to be provably better than the trivial and simple thing (tickets, test-and-set). Part of that is fwd progress, if you don't have that, stay with test-and-set. Will fixed a number of issues there, and -RT actually hit at least one. Debugging non-deterministic lock behaviour is not on the fun list. Having something simple (ticket) to fall back to to prove it's your lock going 'funneh' is very useful. > Providing sub-word xchg() can be done within a couple of hours (some > solutions are already on the list), but that was not enough from the > initial patchset from Michael on (e.g. Christoph Hellwig asked back > then for moving arch-independent parts into lib, which is a good idea > given other archs do the same). So I expect this might require some > more time until there is a solution, that's accepted by a broad range > of maintainers. Using a lib/ cmpxchg based xchg16 is daft. Per the very same arguments I made elsewhere in the thread. cmpxchg() based loops have very difficult fwd progress guarantees, esp. so on LL/SC architectures. What I would do is create a common inline helper to compute that {addr, mask, val} setup with a comment on how to use it. (As is, we have at least 2 different ways of dealing with ENDIAN-ness) > I've been working on a new MCF-lock series last week. It is working, > but I did not publish it yet, because I wanted to go through the 130+ > emails on the riscv-linux list and check for overseen review comments > and validate the patch authors. > You can find the current state here: > https://github.com/cmuellner/linux/pull/new/riscv-spinlocks That's not public. 
But if that's not qspinlock, how are you justifying a complex spinlock implementation? Does it perform better than ticket? > So, if you insist on ticket locks, then let's coordinate who will do > that and how it will be tested (RV32/RV64, qemu vs real hw). Real hardware is all that counts :-)
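A rough illustration of the common inline helper Peter describes for the sub-word case: compute the aligned word, shift and mask once, so the LL/SC (or cmpxchg) users only deal with full-word accesses and the endian handling lives in one place. The struct and function names here are made up for illustration; this is not an existing kernel API.

struct subword {
	u32 *word;	/* aligned 32-bit word containing the halfword */
	int shift;	/* bit offset of the halfword inside that word */
	u32 mask;	/* mask selecting the halfword */
};

static inline struct subword subword_prepare(u16 *p)
{
	struct subword s;
	unsigned long off = (unsigned long)p & 0x3;

	s.word = (u32 *)((unsigned long)p & ~0x3UL);
	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
		s.shift = (2 - off) * BITS_PER_BYTE;
	else
		s.shift = off * BITS_PER_BYTE;
	s.mask = 0xffffU << s.shift;

	return s;
}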
On Mon, 12 Apr 2021 06:32:27 PDT (-0700), christophm30@gmail.com wrote: > On Sun, Apr 11, 2021 at 11:11 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: >> >> On Wed, 24 Mar 2021 05:53:51 PDT (-0700), anup@brainfault.org wrote: >> > On Wed, Mar 24, 2021 at 6:08 PM Peter Zijlstra <peterz@infradead.org> wrote: >> >> >> >> On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote: >> >> > On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote: >> >> > > >> >> > > From: Guo Ren <guoren@linux.alibaba.com> >> >> > > >> >> > > This patch introduces a ticket lock implementation for riscv, along the >> >> > > same lines as the implementation for arch/arm & arch/csky. >> >> > > >> >> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com> >> >> > > Cc: Catalin Marinas <catalin.marinas@arm.com> >> >> > > Cc: Will Deacon <will.deacon@arm.com> >> >> > > Cc: Peter Zijlstra <peterz@infradead.org> >> >> > > Cc: Palmer Dabbelt <palmerdabbelt@google.com> >> >> > > Cc: Anup Patel <anup@brainfault.org> >> >> > > Cc: Arnd Bergmann <arnd@arndb.de> >> >> > > --- >> >> > > arch/riscv/Kconfig | 1 + >> >> > > arch/riscv/include/asm/Kbuild | 1 + >> >> > > arch/riscv/include/asm/spinlock.h | 158 ++++++++++++-------------------- >> >> > > arch/riscv/include/asm/spinlock_types.h | 19 ++-- >> >> > >> >> > NACK from myside. >> >> > >> >> > Linux ARM64 has moved away from ticket spinlock to qspinlock. >> >> > >> >> > We should directly go for qspinlock. >> >> >> >> I think it is a sensible intermediate step, even if you want to go >> >> qspinlock. Ticket locks are more or less trivial and get you fairness >> >> and all that goodness without the mind bending complexity of qspinlock. >> >> >> >> Once you have the ticket lock implementation solid (and qrwlock) and >> >> everything, *then* start to carefully look at qspinlock. >> > >> > I do understand qspinlock are relatively complex but the best thing >> > about qspinlock is it tries to ensure each CPU spins on it's own location. >> > >> > Instead of adding ticket spinlock now and later replacing it with qspinlock, >> > it is better to straight away explore qspinlock hence my NACK. >> > >> >> >> >> Now, arguably arm64 did the heavy lifting of making qspinlock good on >> >> weak architectures, but if you want to do it right, you still have to >> >> analyze the whole thing for your own architecture. >> > >> > Most of the RISC-V implementations are weak memory ordering so it >> > makes more sense to explore qspinlock first. >> >> I know I'm somewhat late to the party here. I talked with Will (and >> to a lesser extent Peter) about this a week or two ago and it seems the >> best way to go here is to start with ticket locks. They're simpler, and >> in Arm land they performed better until we got to the larger systems. >> Given that we don't have any high performance implementations of the >> RISC-V memory model (and likely won't any time soon) it's hard to reason >> about the performance of anything like this, but at a bare minimum >> having fair locks is a pretty big positive and ticket locks should have >> very little overhead while providing fairness. >> >> IMO the decision between ticket and queueing locks is really more of a >> property of the hardware/workload than the ISA, though there are of >> course some pretty deep ISA dependencies than can make one saner than >> the other. It seems best to me to just allow users to pick their own >> flavor of locks, and at least PPC is already doing that. 
I threw >> together a quick asm-generic ticket lock that can be selected at compile >> time, but I want to spend some more time playing with the other >> architectures before sending anything out. > > This discussion came up again a few weeks ago because I've been stumbling over > the test-and-set implementation and was wondering if nobody cared to > improve that yet. > Then I saw, that there have been a few attempts so far, but they did not land. > So I brought this up in RVI's platform group meeting and the attendees > showed big > interest to get at least fairness. I assume Guo sent out his new > patchset as a reaction > to this call (1 or 2 days later). > > We have the same situation on OpenSBI, where we've agreed (with Anup) > to go for a ticket lock implementation. > A series for that can be found here (the implementation was tested in > the kernel): > http://lists.infradead.org/pipermail/opensbi/2021-April/000789.html > > In the mentioned RVI call, I've asked the question if ticket locks or > MCF locks are preferred. > And the feedback was to go for qspinlock/qrwlock. One good argument to > do so would be, > to not have to maintain an RV-specific implementation, but to use a > well-tested in-kernel mechanism. > > The feedback in the call is also aligned with the previous attempts to > enable MCF-locks on RV. > However, the kernel's implementation requires sub-word atomics. And > here is, where we are. > The discussion last week was about changing the generic kernel code to > loosen its requirements > (not accepted because of performance regressions on e.g. x86) and if > RV actually can provide > sub-word atomics in form of LL/SC loops (yes, that's possible). > Providing sub-word xchg() can be done within a couple of hours (some > solutions are already on the list), > but that was not enough from the initial patchset from Michael on > (e.g. Christoph Hellwig asked back then > for moving arch-independent parts into lib, which is a good idea given > other archs do the same). > So I expect this might require some more time until there is a > solution, that's accepted by > a broad range of maintainers. > > I've been working on a new MCF-lock series last week. > It is working, but I did not publish it yet, because I wanted to go > through the 130+ emails > on the riscv-linux list and check for overseen review comments and > validate the patch authors. > You can find the current state here: > https://github.com/cmuellner/linux/pull/new/riscv-spinlocks > So, if you insist on ticket locks, then let's coordinate who will do > that and how it will be tested > (RV32/RV64, qemu vs real hw). My plan is to add a generic ticket-based lock, which can be selected at compile time. It'll have no architecture dependencies (though it'll likely have some hooks for architectures that can make this go faster). Users can then just pick which spinlock flavor they want, with the idea being that smaller systems will perform better with ticket locks and larger systems will perform better with queued locks. The main goal here is to give the less widely used architectures an easy way to have fair locks, as right now we've got a lot of code duplication because any architecture that wants ticket locks has to do it themselves. I'm not really convinced we have the ability to discuss the performance of locks on RISC-V right now, at least in any meaningful capacity. 
The set of systems we have right now are just so far off from having a competitive memory system that optimizing for them doesn't seem to be worth the time, and as there really aren't any users we don't know what workloads people care about. I'm mostly interested in just keeping the implementation simple, and ticket locks are the simplest spinlock flavor I know of that's fair (I think we can all agree that unfair locks are going to be a disaster). There are certainly classes of systems for which ticket locks will be the wrong choice, and for those it makes sense to use the generic qspinlock implementation. We'll likely need some changes to make that go fast on RISC-V, but we won't know what those are until we have hardware. For now just having something that works (while staying away from anything that's obviously horribly inefficient) is as good as we can do, so I'm perfectly happy to take whatever we need to enable qspinlock on RISC-V. I'll likely default to the ticket locks on RISC-V as it's simpler, but my main goal is to just get rid of the code duplication. IMO the correct lock flavor is really something that's tightly coupled to both the microarchitecture and workload, but given how poorly the current crop of implementations performs on anything parallel I'm more swayed by simplicity. When we end up with real hardware I'll be happy to re-evaluate that choice, but I don't think it's all that important today because we're going to need a whole new generation of implementations (and likely at least some ISA clarifications, as whether anything based on cmpxchg can be fair right now is still an open question) before we have anything competitive. If you guys want to go throw a "thou shalt make queued spinlocks better than ticket spinlocks" (or the other way around, I don't really personally care all that much about spinlock flavors) in the platform spec that's fine. Performance knobs are always a headache so getting rid of one would be great, but ultimately this is a microarchitecture thing so we'll have to see what gets implemented.
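As a concrete picture of the compile-time selection Palmer describes, the per-architecture spinlock header could reduce to something along these lines. This is a hypothetical sketch: the generic ticket-lock header name below is a placeholder, not an existing file, and the Kconfig gating is only illustrative.

/* arch/riscv/include/asm/spinlock.h, hypothetical sketch only */
#ifndef _ASM_RISCV_SPINLOCK_H
#define _ASM_RISCV_SPINLOCK_H

#ifdef CONFIG_ARCH_USE_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#else
#include <asm-generic/ticket_lock.h>	/* placeholder name */
#endif

#include <asm/qrwlock.h>

#endif /* _ASM_RISCV_SPINLOCK_H */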
On Mon, Apr 12, 2021 at 4:52 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > Please fix your mailer to properly flow text. Reflowed it for you. > > On Mon, Apr 12, 2021 at 03:32:27PM +0200, Christoph Müllner wrote: > > > This discussion came up again a few weeks ago because I've been > > stumbling over the test-and-set implementation and was wondering if > > nobody cared to improve that yet. > > > Then I saw, that there have been a few attempts so far, but they did > > not land. So I brought this up in RVI's platform group meeting and > > the attendees showed big interest to get at least fairness. I assume > > Guo sent out his new patchset as a reaction to this call (1 or 2 days > > later). > > > > We have the same situation on OpenSBI, where we've agreed (with Anup) > > to go for a ticket lock implementation. A series for that can be > > found here (the implementation was tested in the kernel): > > http://lists.infradead.org/pipermail/opensbi/2021-April/000789.html > > > > In the mentioned RVI call, I've asked the question if ticket locks or > > MCF locks are preferred. And the feedback was to go for > > qspinlock/qrwlock. One good argument to do so would be, to not have to > > maintain an RV-specific implementation, but to use a well-tested > > in-kernel mechanism. > > qrwlock does not depend on qspinlock; any fair spinlock implementation > works, including ticket. > > > The feedback in the call is also aligned with the previous attempts to > > enable MCF-locks on RV. However, the kernel's implementation requires > > sub-word atomics. And here is, where we are. The discussion last week > > was about changing the generic kernel code to loosen its requirements > > (not accepted because of performance regressions on e.g. x86) and if > > RV actually can provide sub-word atomics in form of LL/SC loops (yes, > > that's possible). > > So qspinlock is a complex and fickle beast. Yes it works on x86 and > arm64 (Will and Catalin put a _lot_ of effort into that), but IMO using > such a complex thing needs to be provably better than the trivial and > simple thing (tickets, test-and-set). > > Part of that is fwd progress, if you don't have that, stay with > test-and-set. Will fixed a number of issues there, and -RT actually hit > at least one. > > Debugging non-deterministic lock behaviour is not on the fun list. > Having something simple (ticket) to fall back to to prove it's your lock > going 'funneh' is very useful. > > > Providing sub-word xchg() can be done within a couple of hours (some > > solutions are already on the list), but that was not enough from the > > initial patchset from Michael on (e.g. Christoph Hellwig asked back > > then for moving arch-independent parts into lib, which is a good idea > > given other archs do the same). So I expect this might require some > > more time until there is a solution, that's accepted by a broad range > > of maintainers. > > Using a lib/ cmpxchg based xchg16 is daft. Per the very same arguments I > made elsewhere in the thread. cmpxchg() based loops have very difficult > fwd progress guarantees, esp. so on LL/SC architectures. > > What I would do is create a common inline helper to compute that {addr, > mask, val} setup with a comment on how to use it. > > (As is, we have at least 2 different ways of dealing with ENDIAN-ness) Well, that's what I have here: https://github.com/cmuellner/linux/commit/9d2f6449dd848b5723326ae8be34a3d2d41dcff1 > > I've been working on a new MCF-lock series last week. 
It is working, > > but I did not publish it yet, because I wanted to go through the 130+ > > emails on the riscv-linux list and check for overseen review comments > > and validate the patch authors. > > > You can find the current state here: > > https://github.com/cmuellner/linux/pull/new/riscv-spinlocks > > That's not public. But if that's not qspinlock, how are you justifying a > complex spinlock implementation? Does it perform better than ticket? I pasted the wrong link. Here is a working one: https://github.com/cmuellner/linux/tree/riscv-spinlocks It is basically Guo's v4 with mask-and-set within a LL/SC loop, commits split up, non-riscv commits dropped, and commit messages rewritten. I fully understand your reservations against using MCF locks. I also agree, that debugging locking issues is not funny. FWIW: I've been debugging quite some embedded Linux boards the last years, where essential basic building blocks showed unreliable/erratic behavior (caused e.g. by an unstable voltage supply) and every attempt to monitor the bug causes it to disappear. Ticket locks are also fine for me. Still, it would be nice to get the 16-bit xchg() merged, so advanced users can try qspinlocks without much effort. Otherwise, we just suspend the discussion now and restart it from the beginning in a year (as is happening right now). > > So, if you insist on ticket locks, then let's coordinate who will do > > that and how it will be tested (RV32/RV64, qemu vs real hw). > > Real hardware is all that counts :-) Fully agree, therefore I have written that.
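For completeness, a sketch of a mask-and-set 16-bit exchange inside an LR/SC loop on RISC-V, in the spirit of the branch linked above (little-endian only, relaxed ordering, untested, and not the actual code from that branch):

static inline u16 xchg16_relaxed_sketch(u16 *p, u16 newval)
{
	u32 *word = (u32 *)((unsigned long)p & ~0x3UL);
	int shift = ((unsigned long)p & 0x3) * 8;	/* little-endian */
	u32 mask = 0xffffU << shift;
	u32 insert = (u32)newval << shift;
	u32 old, tmp;

	asm volatile(
	"1:	lr.w	%0, %2		\n"	/* old = *word          */
	"	and	%1, %0, %3	\n"	/* clear the halfword   */
	"	or	%1, %1, %4	\n"	/* merge in the new one */
	"	sc.w	%1, %1, %2	\n"	/* try to store it back */
	"	bnez	%1, 1b		\n"	/* retry if SC failed   */
	: "=&r" (old), "=&r" (tmp), "+A" (*word)
	: "r" (~mask), "r" (insert)
	: "memory");

	return (u16)(old >> shift);
}

A real xchg16() would still need the appropriate ordering (e.g. lr.w.aq/sc.w.rl or explicit fences), and, as Peter notes, the forward-progress guarantees of such LR/SC loops deserve scrutiny.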
On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > On Mon, 12 Apr 2021 06:32:27 PDT (-0700), christophm30@gmail.com wrote: > > On Sun, Apr 11, 2021 at 11:11 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > >> > >> On Wed, 24 Mar 2021 05:53:51 PDT (-0700), anup@brainfault.org wrote: > >> > On Wed, Mar 24, 2021 at 6:08 PM Peter Zijlstra <peterz@infradead.org> wrote: > >> >> > >> >> On Wed, Mar 24, 2021 at 05:58:58PM +0530, Anup Patel wrote: > >> >> > On Wed, Mar 24, 2021 at 3:45 PM <guoren@kernel.org> wrote: > >> >> > > > >> >> > > From: Guo Ren <guoren@linux.alibaba.com> > >> >> > > > >> >> > > This patch introduces a ticket lock implementation for riscv, along the > >> >> > > same lines as the implementation for arch/arm & arch/csky. > >> >> > > > >> >> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com> > >> >> > > Cc: Catalin Marinas <catalin.marinas@arm.com> > >> >> > > Cc: Will Deacon <will.deacon@arm.com> > >> >> > > Cc: Peter Zijlstra <peterz@infradead.org> > >> >> > > Cc: Palmer Dabbelt <palmerdabbelt@google.com> > >> >> > > Cc: Anup Patel <anup@brainfault.org> > >> >> > > Cc: Arnd Bergmann <arnd@arndb.de> > >> >> > > --- > >> >> > > arch/riscv/Kconfig | 1 + > >> >> > > arch/riscv/include/asm/Kbuild | 1 + > >> >> > > arch/riscv/include/asm/spinlock.h | 158 ++++++++++++-------------------- > >> >> > > arch/riscv/include/asm/spinlock_types.h | 19 ++-- > >> >> > > >> >> > NACK from myside. > >> >> > > >> >> > Linux ARM64 has moved away from ticket spinlock to qspinlock. > >> >> > > >> >> > We should directly go for qspinlock. > >> >> > >> >> I think it is a sensible intermediate step, even if you want to go > >> >> qspinlock. Ticket locks are more or less trivial and get you fairness > >> >> and all that goodness without the mind bending complexity of qspinlock. > >> >> > >> >> Once you have the ticket lock implementation solid (and qrwlock) and > >> >> everything, *then* start to carefully look at qspinlock. > >> > > >> > I do understand qspinlock are relatively complex but the best thing > >> > about qspinlock is it tries to ensure each CPU spins on it's own location. > >> > > >> > Instead of adding ticket spinlock now and later replacing it with qspinlock, > >> > it is better to straight away explore qspinlock hence my NACK. > >> > > >> >> > >> >> Now, arguably arm64 did the heavy lifting of making qspinlock good on > >> >> weak architectures, but if you want to do it right, you still have to > >> >> analyze the whole thing for your own architecture. > >> > > >> > Most of the RISC-V implementations are weak memory ordering so it > >> > makes more sense to explore qspinlock first. > >> > >> I know I'm somewhat late to the party here. I talked with Will (and > >> to a lesser extent Peter) about this a week or two ago and it seems the > >> best way to go here is to start with ticket locks. They're simpler, and > >> in Arm land they performed better until we got to the larger systems. > >> Given that we don't have any high performance implementations of the > >> RISC-V memory model (and likely won't any time soon) it's hard to reason > >> about the performance of anything like this, but at a bare minimum > >> having fair locks is a pretty big positive and ticket locks should have > >> very little overhead while providing fairness. 
> >> > >> IMO the decision between ticket and queueing locks is really more of a > >> property of the hardware/workload than the ISA, though there are of > >> course some pretty deep ISA dependencies than can make one saner than > >> the other. It seems best to me to just allow users to pick their own > >> flavor of locks, and at least PPC is already doing that. I threw > >> together a quick asm-generic ticket lock that can be selected at compile > >> time, but I want to spend some more time playing with the other > >> architectures before sending anything out. > > > > This discussion came up again a few weeks ago because I've been stumbling over > > the test-and-set implementation and was wondering if nobody cared to > > improve that yet. > > Then I saw, that there have been a few attempts so far, but they did not land. > > So I brought this up in RVI's platform group meeting and the attendees > > showed big > > interest to get at least fairness. I assume Guo sent out his new > > patchset as a reaction > > to this call (1 or 2 days later). > > > > We have the same situation on OpenSBI, where we've agreed (with Anup) > > to go for a ticket lock implementation. > > A series for that can be found here (the implementation was tested in > > the kernel): > > http://lists.infradead.org/pipermail/opensbi/2021-April/000789.html > > > > In the mentioned RVI call, I've asked the question if ticket locks or > > MCF locks are preferred. > > And the feedback was to go for qspinlock/qrwlock. One good argument to > > do so would be, > > to not have to maintain an RV-specific implementation, but to use a > > well-tested in-kernel mechanism. > > > > The feedback in the call is also aligned with the previous attempts to > > enable MCF-locks on RV. > > However, the kernel's implementation requires sub-word atomics. And > > here is, where we are. > > The discussion last week was about changing the generic kernel code to > > loosen its requirements > > (not accepted because of performance regressions on e.g. x86) and if > > RV actually can provide > > sub-word atomics in form of LL/SC loops (yes, that's possible). > > Providing sub-word xchg() can be done within a couple of hours (some > > solutions are already on the list), > > but that was not enough from the initial patchset from Michael on > > (e.g. Christoph Hellwig asked back then > > for moving arch-independent parts into lib, which is a good idea given > > other archs do the same). > > So I expect this might require some more time until there is a > > solution, that's accepted by > > a broad range of maintainers. > > > > I've been working on a new MCF-lock series last week. > > It is working, but I did not publish it yet, because I wanted to go > > through the 130+ emails > > on the riscv-linux list and check for overseen review comments and > > validate the patch authors. > > You can find the current state here: > > https://github.com/cmuellner/linux/pull/new/riscv-spinlocks > > So, if you insist on ticket locks, then let's coordinate who will do > > that and how it will be tested > > (RV32/RV64, qemu vs real hw). > > My plan is to add a generic ticket-based lock, which can be selected at > compile time. It'll have no architecture dependencies (though it'll > likely have some hooks for architectures that can make this go faster). > Users can then just pick which spinlock flavor they want, with the idea > being that smaller systems will perform better with ticket locks and > larger systems will perform better with queued locks. 
The main goal > here is to give the less widely used architectures an easy way to have > fair locks, as right now we've got a lot of code duplication because any > architecture that wants ticket locks has to do it themselves. In the case of LL/SC sequences, we have a maximum of 16 instructions on RISC-V. My concern with a pure-C implementation would be that we cannot guarantee this (e.g. somebody wants to compile with -O0) and I don't know of a way to abort the build in case this limit exceeds. Therefore I have preferred inline assembly for OpenSBI (my initial idea was to use closure-like LL/SC macros, where you can write the loop in form of C code). In case you care, here is the ticket lock code from the OpenSBI patchset ported over to Linux: https://github.com/cmuellner/linux/commit/40a41d561df71fbe247016b303a1ef91bf9702f3 > I'm not really convinced we have the ability to discuss the performance > of locks on RISC-V right now, at least in any meaningful capacity. The > set of systems we have right now are just so far off from having a > competitive memory system that optimizing for them doesn't seem to be > worth the time, and as there really aren't any users we don't know what > workloads people care about. I'm mostly interested in just keeping the > implementation simple, and ticket locks are the simplest spinlock flavor > I know of that's fair (I think we can all agree that unfair locks are > going to be a disaster). > > There are certainly classes of systems for which ticket locks will be > the wrong choice, and for those it makes sense to use the generic > qspinlock implementation. We'll likely need some changes to make that > go fast on RISC-V, but we won't know what those are until we have > hardware. For now just having something that works (while staying away > from anything that's obviously horribly inefficient) is as good as we > can do, so I'm perfectly happy to take whatever we need to enable > qspinlock on RISC-V. > > I'll likely default to the ticket locks on RISC-V as it's simpler, but > my main goal is to just get rid of the code duplication. IMO the > correct lock flavor is really something that's tightly coupled to both > the microarchitecture and workload, but given how poorly the current > crop of implementations performs on anything parallel I'm more swayed by > simplicity. When we end up with real hardware I'll be happy to > re-evaluate that choice, but I don't think it's all that important today > because we're going to need a whole new generation of implementations > (and likely at least some ISA clarifications, as whether anything based > on cmpxchg can be fair right now is still an open question) before we > have anything competitive. > > If you guys want to go throw a "thou shalt make queued spinlocks better > than ticket spinlocks" (or the other way around, I don't really > personally care all that much about spinlock flavors) in the platform > spec that's fine. Performance knobs are always a headache so getting > rid of one would be great, but ultimately this is a microarchitecture > thing so we'll have to see what gets implemented. No, the implementation details of the spinlock algorithm are not part of any platform specification. It was discussed in this forum because I did not know a better place to ask this question. We all expect the RISC-V kernel maintainer(s) to make a proper choice in the end. And as you said, this can be re-evaluated in the future (based on results on real HW).
On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote: > On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > My plan is to add a generic ticket-based lock, which can be selected at > > compile time. It'll have no architecture dependencies (though it'll > > likely have some hooks for architectures that can make this go faster). > > Users can then just pick which spinlock flavor they want, with the idea > > being that smaller systems will perform better with ticket locks and > > larger systems will perform better with queued locks. The main goal > > here is to give the less widely used architectures an easy way to have > > fair locks, as right now we've got a lot of code duplication because any > > architecture that wants ticket locks has to do it themselves. > > In the case of LL/SC sequences, we have a maximum of 16 instructions > on RISC-V. My concern with a pure-C implementation would be that > we cannot guarantee this (e.g. somebody wants to compile with -O0) > and I don't know of a way to abort the build in case this limit exceeds. > Therefore I have preferred inline assembly for OpenSBI (my initial idea > was to use closure-like LL/SC macros, where you can write the loop > in form of C code). For ticket locks you really only needs atomic_fetch_add() and smp_store_release() and an architectural guarantees that the atomic_fetch_add() has fwd progress under contention and that a sub-word store (through smp_store_release()) will fail the SC. Then you can do something like: void lock(atomic_t *lock) { u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */ u16 ticket = val >> 16; for (;;) { if (ticket == (u16)val) break; cpu_relax(); val = atomic_read_acquire(lock); } } void unlock(atomic_t *lock) { u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__); u32 val = atomic_read(lock); smp_store_release(ptr, (u16)val + 1); } That's _almost_ as simple as a test-and-set :-) It isn't quite optimal on x86 for not being allowed to use a memop on unlock, since its being forced into a load-store because of all the volatile, but whatever.
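For anyone who wants to poke at the algorithm outside the kernel, here is a hedged userspace rendering of the sketch above using C11 atomics and pthreads. The two counters are kept as separate 16-bit atomics (instead of one packed 32-bit word as in the sketch) purely so that the owner release store stays expressible in portable C11; all names below are made up for the illustration.

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative userspace ticket lock, not kernel code.  The kernel sketch
 * packs both counters into one 32-bit word so a single fetch_add returns
 * the ticket and the current owner together. */
typedef struct {
	_Atomic uint16_t owner;	/* ticket currently allowed to run */
	_Atomic uint16_t next;	/* next ticket to hand out         */
} ticket_lock_t;

static void ticket_lock(ticket_lock_t *l)
{
	uint16_t t = atomic_fetch_add(&l->next, 1);	/* take a ticket */

	while (atomic_load_explicit(&l->owner, memory_order_acquire) != t)
		;	/* cpu_relax()/WFE would go here in kernel code */
}

static void ticket_unlock(ticket_lock_t *l)
{
	/* Only the lock holder writes owner, so load + release store is fine. */
	uint16_t o = atomic_load_explicit(&l->owner, memory_order_relaxed);
	atomic_store_explicit(&l->owner, (uint16_t)(o + 1), memory_order_release);
}

/* Tiny smoke test: two threads bump a shared counter under the lock. */
static ticket_lock_t lock;
static long counter;

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < 1000000; i++) {
		ticket_lock(&lock);
		counter++;
		ticket_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, worker, NULL);
	pthread_create(&b, NULL, worker, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("counter = %ld (expect 2000000)\n", counter);
	return 0;
}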
On Tue, Apr 13, 2021 at 10:03:01AM +0200, Peter Zijlstra wrote: > For ticket locks you really only needs atomic_fetch_add() and > smp_store_release() and an architectural guarantees that the > atomic_fetch_add() has fwd progress under contention and that a sub-word > store (through smp_store_release()) will fail the SC. > > Then you can do something like: > > void lock(atomic_t *lock) > { > u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */ > u16 ticket = val >> 16; > > for (;;) { > if (ticket == (u16)val) > break; > cpu_relax(); > val = atomic_read_acquire(lock); > } A possibly better might be: if (ticket == (u16)val) return; atomic_cond_read_acquire(lock, ticket == (u16)VAL); Since that allows architectures to use WFE like constructs. > } > > void unlock(atomic_t *lock) > { > u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__); > u32 val = atomic_read(lock); > > smp_store_release(ptr, (u16)val + 1); > } > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal > on x86 for not being allowed to use a memop on unlock, since its being > forced into a load-store because of all the volatile, but whatever.
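Combining the two messages above, the lock side might end up looking like the fragment below. This is a sketch that assumes the kernel atomic API (the function name is made up here), not something to compile standalone.

/* Sketch only: fetch_add-based ticket lock plus the
 * atomic_cond_read_acquire() suggestion, so architectures can back the
 * wait with a WFE-like primitive instead of pure spinning. */
static inline void ticket_lock(atomic_t *lock)
{
	u32 val = atomic_fetch_add(1 << 16, lock);	/* full barrier, RCsc */
	u16 ticket = val >> 16;

	if (ticket == (u16)val)
		return;

	/*
	 * atomic_cond_read_acquire() re-reads *lock until the condition
	 * holds; VAL names the freshly read value.
	 */
	atomic_cond_read_acquire(lock, ticket == (u16)VAL);
}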
On Tue, Apr 13, 2021 at 10:03 AM Peter Zijlstra <peterz@infradead.org> wrote: > > On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote: > > On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > > > My plan is to add a generic ticket-based lock, which can be selected at > > > compile time. It'll have no architecture dependencies (though it'll > > > likely have some hooks for architectures that can make this go faster). > > > Users can then just pick which spinlock flavor they want, with the idea > > > being that smaller systems will perform better with ticket locks and > > > larger systems will perform better with queued locks. The main goal > > > here is to give the less widely used architectures an easy way to have > > > fair locks, as right now we've got a lot of code duplication because any > > > architecture that wants ticket locks has to do it themselves. > > > > In the case of LL/SC sequences, we have a maximum of 16 instructions > > on RISC-V. My concern with a pure-C implementation would be that > > we cannot guarantee this (e.g. somebody wants to compile with -O0) > > and I don't know of a way to abort the build in case this limit exceeds. > > Therefore I have preferred inline assembly for OpenSBI (my initial idea > > was to use closure-like LL/SC macros, where you can write the loop > > in form of C code). > > For ticket locks you really only needs atomic_fetch_add() and > smp_store_release() and an architectural guarantees that the > atomic_fetch_add() has fwd progress under contention and that a sub-word > store (through smp_store_release()) will fail the SC. > > Then you can do something like: > > void lock(atomic_t *lock) > { > u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */ > u16 ticket = val >> 16; > > for (;;) { > if (ticket == (u16)val) > break; > cpu_relax(); > val = atomic_read_acquire(lock); > } > } > > void unlock(atomic_t *lock) > { > u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__); > u32 val = atomic_read(lock); > > smp_store_release(ptr, (u16)val + 1); > } > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal > on x86 for not being allowed to use a memop on unlock, since its being > forced into a load-store because of all the volatile, but whatever. What about trylock()? I.e. one could implement trylock() without a loop, by letting trylock() fail if the SC fails. That looks safe on first view, but nobody does this right now.
On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote: > On Tue, Apr 13, 2021 at 10:03 AM Peter Zijlstra <peterz@infradead.org> wrote: > > On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote: > > > On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > > > My plan is to add a generic ticket-based lock, which can be selected at > > > > compile time. It'll have no architecture dependencies (though it'll > > > > likely have some hooks for architectures that can make this go faster). > > > > Users can then just pick which spinlock flavor they want, with the idea > > > > being that smaller systems will perform better with ticket locks and > > > > larger systems will perform better with queued locks. The main goal > > > > here is to give the less widely used architectures an easy way to have > > > > fair locks, as right now we've got a lot of code duplication because any > > > > architecture that wants ticket locks has to do it themselves. > > > > > > In the case of LL/SC sequences, we have a maximum of 16 instructions > > > on RISC-V. My concern with a pure-C implementation would be that > > > we cannot guarantee this (e.g. somebody wants to compile with -O0) > > > and I don't know of a way to abort the build in case this limit exceeds. > > > Therefore I have preferred inline assembly for OpenSBI (my initial idea > > > was to use closure-like LL/SC macros, where you can write the loop > > > in form of C code). > > > > For ticket locks you really only needs atomic_fetch_add() and > > smp_store_release() and an architectural guarantees that the > > atomic_fetch_add() has fwd progress under contention and that a sub-word > > store (through smp_store_release()) will fail the SC. > > > > Then you can do something like: > > > > void lock(atomic_t *lock) > > { > > u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */ > > u16 ticket = val >> 16; > > > > for (;;) { > > if (ticket == (u16)val) > > break; > > cpu_relax(); > > val = atomic_read_acquire(lock); > > } > > } > > > > void unlock(atomic_t *lock) > > { > > u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__); > > u32 val = atomic_read(lock); > > > > smp_store_release(ptr, (u16)val + 1); > > } > > > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal > > on x86 for not being allowed to use a memop on unlock, since its being > > forced into a load-store because of all the volatile, but whatever. > > What about trylock()? > I.e. one could implement trylock() without a loop, by letting > trylock() fail if the SC fails. > That looks safe on first view, but nobody does this right now. Not familiar with RISC-V but I'd recommend that a trylock only fails if the lock is locked (after LR). A SC may fail for other reasons (cacheline eviction; depending on the microarchitecture) even if the lock is unlocked. At least on arm64 we had this issue with an implementation having a tendency to always fail the first STXR.
On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote: > > For ticket locks you really only needs atomic_fetch_add() and > > smp_store_release() and an architectural guarantees that the > > atomic_fetch_add() has fwd progress under contention and that a sub-word > > store (through smp_store_release()) will fail the SC. > > > > Then you can do something like: > > > > void lock(atomic_t *lock) > > { > > u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */ > > u16 ticket = val >> 16; > > > > for (;;) { > > if (ticket == (u16)val) > > break; > > cpu_relax(); > > val = atomic_read_acquire(lock); > > } > > } > > > > void unlock(atomic_t *lock) > > { > > u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__); > > u32 val = atomic_read(lock); > > > > smp_store_release(ptr, (u16)val + 1); > > } > > > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal > > on x86 for not being allowed to use a memop on unlock, since its being > > forced into a load-store because of all the volatile, but whatever. > > What about trylock()? > I.e. one could implement trylock() without a loop, by letting > trylock() fail if the SC fails. > That looks safe on first view, but nobody does this right now. Generic code has to use cmpxchg(), and then you get something like this: bool trylock(atomic_t *lock) { u32 old = atomic_read(lock); if ((old >> 16) != (old & 0xffff)) return false; return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */ } That will try and do the full LL/SC loop, because it wants to complete the cmpxchg, but in generic code we have no other option. (Is this what C11's weak cmpxchg is for?)
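On the closing question: the weak form of the C11 compare-exchange is indeed the one that is allowed to fail spuriously and can therefore map onto a single LR/SC attempt, whereas the strong form implies an internal retry loop, which is what the cmpxchg-based sketch above ends up doing. A hedged userspace illustration follows (the function name is made up), with the caveat that whether a trylock may fail spuriously at all is exactly what the follow-ups below debate.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: a ticket trylock built on the weak C11 CAS.  On an
 * LL/SC machine the weak form can lower to one LR/SC attempt; the strong
 * form would loop internally until the CAS genuinely fails or succeeds. */
static bool trylock_weak(_Atomic uint32_t *lock)
{
	uint32_t old = atomic_load_explicit(lock, memory_order_relaxed);

	if ((old >> 16) != (old & 0xffff))
		return false;	/* lock held */

	return atomic_compare_exchange_weak_explicit(lock, &old,
						     old + (1u << 16),
						     memory_order_acquire,
						     memory_order_relaxed);
}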
On Tue, Apr 13, 2021 at 11:31 AM Catalin Marinas <catalin.marinas@arm.com> wrote: > > On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote: > > On Tue, Apr 13, 2021 at 10:03 AM Peter Zijlstra <peterz@infradead.org> wrote: > > > On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote: > > > > On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > > > > My plan is to add a generic ticket-based lock, which can be selected at > > > > > compile time. It'll have no architecture dependencies (though it'll > > > > > likely have some hooks for architectures that can make this go faster). > > > > > Users can then just pick which spinlock flavor they want, with the idea > > > > > being that smaller systems will perform better with ticket locks and > > > > > larger systems will perform better with queued locks. The main goal > > > > > here is to give the less widely used architectures an easy way to have > > > > > fair locks, as right now we've got a lot of code duplication because any > > > > > architecture that wants ticket locks has to do it themselves. > > > > > > > > In the case of LL/SC sequences, we have a maximum of 16 instructions > > > > on RISC-V. My concern with a pure-C implementation would be that > > > > we cannot guarantee this (e.g. somebody wants to compile with -O0) > > > > and I don't know of a way to abort the build in case this limit exceeds. > > > > Therefore I have preferred inline assembly for OpenSBI (my initial idea > > > > was to use closure-like LL/SC macros, where you can write the loop > > > > in form of C code). > > > > > > For ticket locks you really only needs atomic_fetch_add() and > > > smp_store_release() and an architectural guarantees that the > > > atomic_fetch_add() has fwd progress under contention and that a sub-word > > > store (through smp_store_release()) will fail the SC. > > > > > > Then you can do something like: > > > > > > void lock(atomic_t *lock) > > > { > > > u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */ > > > u16 ticket = val >> 16; > > > > > > for (;;) { > > > if (ticket == (u16)val) > > > break; > > > cpu_relax(); > > > val = atomic_read_acquire(lock); > > > } > > > } > > > > > > void unlock(atomic_t *lock) > > > { > > > u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__); > > > u32 val = atomic_read(lock); > > > > > > smp_store_release(ptr, (u16)val + 1); > > > } > > > > > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal > > > on x86 for not being allowed to use a memop on unlock, since its being > > > forced into a load-store because of all the volatile, but whatever. > > > > What about trylock()? > > I.e. one could implement trylock() without a loop, by letting > > trylock() fail if the SC fails. > > That looks safe on first view, but nobody does this right now. > > Not familiar with RISC-V but I'd recommend that a trylock only fails if > the lock is locked (after LR). A SC may fail for other reasons > (cacheline eviction; depending on the microarchitecture) even if the > lock is unlocked. At least on arm64 we had this issue with an > implementation having a tendency to always fail the first STXR. Interesting data point. Thanks!
On Tue, Apr 13, 2021 at 11:37 AM Peter Zijlstra <peterz@infradead.org> wrote: > > On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote: > > > > For ticket locks you really only needs atomic_fetch_add() and > > > smp_store_release() and an architectural guarantees that the > > > atomic_fetch_add() has fwd progress under contention and that a sub-word > > > store (through smp_store_release()) will fail the SC. > > > > > > Then you can do something like: > > > > > > void lock(atomic_t *lock) > > > { > > > u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */ > > > u16 ticket = val >> 16; > > > > > > for (;;) { > > > if (ticket == (u16)val) > > > break; > > > cpu_relax(); > > > val = atomic_read_acquire(lock); > > > } > > > } > > > > > > void unlock(atomic_t *lock) > > > { > > > u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__); > > > u32 val = atomic_read(lock); > > > > > > smp_store_release(ptr, (u16)val + 1); > > > } > > > > > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal > > > on x86 for not being allowed to use a memop on unlock, since its being > > > forced into a load-store because of all the volatile, but whatever. > > > > What about trylock()? > > I.e. one could implement trylock() without a loop, by letting > > trylock() fail if the SC fails. > > That looks safe on first view, but nobody does this right now. > > Generic code has to use cmpxchg(), and then you get something like this: > > bool trylock(atomic_t *lock) > { > u32 old = atomic_read(lock); > > if ((old >> 16) != (old & 0xffff)) > return false; > > return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */ > } This approach requires two loads (atomic_read() and cmpxchg()), which is not required. Detecting this pattern and optimizing it in a compiler is quite unlikely. A bit less generic solution would be to wrap the LL/SC (would be mandatory in this case) instructions and do something like this: uint32_t __smp_load_acquire_reserved(void*); int __smp_store_release_conditional(void*, uint32_t); typedef union { uint32_t v32; struct { uint16_t owner; uint16_t next; }; } arch_spinlock_t; int trylock(arch_spinlock_t *lock) { arch_spinlock_t l; int success; do { l.v32 = __smp_load_acquire_reserved(lock); if (l.owner != l.next) return 0; l.next++; success = __smp_store_release_conditional(lock, l.v32); } while (!success); return success; } But here we can't tell the compiler to optimize the code between LL and SC... > > That will try and do the full LL/SC loop, because it wants to complete > the cmpxchg, but in generic code we have no other option. > > (Is this what C11's weak cmpxchg is for?)
On Tue, Apr 13, 2021 at 12:25:00PM +0200, Christoph Müllner wrote: > On Tue, Apr 13, 2021 at 11:37 AM Peter Zijlstra <peterz@infradead.org> wrote: > > On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote: > > > What about trylock()? > > > I.e. one could implement trylock() without a loop, by letting > > > trylock() fail if the SC fails. > > > That looks safe on first view, but nobody does this right now. > > > > Generic code has to use cmpxchg(), and then you get something like this: > > > > bool trylock(atomic_t *lock) > > { > > u32 old = atomic_read(lock); > > > > if ((old >> 16) != (old & 0xffff)) > > return false; > > > > return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */ > > } > > This approach requires two loads (atomic_read() and cmpxchg()), which > is not required. > Detecting this pattern and optimizing it in a compiler is quite unlikely. > > A bit less generic solution would be to wrap the LL/SC (would be > mandatory in this case) > instructions and do something like this: > > uint32_t __smp_load_acquire_reserved(void*); > int __smp_store_release_conditional(void*, uint32_t); > > typedef union { > uint32_t v32; > struct { > uint16_t owner; > uint16_t next; > }; > } arch_spinlock_t; > > int trylock(arch_spinlock_t *lock) > { > arch_spinlock_t l; > int success; > do { > l.v32 = __smp_load_acquire_reserved(lock); > if (l.owner != l.next) > return 0; > l.next++; > success = __smp_store_release_conditional(lock, l.v32); > } while (!success); > return success; > } > > But here we can't tell the compiler to optimize the code between LL and SC... This indeed needs some care. IIUC RISC-V has similar restrictions as arm here, no load/store instructions are allowed between LR and SC. You can't guarantee that the compiler won't spill some variable onto the stack. BTW, I think the SC doesn't need release semantics above, only the LR needs acquire semantics.
From: Catalin Marinas > Sent: 13 April 2021 11:45 ... > This indeed needs some care. IIUC RISC-V has similar restrictions as arm > here, no load/store instructions are allowed between LR and SC. You > can't guarantee that the compiler won't spill some variable onto the > stack. You can probably never guarantee the compiler won't spill to stack. Especially if someone compiles with -O0. Which probably means that anything using LR/SC must be written in asm and the C wrappers disabled. David
On Tue, Apr 13, 2021 at 12:45 PM Catalin Marinas <catalin.marinas@arm.com> wrote: > > On Tue, Apr 13, 2021 at 12:25:00PM +0200, Christoph Müllner wrote: > > On Tue, Apr 13, 2021 at 11:37 AM Peter Zijlstra <peterz@infradead.org> wrote: > > > On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote: > > > > What about trylock()? > > > > I.e. one could implement trylock() without a loop, by letting > > > > trylock() fail if the SC fails. > > > > That looks safe on first view, but nobody does this right now. > > > > > > Generic code has to use cmpxchg(), and then you get something like this: > > > > > > bool trylock(atomic_t *lock) > > > { > > > u32 old = atomic_read(lock); > > > > > > if ((old >> 16) != (old & 0xffff)) > > > return false; > > > > > > return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */ > > > } > > > > This approach requires two loads (atomic_read() and cmpxchg()), which > > is not required. > > Detecting this pattern and optimizing it in a compiler is quite unlikely. > > > > A bit less generic solution would be to wrap the LL/SC (would be > > mandatory in this case) > > instructions and do something like this: > > > > uint32_t __smp_load_acquire_reserved(void*); > > int __smp_store_release_conditional(void*, uint32_t); > > > > typedef union { > > uint32_t v32; > > struct { > > uint16_t owner; > > uint16_t next; > > }; > > } arch_spinlock_t; > > > > int trylock(arch_spinlock_t *lock) > > { > > arch_spinlock_t l; > > int success; > > do { > > l.v32 = __smp_load_acquire_reserved(lock); > > if (l.owner != l.next) > > return 0; > > l.next++; > > success = __smp_store_release_conditional(lock, l.v32); > > } while (!success); > > return success; > > } > > > > But here we can't tell the compiler to optimize the code between LL and SC... > > This indeed needs some care. IIUC RISC-V has similar restrictions as arm > here, no load/store instructions are allowed between LR and SC. You > can't guarantee that the compiler won't spill some variable onto the > stack. RISC-V behaves similar, but the specification is a bit more precise: To guarantee forward progress, the ("constrained") LL/SC sequence has to consist of <=16 instructions. Further, the "dynamic code executed between the LR and SC instructions can only contain instructions from the base “I” instruction set, excluding loads, stores, backward jumps, taken backward branches, JALR, FENCE, and SYSTEM instructions". And GCC generates a backward jump in-between LL and SC. So we have more than enough reasons, to no do it this way. > > BTW, I think the SC doesn't need release semantics above, only the LR > needs acquire semantics. > > -- > Catalin
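To make the constraint concrete, a constrained LR/SC sequence has to look roughly like the hand-written loop below: only base-ISA ALU instructions between the lr.w and the sc.w, well under 16 instructions, and the only backward branch is the retry that sits after the sc.w. This is an illustrative sketch; real code would simply use amoadd.w for a plain fetch-add.

#include <stdint.h>

/* Sketch of a constrained LR/SC loop (GCC/Clang inline asm, RV32/RV64).
 * The window between lr.w and sc.w holds one ALU instruction: no loads,
 * stores, or backward branches, so the forward-progress rules apply. */
static inline uint32_t fetch_add_llsc(uint32_t *p, uint32_t inc)
{
	uint32_t old, tmp;

	asm volatile(
	"1:	lr.w	%0, (%2)	\n"
	"	add	%1, %0, %3	\n"
	"	sc.w	%1, %1, (%2)	\n"
	"	bnez	%1, 1b		\n"
	: "=&r"(old), "=&r"(tmp)
	: "r"(p), "r"(inc)
	: "memory");

	return old;
}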
On Tue, Apr 13, 2021 at 6:25 PM Christoph Müllner <christophm30@gmail.com> wrote: > > On Tue, Apr 13, 2021 at 11:37 AM Peter Zijlstra <peterz@infradead.org> wrote: > > > > On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote: > > > > > > For ticket locks you really only needs atomic_fetch_add() and > > > > smp_store_release() and an architectural guarantees that the > > > > atomic_fetch_add() has fwd progress under contention and that a sub-word > > > > store (through smp_store_release()) will fail the SC. > > > > > > > > Then you can do something like: > > > > > > > > void lock(atomic_t *lock) > > > > { > > > > u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */ > > > > u16 ticket = val >> 16; > > > > > > > > for (;;) { > > > > if (ticket == (u16)val) > > > > break; > > > > cpu_relax(); > > > > val = atomic_read_acquire(lock); > > > > } > > > > } > > > > > > > > void unlock(atomic_t *lock) > > > > { > > > > u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__); > > > > u32 val = atomic_read(lock); > > > > > > > > smp_store_release(ptr, (u16)val + 1); > > > > } > > > > > > > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal > > > > on x86 for not being allowed to use a memop on unlock, since its being > > > > forced into a load-store because of all the volatile, but whatever. > > > > > > What about trylock()? > > > I.e. one could implement trylock() without a loop, by letting > > > trylock() fail if the SC fails. > > > That looks safe on first view, but nobody does this right now. > > > > Generic code has to use cmpxchg(), and then you get something like this: > > > > bool trylock(atomic_t *lock) > > { > > u32 old = atomic_read(lock); > > > > if ((old >> 16) != (old & 0xffff)) > > return false; > > > > return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */ > > } > > This approach requires two loads (atomic_read() and cmpxchg()), which > is not required. > Detecting this pattern and optimizing it in a compiler is quite unlikely. > > A bit less generic solution would be to wrap the LL/SC (would be > mandatory in this case) > instructions and do something like this: > > uint32_t __smp_load_acquire_reserved(void*); > int __smp_store_release_conditional(void*, uint32_t); > > typedef union { > uint32_t v32; > struct { > uint16_t owner; > uint16_t next; > }; > } arch_spinlock_t; > > int trylock(arch_spinlock_t *lock) > { > arch_spinlock_t l; > int success; > do { > l.v32 = __smp_load_acquire_reserved(lock); > if (l.owner != l.next) > return 0; > l.next++; > success = __smp_store_release_conditional(lock, l.v32); It's a new semantics v.s cmpxchg, and cmpxchg is come from CAS instruction to solve some complex scenario. The primitive of cmpxchg has been widely used in Linux, so I don't think introducing a new semantics (reserved_conditional) is a good idea. Also, from this point: It seems CAS instruction is more suitable for software compatibility. Although riscv privilege spec chose the LR/SC and list some bad sides of CAS. > } while (!success); > return success; > } > > But here we can't tell the compiler to optimize the code between LL and SC... > > > > > That will try and do the full LL/SC loop, because it wants to complete > > the cmpxchg, but in generic code we have no other option. > > > > (Is this what C11's weak cmpxchg is for?)
On Tue, Apr 13, 2021 at 5:31 PM Catalin Marinas <catalin.marinas@arm.com> wrote: > > On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote: > > On Tue, Apr 13, 2021 at 10:03 AM Peter Zijlstra <peterz@infradead.org> wrote: > > > On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote: > > > > On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > > > > My plan is to add a generic ticket-based lock, which can be selected at > > > > > compile time. It'll have no architecture dependencies (though it'll > > > > > likely have some hooks for architectures that can make this go faster). > > > > > Users can then just pick which spinlock flavor they want, with the idea > > > > > being that smaller systems will perform better with ticket locks and > > > > > larger systems will perform better with queued locks. The main goal > > > > > here is to give the less widely used architectures an easy way to have > > > > > fair locks, as right now we've got a lot of code duplication because any > > > > > architecture that wants ticket locks has to do it themselves. > > > > > > > > In the case of LL/SC sequences, we have a maximum of 16 instructions > > > > on RISC-V. My concern with a pure-C implementation would be that > > > > we cannot guarantee this (e.g. somebody wants to compile with -O0) > > > > and I don't know of a way to abort the build in case this limit exceeds. > > > > Therefore I have preferred inline assembly for OpenSBI (my initial idea > > > > was to use closure-like LL/SC macros, where you can write the loop > > > > in form of C code). > > > > > > For ticket locks you really only needs atomic_fetch_add() and > > > smp_store_release() and an architectural guarantees that the > > > atomic_fetch_add() has fwd progress under contention and that a sub-word > > > store (through smp_store_release()) will fail the SC. > > > > > > Then you can do something like: > > > > > > void lock(atomic_t *lock) > > > { > > > u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */ > > > u16 ticket = val >> 16; > > > > > > for (;;) { > > > if (ticket == (u16)val) > > > break; > > > cpu_relax(); > > > val = atomic_read_acquire(lock); > > > } > > > } > > > > > > void unlock(atomic_t *lock) > > > { > > > u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__); > > > u32 val = atomic_read(lock); > > > > > > smp_store_release(ptr, (u16)val + 1); > > > } > > > > > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal > > > on x86 for not being allowed to use a memop on unlock, since its being > > > forced into a load-store because of all the volatile, but whatever. > > > > What about trylock()? > > I.e. one could implement trylock() without a loop, by letting > > trylock() fail if the SC fails. > > That looks safe on first view, but nobody does this right now. I think it's safe for riscv LR/SC, because in spec A 8.3 section: "As a consequence of the eventuality guarantee, if some harts in an execution environment are executing constrained LR/SC loops, and no other harts or devices in the execution environment execute an unconditional store or AMO to that reservation set, then at least one hart will eventually exit its constrained LR/SC loop." So it guarantees LR/SC pair: CPU0 CPU1 ======= ======= LR addr1 LR addr1 SC addr1 // guarantee success. 
SC addr1 But it does not guarantee progress against an unconditional store from another hart (which I mentioned before): u32 a = 0x55aa66bb; u16 *ptr = (u16 *)&a; CPU0 CPU1 ========= ========= xchg16(ptr, new) while(1) WRITE_ONCE(*(ptr + 1), x); > > Not familiar with RISC-V but I'd recommend that a trylock only fails if > the lock is locked (after LR). A SC may fail for other reasons > (cacheline eviction; depending on the microarchitecture) even if the > lock is unlocked. At least on arm64 we had this issue with an > implementation having a tendency to always fail the first STXR. I think that would be a broken implementation for riscv. An SC shouldn't fail because of cache line bouncing; it should only fail because of another real write. That means the HW implementation should use a per-hart address monitor, not just grab the cache line into the exclusive state without locking down the snoop channel. I think the LR/SC implementation you mentioned gambles on success and breaks the riscv spec. Would Will's patch fix the problem you mentioned? ---- commit 9bb17be062de6f5a9c9643258951aa0935652ec3 Author: Will Deacon <will.deacon@arm.com> Date: Tue Jul 2 14:54:33 2013 +0100 ARM: locks: prefetch the destination word for write prior to strex The cost of changing a cacheline from shared to exclusive state can be significant, especially when this is triggered by an exclusive store, since it may result in having to retry the transaction. This patch prefixes our {spin,read,write}_[try]lock implementations with pldw instructions (on CPUs which support them) to try and grab the line in exclusive state from the start. arch_rwlock_t is changed to avoid using a volatile member, since this generates compiler warnings when falling back on the __builtin_prefetch intrinsic which expects a const void * argument. Acked-by: Nicolas Pitre <nico@linaro.org> Signed-off-by: Will Deacon <will.deacon@arm.com> ---- In the end, my suggestions are: - Use the ticket lock as the default - Use ARCH_USE_QUEUED_SPINLOCKS_XCHG32 for riscv qspinlock - Disable xchg16/cmpxchg16 and any sub-word atomic primitive in riscv -- Best Regards Guo Ren ML: https://lore.kernel.org/linux-csky/
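For reference, this is roughly what a sub-word xchg built on top of word-sized LR/SC looks like, and why the scenario above is a problem: the reservation covers the whole word, so unconditional stores to the neighbouring halfword keep cancelling it, and the forward-progress guarantee quoted earlier does not apply. A hedged sketch, little-endian assumed, illustrative only and not a proposal.

#include <stdint.h>

/* Sketch: 16-bit exchange emulated with a 32-bit LR/SC loop on the
 * containing word.  Another hart hammering the *other* halfword of the
 * same word invalidates the reservation on every iteration. */
static inline uint16_t xchg16_llsc(uint16_t *p, uint16_t newval)
{
	uint32_t *word = (uint32_t *)((uintptr_t)p & ~(uintptr_t)3);
	unsigned int shift = ((uintptr_t)p & 2) ? 16 : 0;
	uint32_t keep = ~((uint32_t)0xffff << shift);
	uint32_t ins = (uint32_t)newval << shift;
	uint32_t old, tmp;

	asm volatile(
	"1:	lr.w	%0, (%2)	\n"
	"	and	%1, %0, %3	\n"	/* preserve the other halfword */
	"	or	%1, %1, %4	\n"	/* insert the new value        */
	"	sc.w	%1, %1, (%2)	\n"
	"	bnez	%1, 1b		\n"
	: "=&r"(old), "=&r"(tmp)
	: "r"(word), "r"(keep), "r"(ins)
	: "memory");

	return (uint16_t)(old >> shift);
}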
Thx Peter, On Tue, Apr 13, 2021 at 4:17 PM Peter Zijlstra <peterz@infradead.org> wrote: > > On Tue, Apr 13, 2021 at 10:03:01AM +0200, Peter Zijlstra wrote: > > > For ticket locks you really only needs atomic_fetch_add() and > > smp_store_release() and an architectural guarantees that the > > atomic_fetch_add() has fwd progress under contention and that a sub-word > > store (through smp_store_release()) will fail the SC. > > > > Then you can do something like: > > > > void lock(atomic_t *lock) > > { > > u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */ > > u16 ticket = val >> 16; > > > > for (;;) { > > if (ticket == (u16)val) > > break; > > cpu_relax(); > > val = atomic_read_acquire(lock); > > } Should it be? for (;;) { if (ticket == (u16)val) { __atomic_acquire_fence(); break; } > > A possibly better might be: > > if (ticket == (u16)val) > return; Should it be? if (ticket == (u16)val) { __atomic_acquire_fence(); return; } > > atomic_cond_read_acquire(lock, ticket == (u16)VAL); > > Since that allows architectures to use WFE like constructs. > > > } > > > > void unlock(atomic_t *lock) > > { > > u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__); > > u32 val = atomic_read(lock); > > > > smp_store_release(ptr, (u16)val + 1); > > } > > > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal > > on x86 for not being allowed to use a memop on unlock, since its being > > forced into a load-store because of all the volatile, but whatever.
On Tue, Apr 13, 2021 at 6:54 PM David Laight <David.Laight@aculab.com> wrote: > > From: Catalin Marinas > > Sent: 13 April 2021 11:45 > ... > > This indeed needs some care. IIUC RISC-V has similar restrictions as arm > > here, no load/store instructions are allowed between LR and SC. You > > can't guarantee that the compiler won't spill some variable onto the > > stack. > > You can probably never guarantee the compiler won't spill to stack. > Especially if someone compiles with -O0. > > Which probably means that anything using LR/SC must be written in > asm and the C wrappers disabled. Agreed, and cmpxchg is already widely used in Linux. I think it is the last requirement for a complex atomic API, although cmpxchg has the ABA problem: CPU0 CPU1 ======= ====== do { old32 = load32; *ptr32 = new32_tmp; *ptr32 = old32; load32 = cmpxchg(ptr32, old32, new32); //still success } while (load32 != old32); That means cmpxchg only cares about the final value, not the intermediate stores. That is different from LR/SC or AMO instructions. > > David
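A compact, single-threaded stand-in for the interleaving above, using C11 atomics, for anyone who wants to see the ABA property in isolation (illustrative only; the intermediate writes are done inline instead of on a second CPU).

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	_Atomic uint32_t v = 0x55aa66bb;
	uint32_t old = atomic_load(&v);		/* "CPU0" samples the value */

	/* "CPU1" changes the value and changes it back. */
	atomic_store(&v, 0x11111111u);
	atomic_store(&v, 0x55aa66bbu);

	/* CPU0's CAS still succeeds: it only compares the final value and
	 * never sees the intermediate stores.  On real LR/SC hardware,
	 * stores from another hart to the reserved word would have
	 * cancelled the reservation instead. */
	if (atomic_compare_exchange_strong(&v, &old, 0xdeadbeefu))
		printf("CAS succeeded despite intermediate writes (ABA)\n");

	return 0;
}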
On Wed, Apr 14, 2021 at 10:26:57AM +0800, Guo Ren wrote: > Thx Peter, > > On Tue, Apr 13, 2021 at 4:17 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > On Tue, Apr 13, 2021 at 10:03:01AM +0200, Peter Zijlstra wrote: > > > > > For ticket locks you really only needs atomic_fetch_add() and > > > smp_store_release() and an architectural guarantees that the > > > atomic_fetch_add() has fwd progress under contention and that a sub-word > > > store (through smp_store_release()) will fail the SC. > > > > > > Then you can do something like: > > > > > > void lock(atomic_t *lock) > > > { > > > u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */ > > > u16 ticket = val >> 16; > > > > > > for (;;) { > > > if (ticket == (u16)val) > > > break; > > > cpu_relax(); > > > val = atomic_read_acquire(lock); > > > } > Should it be? > for (;;) { > if (ticket == (u16)val) { > __atomic_acquire_fence(); > break; > } No, atomic_fetch_add() is full smp_mb(), it even has a comment on that says so. Also, __atomic_acquire_fence() is an implementation detail of atomic, and architectures need not provide it. On top of that, IIRC the atomic _acquire/_release have RCpc ordering, where we want our locks to have RCsc ordering (and very much not weaker than RCtso). Even more so, adding barriers to atomics should really not be conditional.
On Wed, Apr 14, 2021 at 09:08:18AM +0200, Peter Zijlstra wrote: > On Wed, Apr 14, 2021 at 10:26:57AM +0800, Guo Ren wrote: > > Thx Peter, > > > > On Tue, Apr 13, 2021 at 4:17 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > > > On Tue, Apr 13, 2021 at 10:03:01AM +0200, Peter Zijlstra wrote: > > > > > > > For ticket locks you really only needs atomic_fetch_add() and > > > > smp_store_release() and an architectural guarantees that the > > > > atomic_fetch_add() has fwd progress under contention and that a sub-word > > > > store (through smp_store_release()) will fail the SC. > > > > > > > > Then you can do something like: > > > > > > > > void lock(atomic_t *lock) > > > > { > > > > u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */ > > > > u16 ticket = val >> 16; > > > > > > > > for (;;) { > > > > if (ticket == (u16)val) > > > > break; > > > > cpu_relax(); > > > > val = atomic_read_acquire(lock); > > > > } > > Should it be? > > for (;;) { > > if (ticket == (u16)val) { > > __atomic_acquire_fence(); > > break; > > } > > No, atomic_fetch_add() is full smp_mb(), it even has a comment on that > says so. > > Also, __atomic_acquire_fence() is an implementation detail of atomic, > and architectures need not provide it. On top of that, IIRC the atomic > _acquire/_release have RCpc ordering, where we want our locks to have > RCsc ordering (and very much not weaker than RCtso). Even more so, > adding barriers to atomics should really not be conditional. That made me look at the qspinlock code, and queued_spin_*lock() uses atomic_try_cmpxchg_acquire(), which means any arch that uses qspinlock and has RCpc atomics will give us massive pain. Current archs using qspinlock are: x86, arm64, power, sparc64, mips and openrisc (WTF?!). Of those, x86 and sparc are TSO archs with SC atomics, arm64 has RCsc atomics, power has RCtso atomics (and is the arch we all hate for having RCtso locks). Now MIPS has all sorts of ill specified barriers, but last time looked at it it didn't actually use any of that and stuck to using smp_mb(), so it will have RCsc atomics. /me goes look at wth openrisc is.. doesn't even appear to have asm/barrier.h :-/ Looking at wikipedia it also doesn't appear to actually have hardware ... I'm thinking openrisc is a prime candidate for this ticket_lock.h we're all talking about.
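For context, the qspinlock fast path being referred to is essentially a single acquire-ordered try_cmpxchg, roughly as below (paraphrased from the asm-generic header of that era, so treat it as a sketch rather than an exact quote). An architecture whose acquire atomics are only RCpc would therefore give the lock weaker ordering than intended.

/* Roughly the qspinlock fast path under discussion. */
static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
	int val = 0;

	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
		return;

	queued_spin_lock_slowpath(lock, val);
}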
On Wed, Apr 14, 2021 at 08:23:51AM +0800, Guo Ren wrote: > On Tue, Apr 13, 2021 at 5:31 PM Catalin Marinas <catalin.marinas@arm.com> wrote: > > On Tue, Apr 13, 2021 at 11:22:40AM +0200, Christoph Müllner wrote: > > > On Tue, Apr 13, 2021 at 10:03 AM Peter Zijlstra <peterz@infradead.org> wrote: > > > > On Mon, Apr 12, 2021 at 11:54:55PM +0200, Christoph Müllner wrote: > > > > > On Mon, Apr 12, 2021 at 7:33 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > > > > > My plan is to add a generic ticket-based lock, which can be selected at > > > > > > compile time. It'll have no architecture dependencies (though it'll > > > > > > likely have some hooks for architectures that can make this go faster). > > > > > > Users can then just pick which spinlock flavor they want, with the idea > > > > > > being that smaller systems will perform better with ticket locks and > > > > > > larger systems will perform better with queued locks. The main goal > > > > > > here is to give the less widely used architectures an easy way to have > > > > > > fair locks, as right now we've got a lot of code duplication because any > > > > > > architecture that wants ticket locks has to do it themselves. > > > > > > > > > > In the case of LL/SC sequences, we have a maximum of 16 instructions > > > > > on RISC-V. My concern with a pure-C implementation would be that > > > > > we cannot guarantee this (e.g. somebody wants to compile with -O0) > > > > > and I don't know of a way to abort the build in case this limit exceeds. > > > > > Therefore I have preferred inline assembly for OpenSBI (my initial idea > > > > > was to use closure-like LL/SC macros, where you can write the loop > > > > > in form of C code). > > > > > > > > For ticket locks you really only needs atomic_fetch_add() and > > > > smp_store_release() and an architectural guarantees that the > > > > atomic_fetch_add() has fwd progress under contention and that a sub-word > > > > store (through smp_store_release()) will fail the SC. > > > > > > > > Then you can do something like: > > > > > > > > void lock(atomic_t *lock) > > > > { > > > > u32 val = atomic_fetch_add(1<<16, lock); /* SC, gives us RCsc */ > > > > u16 ticket = val >> 16; > > > > > > > > for (;;) { > > > > if (ticket == (u16)val) > > > > break; > > > > cpu_relax(); > > > > val = atomic_read_acquire(lock); > > > > } > > > > } > > > > > > > > void unlock(atomic_t *lock) > > > > { > > > > u16 *ptr = (u16 *)lock + (!!__BIG_ENDIAN__); > > > > u32 val = atomic_read(lock); > > > > > > > > smp_store_release(ptr, (u16)val + 1); > > > > } > > > > > > > > That's _almost_ as simple as a test-and-set :-) It isn't quite optimal > > > > on x86 for not being allowed to use a memop on unlock, since its being > > > > forced into a load-store because of all the volatile, but whatever. > > > > > > What about trylock()? > > > I.e. one could implement trylock() without a loop, by letting > > > trylock() fail if the SC fails. > > > That looks safe on first view, but nobody does this right now. > > I think it's safe for riscv LR/SC, because in spec A 8.3 section: > "As a consequence of the eventuality guarantee, if some harts in an > execution environment are executing constrained LR/SC loops, and no > other harts or devices in the execution environment execute an > unconditional store or AMO to that reservation set, then at least one > hart will eventually exit its constrained LR/SC loop." This is clearly talking about _loops_ and that one hart will _eventually_ exit the loop. 
It does not say that there is a guaranteed LR/SC successful sequence or single iteration of the loop. > So it guarantees LR/SC pair: > > CPU0 CPU1 > ======= ======= > LR addr1 > LR addr1 > SC addr1 // guarantee success. > SC addr1 I don't see the RISC-V spec guaranteeing the (eventual) success of the SC on CPU1 _without_ a loop. > But not guarantee, another hart unconditional store (which I mentioned before): > u32 a = 0x55aa66bb; > u16 *ptr = &a; > > CPU0 CPU1 > ========= ========= > xchg16(ptr, new) while(1) > WRITE_ONCE(*(ptr + 1), x); If xchg16() is implemented with LR/SC, that's not guaranteed either. If it implemented as some form of swap, the architecture may guarantee it's success (or more like it won't deadlock). > > Not familiar with RISC-V but I'd recommend that a trylock only fails if > > the lock is locked (after LR). A SC may fail for other reasons > > (cacheline eviction; depending on the microarchitecture) even if the > > lock is unlocked. At least on arm64 we had this issue with an > > implementation having a tendency to always fail the first STXR. > > I think it's a broken implementation for riscv. SC couldn't fail by > cache line bouncing and only could fail by another real write. > That means the HW implementation should use a per-hart address monitor > not just grab the cache line into the exclusive state without lockdown > the SNOOP channel. > I think the implementation of LR/SC you mentioned is a gambling style > but broke the riscv spec. Arm has the notion of exclusive monitors (local, global) but an implementation may fake them by using cacheline states. And that's allowed since the monitor doesn't need to guarantee success on the first try but an eventual success of an ldxr/stxr loop. > Is the patch of Will's would fix up the problem you mentioned? > ---- > commit 9bb17be062de6f5a9c9643258951aa0935652ec3 > Author: Will Deacon <will.deacon@arm.com> > Date: Tue Jul 2 14:54:33 2013 +0100 > > ARM: locks: prefetch the destination word for write prior to strex No, that's only an optimisation. A prefetch would not guarantee that the cacheline stays in certain state. There can be a long time between the LR and SC, especially if interrupts are enabled or if you add virtualisation into the mix. The commit I was referring to is 4ecf7ccb1973 ("arm64: spinlock: retry trylock operation if strex fails on free lock").
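Translated into code, the recommendation (and the spirit of that arm64 commit) is a trylock that reports failure only when the lock is observed held, and silently retries when the SC, or its C11 analogue, fails for unrelated reasons. A hedged userspace sketch, mirroring the earlier weak-CAS example:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Sketch: trylock that only fails when the lock is actually held.
 * Spurious compare_exchange_weak failures (the C11 analogue of a failed
 * SC) are retried instead of being reported to the caller. */
static bool ticket_trylock(_Atomic uint32_t *lock)
{
	uint32_t old = atomic_load_explicit(lock, memory_order_relaxed);

	for (;;) {
		if ((old >> 16) != (old & 0xffff))
			return false;		/* held: the only failure case */

		if (atomic_compare_exchange_weak_explicit(lock, &old,
							  old + (1u << 16),
							  memory_order_acquire,
							  memory_order_relaxed))
			return true;
		/* 'old' was refreshed by the failed CAS; re-check and retry. */
	}
}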
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 87d7b52..7c56a20 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -30,6 +30,7 @@ config RISCV select ARCH_HAS_STRICT_KERNEL_RWX if MMU select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT + select ARCH_USE_QUEUED_RWLOCKS select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if 64BIT diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild index 445ccc9..e57ef80 100644 --- a/arch/riscv/include/asm/Kbuild +++ b/arch/riscv/include/asm/Kbuild @@ -3,5 +3,6 @@ generic-y += early_ioremap.h generic-y += extable.h generic-y += flat.h generic-y += kvm_para.h +generic-y += qrwlock.h generic-y += user.h generic-y += vmlinux.lds.h diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h index f4f7fa1..2c81764 100644 --- a/arch/riscv/include/asm/spinlock.h +++ b/arch/riscv/include/asm/spinlock.h @@ -7,129 +7,91 @@ #ifndef _ASM_RISCV_SPINLOCK_H #define _ASM_RISCV_SPINLOCK_H -#include <linux/kernel.h> -#include <asm/current.h> -#include <asm/fence.h> - /* - * Simple spin lock operations. These provide no fairness guarantees. + * Ticket-based spin-locking. */ +static inline void arch_spin_lock(arch_spinlock_t *lock) +{ + arch_spinlock_t lockval; + u32 tmp; + + asm volatile ( + "1: lr.w %0, %2 \n" + " mv %1, %0 \n" + " addw %0, %0, %3 \n" + " sc.w %0, %0, %2 \n" + " bnez %0, 1b \n" + : "=&r" (tmp), "=&r" (lockval), "+A" (lock->lock) + : "r" (1 << TICKET_NEXT) + : "memory"); -/* FIXME: Replace this with a ticket lock, like MIPS. */ - -#define arch_spin_is_locked(x) (READ_ONCE((x)->lock) != 0) + while (lockval.tickets.next != lockval.tickets.owner) { + /* + * FIXME - we need wfi/wfe here to prevent: + * - cache line bouncing + * - saving cpu pipeline in multi-harts-per-core + * processor + */ + lockval.tickets.owner = READ_ONCE(lock->tickets.owner); + } -static inline void arch_spin_unlock(arch_spinlock_t *lock) -{ - smp_store_release(&lock->lock, 0); + __atomic_acquire_fence(); } static inline int arch_spin_trylock(arch_spinlock_t *lock) { - int tmp = 1, busy; - - __asm__ __volatile__ ( - " amoswap.w %0, %2, %1\n" - RISCV_ACQUIRE_BARRIER - : "=r" (busy), "+A" (lock->lock) - : "r" (tmp) + u32 tmp, contended, res; + + do { + asm volatile ( + " lr.w %0, %3 \n" + " srliw %1, %0, %5 \n" + " slliw %2, %0, %5 \n" + " or %1, %2, %1 \n" + " li %2, 0 \n" + " sub %1, %1, %0 \n" + " bnez %1, 1f \n" + " addw %0, %0, %4 \n" + " sc.w %2, %0, %3 \n" + "1: \n" + : "=&r" (tmp), "=&r" (contended), "=&r" (res), + "+A" (lock->lock) + : "r" (1 << TICKET_NEXT), "I" (TICKET_NEXT) : "memory"); + } while (res); - return !busy; -} - -static inline void arch_spin_lock(arch_spinlock_t *lock) -{ - while (1) { - if (arch_spin_is_locked(lock)) - continue; - - if (arch_spin_trylock(lock)) - break; + if (!contended) { + __atomic_acquire_fence(); + return 1; + } else { + return 0; } } -/***********************************************************/ - -static inline void arch_read_lock(arch_rwlock_t *lock) +static inline void arch_spin_unlock(arch_spinlock_t *lock) { - int tmp; - - __asm__ __volatile__( - "1: lr.w %1, %0\n" - " bltz %1, 1b\n" - " addi %1, %1, 1\n" - " sc.w %1, %1, %0\n" - " bnez %1, 1b\n" - RISCV_ACQUIRE_BARRIER - : "+A" (lock->lock), "=&r" (tmp) - :: "memory"); + smp_store_release(&lock->tickets.owner, lock->tickets.owner + 1); + /* FIXME - we need ipi/sev here to notify above */ } -static inline void 
arch_write_lock(arch_rwlock_t *lock) +static inline int arch_spin_value_unlocked(arch_spinlock_t lock) { - int tmp; - - __asm__ __volatile__( - "1: lr.w %1, %0\n" - " bnez %1, 1b\n" - " li %1, -1\n" - " sc.w %1, %1, %0\n" - " bnez %1, 1b\n" - RISCV_ACQUIRE_BARRIER - : "+A" (lock->lock), "=&r" (tmp) - :: "memory"); + return lock.tickets.owner == lock.tickets.next; } -static inline int arch_read_trylock(arch_rwlock_t *lock) +static inline int arch_spin_is_locked(arch_spinlock_t *lock) { - int busy; - - __asm__ __volatile__( - "1: lr.w %1, %0\n" - " bltz %1, 1f\n" - " addi %1, %1, 1\n" - " sc.w %1, %1, %0\n" - " bnez %1, 1b\n" - RISCV_ACQUIRE_BARRIER - "1:\n" - : "+A" (lock->lock), "=&r" (busy) - :: "memory"); - - return !busy; + return !arch_spin_value_unlocked(READ_ONCE(*lock)); } -static inline int arch_write_trylock(arch_rwlock_t *lock) +static inline int arch_spin_is_contended(arch_spinlock_t *lock) { - int busy; - - __asm__ __volatile__( - "1: lr.w %1, %0\n" - " bnez %1, 1f\n" - " li %1, -1\n" - " sc.w %1, %1, %0\n" - " bnez %1, 1b\n" - RISCV_ACQUIRE_BARRIER - "1:\n" - : "+A" (lock->lock), "=&r" (busy) - :: "memory"); + struct __raw_tickets tickets = READ_ONCE(lock->tickets); - return !busy; + return (tickets.next - tickets.owner) > 1; } +#define arch_spin_is_contended arch_spin_is_contended -static inline void arch_read_unlock(arch_rwlock_t *lock) -{ - __asm__ __volatile__( - RISCV_RELEASE_BARRIER - " amoadd.w x0, %1, %0\n" - : "+A" (lock->lock) - : "r" (-1) - : "memory"); -} - -static inline void arch_write_unlock(arch_rwlock_t *lock) -{ - smp_store_release(&lock->lock, 0); -} +#include <asm/qrwlock.h> #endif /* _ASM_RISCV_SPINLOCK_H */ diff --git a/arch/riscv/include/asm/spinlock_types.h b/arch/riscv/include/asm/spinlock_types.h index f398e76..d7b38bf 100644 --- a/arch/riscv/include/asm/spinlock_types.h +++ b/arch/riscv/include/asm/spinlock_types.h @@ -10,16 +10,21 @@ # error "please don't include this file directly" #endif +#define TICKET_NEXT 16 + typedef struct { - volatile unsigned int lock; + union { + u32 lock; + struct __raw_tickets { + /* little endian */ + u16 owner; + u16 next; + } tickets; + }; } arch_spinlock_t; -#define __ARCH_SPIN_LOCK_UNLOCKED { 0 } - -typedef struct { - volatile unsigned int lock; -} arch_rwlock_t; +#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } -#define __ARCH_RW_LOCK_UNLOCKED { 0 } +#include <asm-generic/qrwlock_types.h> #endif /* _ASM_RISCV_SPINLOCK_TYPES_H */