diff mbox series

[V11,05/17] riscv: qspinlock: Add basic queued_spinlock support

Message ID 20230910082911.3378782-6-guoren@kernel.org (mailing list archive)
State Changes Requested
Headers show
Series riscv: Add Native/Paravirt qspinlock support | expand

Checks

Context Check Description
conchuod/tree_selection fail Failed to apply to next/pending-fixes, riscv/for-next or riscv/master

Commit Message

Guo Ren Sept. 10, 2023, 8:28 a.m. UTC
From: Guo Ren <guoren@linux.alibaba.com>

The requirements of qspinlock have been documented by commit:
a8ad07e5240c ("asm-generic: qspinlock: Indicate the use of mixed-size
atomics").

Although RISC-V ISA gives out a weaker forward guarantee LR/SC, which
doesn't satisfy the requirements of qspinlock above, it won't prevent
some riscv vendors from implementing a strong fwd guarantee LR/SC in
microarchitecture to match xchg_tail requirement. T-HEAD C9xx processor
is the one.

We've tested the patch on SOPHGO sg2042 & th1520 and passed the stress
test on Fedora & Ubuntu & OpenEuler ... Here is the performance
comparison between qspinlock and ticket_lock on sg2042 (64 cores):

sysbench test=threads threads=32 yields=100 lock=8 (+13.8%):
  queued_spinlock 0.5109/0.00
  ticket_spinlock 0.5814/0.00

perf futex/hash (+6.7%):
  queued_spinlock 1444393 operations/sec (+- 0.09%)
  ticket_spinlock 1353215 operations/sec (+- 0.15%)

perf futex/wake-parallel (+8.6%):
  queued_spinlock (waking 1/64 threads) in 0.0253 ms (+-2.90%)
  ticket_spinlock (waking 1/64 threads) in 0.0275 ms (+-3.12%)

perf futex/requeue (+4.2%):
  queued_spinlock Requeued 64 of 64 threads in 0.0785 ms (+-0.55%)
  ticket_spinlock Requeued 64 of 64 threads in 0.0818 ms (+-4.12%)

System Benchmarks (+6.4%)
  queued_spinlock:
    System Benchmarks Index Values               BASELINE       RESULT    INDEX
    Dhrystone 2 using register variables         116700.0  628613745.4  53865.8
    Double-Precision Whetstone                       55.0     182422.8  33167.8
    Execl Throughput                                 43.0      13116.6   3050.4
    File Copy 1024 bufsize 2000 maxblocks          3960.0    7762306.2  19601.8
    File Copy 256 bufsize 500 maxblocks            1655.0    3417556.8  20649.9
    File Copy 4096 bufsize 8000 maxblocks          5800.0    7427995.7  12806.9
    Pipe Throughput                               12440.0   23058600.5  18535.9
    Pipe-based Context Switching                   4000.0    2835617.7   7089.0
    Process Creation                                126.0      12537.3    995.0
    Shell Scripts (1 concurrent)                     42.4      57057.4  13456.9
    Shell Scripts (8 concurrent)                      6.0       7367.1  12278.5
    System Call Overhead                          15000.0   33308301.3  22205.5
                                                                       ========
    System Benchmarks Index Score                                       12426.1

  ticket_spinlock:
    System Benchmarks Index Values               BASELINE       RESULT    INDEX
    Dhrystone 2 using register variables         116700.0  626541701.9  53688.2
    Double-Precision Whetstone                       55.0     181921.0  33076.5
    Execl Throughput                                 43.0      12625.1   2936.1
    File Copy 1024 bufsize 2000 maxblocks          3960.0    6553792.9  16550.0
    File Copy 256 bufsize 500 maxblocks            1655.0    3189231.6  19270.3
    File Copy 4096 bufsize 8000 maxblocks          5800.0    7221277.0  12450.5
    Pipe Throughput                               12440.0   20594018.7  16554.7
    Pipe-based Context Switching                   4000.0    2571117.7   6427.8
    Process Creation                                126.0      10798.4    857.0
    Shell Scripts (1 concurrent)                     42.4      57227.5  13497.1
    Shell Scripts (8 concurrent)                      6.0       7329.2  12215.3
    System Call Overhead                          15000.0   30766778.4  20511.2
                                                                       ========
    System Benchmarks Index Score                                       11670.7

The qspinlock has a significant improvement on SOPHGO SG2042 64
cores platform than the ticket_lock.

Signed-off-by: Guo Ren <guoren@kernel.org>
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/riscv/Kconfig                | 16 ++++++++++++++++
 arch/riscv/include/asm/Kbuild     |  3 ++-
 arch/riscv/include/asm/spinlock.h | 17 +++++++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 arch/riscv/include/asm/spinlock.h

Comments

Leonardo Bras Sept. 13, 2023, 8:28 p.m. UTC | #1
On Sun, Sep 10, 2023 at 04:28:59AM -0400, guoren@kernel.org wrote:
> From: Guo Ren <guoren@linux.alibaba.com>
> 
> The requirements of qspinlock have been documented by commit:
> a8ad07e5240c ("asm-generic: qspinlock: Indicate the use of mixed-size
> atomics").
> 
> Although RISC-V ISA gives out a weaker forward guarantee LR/SC, which
> doesn't satisfy the requirements of qspinlock above, it won't prevent
> some riscv vendors from implementing a strong fwd guarantee LR/SC in
> microarchitecture to match xchg_tail requirement. T-HEAD C9xx processor
> is the one.
> 
> We've tested the patch on SOPHGO sg2042 & th1520 and passed the stress
> test on Fedora & Ubuntu & OpenEuler ... Here is the performance
> comparison between qspinlock and ticket_lock on sg2042 (64 cores):
> 
> sysbench test=threads threads=32 yields=100 lock=8 (+13.8%):
>   queued_spinlock 0.5109/0.00
>   ticket_spinlock 0.5814/0.00
> 
> perf futex/hash (+6.7%):
>   queued_spinlock 1444393 operations/sec (+- 0.09%)
>   ticket_spinlock 1353215 operations/sec (+- 0.15%)
> 
> perf futex/wake-parallel (+8.6%):
>   queued_spinlock (waking 1/64 threads) in 0.0253 ms (+-2.90%)
>   ticket_spinlock (waking 1/64 threads) in 0.0275 ms (+-3.12%)
> 
> perf futex/requeue (+4.2%):
>   queued_spinlock Requeued 64 of 64 threads in 0.0785 ms (+-0.55%)
>   ticket_spinlock Requeued 64 of 64 threads in 0.0818 ms (+-4.12%)
> 
> System Benchmarks (+6.4%)
>   queued_spinlock:
>     System Benchmarks Index Values               BASELINE       RESULT    INDEX
>     Dhrystone 2 using register variables         116700.0  628613745.4  53865.8
>     Double-Precision Whetstone                       55.0     182422.8  33167.8
>     Execl Throughput                                 43.0      13116.6   3050.4
>     File Copy 1024 bufsize 2000 maxblocks          3960.0    7762306.2  19601.8
>     File Copy 256 bufsize 500 maxblocks            1655.0    3417556.8  20649.9
>     File Copy 4096 bufsize 8000 maxblocks          5800.0    7427995.7  12806.9
>     Pipe Throughput                               12440.0   23058600.5  18535.9
>     Pipe-based Context Switching                   4000.0    2835617.7   7089.0
>     Process Creation                                126.0      12537.3    995.0
>     Shell Scripts (1 concurrent)                     42.4      57057.4  13456.9
>     Shell Scripts (8 concurrent)                      6.0       7367.1  12278.5
>     System Call Overhead                          15000.0   33308301.3  22205.5
>                                                                        ========
>     System Benchmarks Index Score                                       12426.1
> 
>   ticket_spinlock:
>     System Benchmarks Index Values               BASELINE       RESULT    INDEX
>     Dhrystone 2 using register variables         116700.0  626541701.9  53688.2
>     Double-Precision Whetstone                       55.0     181921.0  33076.5
>     Execl Throughput                                 43.0      12625.1   2936.1
>     File Copy 1024 bufsize 2000 maxblocks          3960.0    6553792.9  16550.0
>     File Copy 256 bufsize 500 maxblocks            1655.0    3189231.6  19270.3
>     File Copy 4096 bufsize 8000 maxblocks          5800.0    7221277.0  12450.5
>     Pipe Throughput                               12440.0   20594018.7  16554.7
>     Pipe-based Context Switching                   4000.0    2571117.7   6427.8
>     Process Creation                                126.0      10798.4    857.0
>     Shell Scripts (1 concurrent)                     42.4      57227.5  13497.1
>     Shell Scripts (8 concurrent)                      6.0       7329.2  12215.3
>     System Call Overhead                          15000.0   30766778.4  20511.2
>                                                                        ========
>     System Benchmarks Index Score                                       11670.7
> 
> The qspinlock has a significant improvement on SOPHGO SG2042 64
> cores platform than the ticket_lock.
> 
> Signed-off-by: Guo Ren <guoren@kernel.org>
> Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> ---
>  arch/riscv/Kconfig                | 16 ++++++++++++++++
>  arch/riscv/include/asm/Kbuild     |  3 ++-
>  arch/riscv/include/asm/spinlock.h | 17 +++++++++++++++++
>  3 files changed, 35 insertions(+), 1 deletion(-)
>  create mode 100644 arch/riscv/include/asm/spinlock.h
> 
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 2c346fe169c1..7f39bfc75744 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -471,6 +471,22 @@ config NODES_SHIFT
>  	  Specify the maximum number of NUMA Nodes available on the target
>  	  system.  Increases memory reserved to accommodate various tables.
>  
> +choice
> +	prompt "RISC-V spinlock type"
> +	default RISCV_TICKET_SPINLOCKS
> +
> +config RISCV_TICKET_SPINLOCKS
> +	bool "Using ticket spinlock"
> +
> +config RISCV_QUEUED_SPINLOCKS
> +	bool "Using queued spinlock"
> +	depends on SMP && MMU
> +	select ARCH_USE_QUEUED_SPINLOCKS
> +	help
> +	  Make sure your micro arch LL/SC has a strong forward progress guarantee.
> +	  Otherwise, stay at ticket-lock.
> +endchoice
> +
>  config RISCV_ALTERNATIVE
>  	bool
>  	depends on !XIP_KERNEL
> diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
> index 504f8b7e72d4..a0dc85e4a754 100644
> --- a/arch/riscv/include/asm/Kbuild
> +++ b/arch/riscv/include/asm/Kbuild
> @@ -2,10 +2,11 @@
>  generic-y += early_ioremap.h
>  generic-y += flat.h
>  generic-y += kvm_para.h
> +generic-y += mcs_spinlock.h
>  generic-y += parport.h
> -generic-y += spinlock.h

IIUC here you take the asm-generic/spinlock.h (which defines arch_spin_*()) 
and include the asm-generic headers of mcs_spinlock and qspinlock. 

In this case, the qspinlock.h will provide the arch_spin_*() interfaces, 
which seems the oposite of the above description (ticket spinlocks being 
the standard).

Shouldn't ticket-spinlock.h also get included here?
(Also, I am probably missing something, as I dont' see the use of 
mcs_spinlock here.)

>  generic-y += spinlock_types.h
>  generic-y += qrwlock.h
>  generic-y += qrwlock_types.h
> +generic-y += qspinlock.h
>  generic-y += user.h
>  generic-y += vmlinux.lds.h
> diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
> new file mode 100644
> index 000000000000..c644a92d4548
> --- /dev/null
> +++ b/arch/riscv/include/asm/spinlock.h
> @@ -0,0 +1,17 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __ASM_RISCV_SPINLOCK_H
> +#define __ASM_RISCV_SPINLOCK_H
> +
> +#ifdef CONFIG_QUEUED_SPINLOCKS
> +#define _Q_PENDING_LOOPS	(1 << 9)
> +#endif

Any reason the above define couldn't be merged on the ifdef below?

> +
> +#ifdef CONFIG_QUEUED_SPINLOCKS
> +#include <asm/qspinlock.h>
> +#include <asm/qrwlock.h>
> +#else
> +#include <asm-generic/spinlock.h>
> +#endif
> +
> +#endif /* __ASM_RISCV_SPINLOCK_H */
> -- 
> 2.36.1
> 

Thanks!
Leo
Guo Ren Sept. 14, 2023, 4:46 a.m. UTC | #2
On Thu, Sep 14, 2023 at 4:29 AM Leonardo Bras <leobras@redhat.com> wrote:
>
> On Sun, Sep 10, 2023 at 04:28:59AM -0400, guoren@kernel.org wrote:
> > From: Guo Ren <guoren@linux.alibaba.com>
> >
> > The requirements of qspinlock have been documented by commit:
> > a8ad07e5240c ("asm-generic: qspinlock: Indicate the use of mixed-size
> > atomics").
> >
> > Although RISC-V ISA gives out a weaker forward guarantee LR/SC, which
> > doesn't satisfy the requirements of qspinlock above, it won't prevent
> > some riscv vendors from implementing a strong fwd guarantee LR/SC in
> > microarchitecture to match xchg_tail requirement. T-HEAD C9xx processor
> > is the one.
> >
> > We've tested the patch on SOPHGO sg2042 & th1520 and passed the stress
> > test on Fedora & Ubuntu & OpenEuler ... Here is the performance
> > comparison between qspinlock and ticket_lock on sg2042 (64 cores):
> >
> > sysbench test=threads threads=32 yields=100 lock=8 (+13.8%):
> >   queued_spinlock 0.5109/0.00
> >   ticket_spinlock 0.5814/0.00
> >
> > perf futex/hash (+6.7%):
> >   queued_spinlock 1444393 operations/sec (+- 0.09%)
> >   ticket_spinlock 1353215 operations/sec (+- 0.15%)
> >
> > perf futex/wake-parallel (+8.6%):
> >   queued_spinlock (waking 1/64 threads) in 0.0253 ms (+-2.90%)
> >   ticket_spinlock (waking 1/64 threads) in 0.0275 ms (+-3.12%)
> >
> > perf futex/requeue (+4.2%):
> >   queued_spinlock Requeued 64 of 64 threads in 0.0785 ms (+-0.55%)
> >   ticket_spinlock Requeued 64 of 64 threads in 0.0818 ms (+-4.12%)
> >
> > System Benchmarks (+6.4%)
> >   queued_spinlock:
> >     System Benchmarks Index Values               BASELINE       RESULT    INDEX
> >     Dhrystone 2 using register variables         116700.0  628613745.4  53865.8
> >     Double-Precision Whetstone                       55.0     182422.8  33167.8
> >     Execl Throughput                                 43.0      13116.6   3050.4
> >     File Copy 1024 bufsize 2000 maxblocks          3960.0    7762306.2  19601.8
> >     File Copy 256 bufsize 500 maxblocks            1655.0    3417556.8  20649.9
> >     File Copy 4096 bufsize 8000 maxblocks          5800.0    7427995.7  12806.9
> >     Pipe Throughput                               12440.0   23058600.5  18535.9
> >     Pipe-based Context Switching                   4000.0    2835617.7   7089.0
> >     Process Creation                                126.0      12537.3    995.0
> >     Shell Scripts (1 concurrent)                     42.4      57057.4  13456.9
> >     Shell Scripts (8 concurrent)                      6.0       7367.1  12278.5
> >     System Call Overhead                          15000.0   33308301.3  22205.5
> >                                                                        ========
> >     System Benchmarks Index Score                                       12426.1
> >
> >   ticket_spinlock:
> >     System Benchmarks Index Values               BASELINE       RESULT    INDEX
> >     Dhrystone 2 using register variables         116700.0  626541701.9  53688.2
> >     Double-Precision Whetstone                       55.0     181921.0  33076.5
> >     Execl Throughput                                 43.0      12625.1   2936.1
> >     File Copy 1024 bufsize 2000 maxblocks          3960.0    6553792.9  16550.0
> >     File Copy 256 bufsize 500 maxblocks            1655.0    3189231.6  19270.3
> >     File Copy 4096 bufsize 8000 maxblocks          5800.0    7221277.0  12450.5
> >     Pipe Throughput                               12440.0   20594018.7  16554.7
> >     Pipe-based Context Switching                   4000.0    2571117.7   6427.8
> >     Process Creation                                126.0      10798.4    857.0
> >     Shell Scripts (1 concurrent)                     42.4      57227.5  13497.1
> >     Shell Scripts (8 concurrent)                      6.0       7329.2  12215.3
> >     System Call Overhead                          15000.0   30766778.4  20511.2
> >                                                                        ========
> >     System Benchmarks Index Score                                       11670.7
> >
> > The qspinlock has a significant improvement on SOPHGO SG2042 64
> > cores platform than the ticket_lock.
> >
> > Signed-off-by: Guo Ren <guoren@kernel.org>
> > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > ---
> >  arch/riscv/Kconfig                | 16 ++++++++++++++++
> >  arch/riscv/include/asm/Kbuild     |  3 ++-
> >  arch/riscv/include/asm/spinlock.h | 17 +++++++++++++++++
> >  3 files changed, 35 insertions(+), 1 deletion(-)
> >  create mode 100644 arch/riscv/include/asm/spinlock.h
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index 2c346fe169c1..7f39bfc75744 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -471,6 +471,22 @@ config NODES_SHIFT
> >         Specify the maximum number of NUMA Nodes available on the target
> >         system.  Increases memory reserved to accommodate various tables.
> >
> > +choice
> > +     prompt "RISC-V spinlock type"
> > +     default RISCV_TICKET_SPINLOCKS
> > +
> > +config RISCV_TICKET_SPINLOCKS
> > +     bool "Using ticket spinlock"
> > +
> > +config RISCV_QUEUED_SPINLOCKS
> > +     bool "Using queued spinlock"
> > +     depends on SMP && MMU
> > +     select ARCH_USE_QUEUED_SPINLOCKS
> > +     help
> > +       Make sure your micro arch LL/SC has a strong forward progress guarantee.
> > +       Otherwise, stay at ticket-lock.
> > +endchoice
> > +
> >  config RISCV_ALTERNATIVE
> >       bool
> >       depends on !XIP_KERNEL
> > diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
> > index 504f8b7e72d4..a0dc85e4a754 100644
> > --- a/arch/riscv/include/asm/Kbuild
> > +++ b/arch/riscv/include/asm/Kbuild
> > @@ -2,10 +2,11 @@
> >  generic-y += early_ioremap.h
> >  generic-y += flat.h
> >  generic-y += kvm_para.h
> > +generic-y += mcs_spinlock.h
> >  generic-y += parport.h
> > -generic-y += spinlock.h
>
> IIUC here you take the asm-generic/spinlock.h (which defines arch_spin_*())
> and include the asm-generic headers of mcs_spinlock and qspinlock.
>
> In this case, the qspinlock.h will provide the arch_spin_*() interfaces,
> which seems the oposite of the above description (ticket spinlocks being
> the standard).
>
> Shouldn't ticket-spinlock.h also get included here?
> (Also, I am probably missing something, as I dont' see the use of
> mcs_spinlock here.)
No, because asm-generic/spinlock.h:
...
#include <asm-generic/ticket_spinlock.h>
...

>
> >  generic-y += spinlock_types.h
> >  generic-y += qrwlock.h
> >  generic-y += qrwlock_types.h
> > +generic-y += qspinlock.h
> >  generic-y += user.h
> >  generic-y += vmlinux.lds.h
> > diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
> > new file mode 100644
> > index 000000000000..c644a92d4548
> > --- /dev/null
> > +++ b/arch/riscv/include/asm/spinlock.h
> > @@ -0,0 +1,17 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +
> > +#ifndef __ASM_RISCV_SPINLOCK_H
> > +#define __ASM_RISCV_SPINLOCK_H
> > +
> > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > +#define _Q_PENDING_LOOPS     (1 << 9)
> > +#endif
>
> Any reason the above define couldn't be merged on the ifdef below?
Easy for the next patch to modify. See Waiman's comment:

https://lore.kernel.org/linux-riscv/4cc7113a-0e4e-763a-cba2-7963bcd26c7a@redhat.com/

> diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
> index c644a92d4548..9eb3ad31e564 100644
> --- a/arch/riscv/include/asm/spinlock.h
> +++ b/arch/riscv/include/asm/spinlock.h
> @@ -7,11 +7,94 @@
>   #define _Q_PENDING_LOOPS (1 << 9)
>   #endif
>

I see why you separated the _Q_PENDING_LOOPS out.


>
> > +
> > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > +#include <asm/qspinlock.h>
> > +#include <asm/qrwlock.h>
> > +#else
> > +#include <asm-generic/spinlock.h>
> > +#endif
> > +
> > +#endif /* __ASM_RISCV_SPINLOCK_H */
> > --
> > 2.36.1
> >
>
> Thanks!
> Leo
>
Leonardo Bras Sept. 14, 2023, 9:43 a.m. UTC | #3
On Thu, Sep 14, 2023 at 12:46:56PM +0800, Guo Ren wrote:
> On Thu, Sep 14, 2023 at 4:29 AM Leonardo Bras <leobras@redhat.com> wrote:
> >
> > On Sun, Sep 10, 2023 at 04:28:59AM -0400, guoren@kernel.org wrote:
> > > From: Guo Ren <guoren@linux.alibaba.com>
> > >
> > > The requirements of qspinlock have been documented by commit:
> > > a8ad07e5240c ("asm-generic: qspinlock: Indicate the use of mixed-size
> > > atomics").
> > >
> > > Although RISC-V ISA gives out a weaker forward guarantee LR/SC, which
> > > doesn't satisfy the requirements of qspinlock above, it won't prevent
> > > some riscv vendors from implementing a strong fwd guarantee LR/SC in
> > > microarchitecture to match xchg_tail requirement. T-HEAD C9xx processor
> > > is the one.
> > >
> > > We've tested the patch on SOPHGO sg2042 & th1520 and passed the stress
> > > test on Fedora & Ubuntu & OpenEuler ... Here is the performance
> > > comparison between qspinlock and ticket_lock on sg2042 (64 cores):
> > >
> > > sysbench test=threads threads=32 yields=100 lock=8 (+13.8%):
> > >   queued_spinlock 0.5109/0.00
> > >   ticket_spinlock 0.5814/0.00
> > >
> > > perf futex/hash (+6.7%):
> > >   queued_spinlock 1444393 operations/sec (+- 0.09%)
> > >   ticket_spinlock 1353215 operations/sec (+- 0.15%)
> > >
> > > perf futex/wake-parallel (+8.6%):
> > >   queued_spinlock (waking 1/64 threads) in 0.0253 ms (+-2.90%)
> > >   ticket_spinlock (waking 1/64 threads) in 0.0275 ms (+-3.12%)
> > >
> > > perf futex/requeue (+4.2%):
> > >   queued_spinlock Requeued 64 of 64 threads in 0.0785 ms (+-0.55%)
> > >   ticket_spinlock Requeued 64 of 64 threads in 0.0818 ms (+-4.12%)
> > >
> > > System Benchmarks (+6.4%)
> > >   queued_spinlock:
> > >     System Benchmarks Index Values               BASELINE       RESULT    INDEX
> > >     Dhrystone 2 using register variables         116700.0  628613745.4  53865.8
> > >     Double-Precision Whetstone                       55.0     182422.8  33167.8
> > >     Execl Throughput                                 43.0      13116.6   3050.4
> > >     File Copy 1024 bufsize 2000 maxblocks          3960.0    7762306.2  19601.8
> > >     File Copy 256 bufsize 500 maxblocks            1655.0    3417556.8  20649.9
> > >     File Copy 4096 bufsize 8000 maxblocks          5800.0    7427995.7  12806.9
> > >     Pipe Throughput                               12440.0   23058600.5  18535.9
> > >     Pipe-based Context Switching                   4000.0    2835617.7   7089.0
> > >     Process Creation                                126.0      12537.3    995.0
> > >     Shell Scripts (1 concurrent)                     42.4      57057.4  13456.9
> > >     Shell Scripts (8 concurrent)                      6.0       7367.1  12278.5
> > >     System Call Overhead                          15000.0   33308301.3  22205.5
> > >                                                                        ========
> > >     System Benchmarks Index Score                                       12426.1
> > >
> > >   ticket_spinlock:
> > >     System Benchmarks Index Values               BASELINE       RESULT    INDEX
> > >     Dhrystone 2 using register variables         116700.0  626541701.9  53688.2
> > >     Double-Precision Whetstone                       55.0     181921.0  33076.5
> > >     Execl Throughput                                 43.0      12625.1   2936.1
> > >     File Copy 1024 bufsize 2000 maxblocks          3960.0    6553792.9  16550.0
> > >     File Copy 256 bufsize 500 maxblocks            1655.0    3189231.6  19270.3
> > >     File Copy 4096 bufsize 8000 maxblocks          5800.0    7221277.0  12450.5
> > >     Pipe Throughput                               12440.0   20594018.7  16554.7
> > >     Pipe-based Context Switching                   4000.0    2571117.7   6427.8
> > >     Process Creation                                126.0      10798.4    857.0
> > >     Shell Scripts (1 concurrent)                     42.4      57227.5  13497.1
> > >     Shell Scripts (8 concurrent)                      6.0       7329.2  12215.3
> > >     System Call Overhead                          15000.0   30766778.4  20511.2
> > >                                                                        ========
> > >     System Benchmarks Index Score                                       11670.7
> > >
> > > The qspinlock has a significant improvement on SOPHGO SG2042 64
> > > cores platform than the ticket_lock.
> > >
> > > Signed-off-by: Guo Ren <guoren@kernel.org>
> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > > ---
> > >  arch/riscv/Kconfig                | 16 ++++++++++++++++
> > >  arch/riscv/include/asm/Kbuild     |  3 ++-
> > >  arch/riscv/include/asm/spinlock.h | 17 +++++++++++++++++
> > >  3 files changed, 35 insertions(+), 1 deletion(-)
> > >  create mode 100644 arch/riscv/include/asm/spinlock.h
> > >
> > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > index 2c346fe169c1..7f39bfc75744 100644
> > > --- a/arch/riscv/Kconfig
> > > +++ b/arch/riscv/Kconfig
> > > @@ -471,6 +471,22 @@ config NODES_SHIFT
> > >         Specify the maximum number of NUMA Nodes available on the target
> > >         system.  Increases memory reserved to accommodate various tables.
> > >
> > > +choice
> > > +     prompt "RISC-V spinlock type"
> > > +     default RISCV_TICKET_SPINLOCKS
> > > +
> > > +config RISCV_TICKET_SPINLOCKS
> > > +     bool "Using ticket spinlock"
> > > +
> > > +config RISCV_QUEUED_SPINLOCKS
> > > +     bool "Using queued spinlock"
> > > +     depends on SMP && MMU
> > > +     select ARCH_USE_QUEUED_SPINLOCKS
> > > +     help
> > > +       Make sure your micro arch LL/SC has a strong forward progress guarantee.
> > > +       Otherwise, stay at ticket-lock.
> > > +endchoice
> > > +
> > >  config RISCV_ALTERNATIVE
> > >       bool
> > >       depends on !XIP_KERNEL
> > > diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
> > > index 504f8b7e72d4..a0dc85e4a754 100644
> > > --- a/arch/riscv/include/asm/Kbuild
> > > +++ b/arch/riscv/include/asm/Kbuild
> > > @@ -2,10 +2,11 @@
> > >  generic-y += early_ioremap.h
> > >  generic-y += flat.h
> > >  generic-y += kvm_para.h
> > > +generic-y += mcs_spinlock.h
> > >  generic-y += parport.h
> > > -generic-y += spinlock.h
> >
> > IIUC here you take the asm-generic/spinlock.h (which defines arch_spin_*())
> > and include the asm-generic headers of mcs_spinlock and qspinlock.
> >
> > In this case, the qspinlock.h will provide the arch_spin_*() interfaces,
> > which seems the oposite of the above description (ticket spinlocks being
> > the standard).
> >
> > Shouldn't ticket-spinlock.h also get included here?
> > (Also, I am probably missing something, as I dont' see the use of
> > mcs_spinlock here.)
> No, because asm-generic/spinlock.h:
> ...
> #include <asm-generic/ticket_spinlock.h>
> ...
> 

But aren't you removing asm-generic/spinlock.h below ?
-generic-y += spinlock.h

> >
> > >  generic-y += spinlock_types.h
> > >  generic-y += qrwlock.h
> > >  generic-y += qrwlock_types.h
> > > +generic-y += qspinlock.h
> > >  generic-y += user.h
> > >  generic-y += vmlinux.lds.h
> > > diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
> > > new file mode 100644
> > > index 000000000000..c644a92d4548
> > > --- /dev/null
> > > +++ b/arch/riscv/include/asm/spinlock.h
> > > @@ -0,0 +1,17 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > +
> > > +#ifndef __ASM_RISCV_SPINLOCK_H
> > > +#define __ASM_RISCV_SPINLOCK_H
> > > +
> > > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > > +#define _Q_PENDING_LOOPS     (1 << 9)
> > > +#endif
> >
> > Any reason the above define couldn't be merged on the ifdef below?
> Easy for the next patch to modify. See Waiman's comment:
> 
> https://lore.kernel.org/linux-riscv/4cc7113a-0e4e-763a-cba2-7963bcd26c7a@redhat.com/
> 
> > diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
> > index c644a92d4548..9eb3ad31e564 100644
> > --- a/arch/riscv/include/asm/spinlock.h
> > +++ b/arch/riscv/include/asm/spinlock.h
> > @@ -7,11 +7,94 @@
> >   #define _Q_PENDING_LOOPS (1 << 9)
> >   #endif
> >
> 
> I see why you separated the _Q_PENDING_LOOPS out.
> 

I see, should be fine then.

Thanks!
Leo

> 
> >
> > > +
> > > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > > +#include <asm/qspinlock.h>
> > > +#include <asm/qrwlock.h>
> > > +#else
> > > +#include <asm-generic/spinlock.h>
> > > +#endif
> > > +
> > > +#endif /* __ASM_RISCV_SPINLOCK_H */
> > > --
> > > 2.36.1
> > >
> >
> > Thanks!
> > Leo
> >
> 
> 
> -- 
> Best Regards
>  Guo Ren
>
Guo Ren Sept. 15, 2023, 2:10 a.m. UTC | #4
On Thu, Sep 14, 2023 at 5:43 PM Leonardo Bras <leobras@redhat.com> wrote:
>
> On Thu, Sep 14, 2023 at 12:46:56PM +0800, Guo Ren wrote:
> > On Thu, Sep 14, 2023 at 4:29 AM Leonardo Bras <leobras@redhat.com> wrote:
> > >
> > > On Sun, Sep 10, 2023 at 04:28:59AM -0400, guoren@kernel.org wrote:
> > > > From: Guo Ren <guoren@linux.alibaba.com>
> > > >
> > > > The requirements of qspinlock have been documented by commit:
> > > > a8ad07e5240c ("asm-generic: qspinlock: Indicate the use of mixed-size
> > > > atomics").
> > > >
> > > > Although RISC-V ISA gives out a weaker forward guarantee LR/SC, which
> > > > doesn't satisfy the requirements of qspinlock above, it won't prevent
> > > > some riscv vendors from implementing a strong fwd guarantee LR/SC in
> > > > microarchitecture to match xchg_tail requirement. T-HEAD C9xx processor
> > > > is the one.
> > > >
> > > > We've tested the patch on SOPHGO sg2042 & th1520 and passed the stress
> > > > test on Fedora & Ubuntu & OpenEuler ... Here is the performance
> > > > comparison between qspinlock and ticket_lock on sg2042 (64 cores):
> > > >
> > > > sysbench test=threads threads=32 yields=100 lock=8 (+13.8%):
> > > >   queued_spinlock 0.5109/0.00
> > > >   ticket_spinlock 0.5814/0.00
> > > >
> > > > perf futex/hash (+6.7%):
> > > >   queued_spinlock 1444393 operations/sec (+- 0.09%)
> > > >   ticket_spinlock 1353215 operations/sec (+- 0.15%)
> > > >
> > > > perf futex/wake-parallel (+8.6%):
> > > >   queued_spinlock (waking 1/64 threads) in 0.0253 ms (+-2.90%)
> > > >   ticket_spinlock (waking 1/64 threads) in 0.0275 ms (+-3.12%)
> > > >
> > > > perf futex/requeue (+4.2%):
> > > >   queued_spinlock Requeued 64 of 64 threads in 0.0785 ms (+-0.55%)
> > > >   ticket_spinlock Requeued 64 of 64 threads in 0.0818 ms (+-4.12%)
> > > >
> > > > System Benchmarks (+6.4%)
> > > >   queued_spinlock:
> > > >     System Benchmarks Index Values               BASELINE       RESULT    INDEX
> > > >     Dhrystone 2 using register variables         116700.0  628613745.4  53865.8
> > > >     Double-Precision Whetstone                       55.0     182422.8  33167.8
> > > >     Execl Throughput                                 43.0      13116.6   3050.4
> > > >     File Copy 1024 bufsize 2000 maxblocks          3960.0    7762306.2  19601.8
> > > >     File Copy 256 bufsize 500 maxblocks            1655.0    3417556.8  20649.9
> > > >     File Copy 4096 bufsize 8000 maxblocks          5800.0    7427995.7  12806.9
> > > >     Pipe Throughput                               12440.0   23058600.5  18535.9
> > > >     Pipe-based Context Switching                   4000.0    2835617.7   7089.0
> > > >     Process Creation                                126.0      12537.3    995.0
> > > >     Shell Scripts (1 concurrent)                     42.4      57057.4  13456.9
> > > >     Shell Scripts (8 concurrent)                      6.0       7367.1  12278.5
> > > >     System Call Overhead                          15000.0   33308301.3  22205.5
> > > >                                                                        ========
> > > >     System Benchmarks Index Score                                       12426.1
> > > >
> > > >   ticket_spinlock:
> > > >     System Benchmarks Index Values               BASELINE       RESULT    INDEX
> > > >     Dhrystone 2 using register variables         116700.0  626541701.9  53688.2
> > > >     Double-Precision Whetstone                       55.0     181921.0  33076.5
> > > >     Execl Throughput                                 43.0      12625.1   2936.1
> > > >     File Copy 1024 bufsize 2000 maxblocks          3960.0    6553792.9  16550.0
> > > >     File Copy 256 bufsize 500 maxblocks            1655.0    3189231.6  19270.3
> > > >     File Copy 4096 bufsize 8000 maxblocks          5800.0    7221277.0  12450.5
> > > >     Pipe Throughput                               12440.0   20594018.7  16554.7
> > > >     Pipe-based Context Switching                   4000.0    2571117.7   6427.8
> > > >     Process Creation                                126.0      10798.4    857.0
> > > >     Shell Scripts (1 concurrent)                     42.4      57227.5  13497.1
> > > >     Shell Scripts (8 concurrent)                      6.0       7329.2  12215.3
> > > >     System Call Overhead                          15000.0   30766778.4  20511.2
> > > >                                                                        ========
> > > >     System Benchmarks Index Score                                       11670.7
> > > >
> > > > The qspinlock has a significant improvement on SOPHGO SG2042 64
> > > > cores platform than the ticket_lock.
> > > >
> > > > Signed-off-by: Guo Ren <guoren@kernel.org>
> > > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > > > ---
> > > >  arch/riscv/Kconfig                | 16 ++++++++++++++++
> > > >  arch/riscv/include/asm/Kbuild     |  3 ++-
> > > >  arch/riscv/include/asm/spinlock.h | 17 +++++++++++++++++
> > > >  3 files changed, 35 insertions(+), 1 deletion(-)
> > > >  create mode 100644 arch/riscv/include/asm/spinlock.h
> > > >
> > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > > index 2c346fe169c1..7f39bfc75744 100644
> > > > --- a/arch/riscv/Kconfig
> > > > +++ b/arch/riscv/Kconfig
> > > > @@ -471,6 +471,22 @@ config NODES_SHIFT
> > > >         Specify the maximum number of NUMA Nodes available on the target
> > > >         system.  Increases memory reserved to accommodate various tables.
> > > >
> > > > +choice
> > > > +     prompt "RISC-V spinlock type"
> > > > +     default RISCV_TICKET_SPINLOCKS
> > > > +
> > > > +config RISCV_TICKET_SPINLOCKS
> > > > +     bool "Using ticket spinlock"
> > > > +
> > > > +config RISCV_QUEUED_SPINLOCKS
> > > > +     bool "Using queued spinlock"
> > > > +     depends on SMP && MMU
> > > > +     select ARCH_USE_QUEUED_SPINLOCKS
> > > > +     help
> > > > +       Make sure your micro arch LL/SC has a strong forward progress guarantee.
> > > > +       Otherwise, stay at ticket-lock.
> > > > +endchoice
> > > > +
> > > >  config RISCV_ALTERNATIVE
> > > >       bool
> > > >       depends on !XIP_KERNEL
> > > > diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
> > > > index 504f8b7e72d4..a0dc85e4a754 100644
> > > > --- a/arch/riscv/include/asm/Kbuild
> > > > +++ b/arch/riscv/include/asm/Kbuild
> > > > @@ -2,10 +2,11 @@
> > > >  generic-y += early_ioremap.h
> > > >  generic-y += flat.h
> > > >  generic-y += kvm_para.h
> > > > +generic-y += mcs_spinlock.h
> > > >  generic-y += parport.h
> > > > -generic-y += spinlock.h
> > >
> > > IIUC here you take the asm-generic/spinlock.h (which defines arch_spin_*())
> > > and include the asm-generic headers of mcs_spinlock and qspinlock.
> > >
> > > In this case, the qspinlock.h will provide the arch_spin_*() interfaces,
> > > which seems the oposite of the above description (ticket spinlocks being
> > > the standard).
> > >
> > > Shouldn't ticket-spinlock.h also get included here?
> > > (Also, I am probably missing something, as I dont' see the use of
> > > mcs_spinlock here.)
> > No, because asm-generic/spinlock.h:
> > ...
> > #include <asm-generic/ticket_spinlock.h>
> > ...
> >
>
> But aren't you removing asm-generic/spinlock.h below ?
> -generic-y += spinlock.h
Yes, current is:

arch/riscv/include/asm/spinlock.h -> include/asm-generic/spinlock.h ->
include/asm-generic/ticket_spinlock.h

+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#include <asm/qrwlock.h>
+#else
+#include <asm-generic/spinlock.h>
+#endif

So, you want me:
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#else
+#include <asm-generic/ticket_spinlock.h>
+#endif

+#include <asm/qrwlock.h>

Right?

>
> > >
> > > >  generic-y += spinlock_types.h
> > > >  generic-y += qrwlock.h
> > > >  generic-y += qrwlock_types.h
> > > > +generic-y += qspinlock.h
> > > >  generic-y += user.h
> > > >  generic-y += vmlinux.lds.h
> > > > diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
> > > > new file mode 100644
> > > > index 000000000000..c644a92d4548
> > > > --- /dev/null
> > > > +++ b/arch/riscv/include/asm/spinlock.h
> > > > @@ -0,0 +1,17 @@
> > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > +
> > > > +#ifndef __ASM_RISCV_SPINLOCK_H
> > > > +#define __ASM_RISCV_SPINLOCK_H
> > > > +
> > > > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > > > +#define _Q_PENDING_LOOPS     (1 << 9)
> > > > +#endif
> > >
> > > Any reason the above define couldn't be merged on the ifdef below?
> > Easy for the next patch to modify. See Waiman's comment:
> >
> > https://lore.kernel.org/linux-riscv/4cc7113a-0e4e-763a-cba2-7963bcd26c7a@redhat.com/
> >
> > > diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
> > > index c644a92d4548..9eb3ad31e564 100644
> > > --- a/arch/riscv/include/asm/spinlock.h
> > > +++ b/arch/riscv/include/asm/spinlock.h
> > > @@ -7,11 +7,94 @@
> > >   #define _Q_PENDING_LOOPS (1 << 9)
> > >   #endif
> > >
> >
> > I see why you separated the _Q_PENDING_LOOPS out.
> >
>
> I see, should be fine then.
>
> Thanks!
> Leo
>
> >
> > >
> > > > +
> > > > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > > > +#include <asm/qspinlock.h>
> > > > +#include <asm/qrwlock.h>
> > > > +#else
> > > > +#include <asm-generic/spinlock.h>
> > > > +#endif
> > > > +
> > > > +#endif /* __ASM_RISCV_SPINLOCK_H */
> > > > --
> > > > 2.36.1
> > > >
> > >
> > > Thanks!
> > > Leo
> > >
> >
> >
> > --
> > Best Regards
> >  Guo Ren
> >
>
Leonardo Bras Sept. 15, 2023, 9:08 a.m. UTC | #5
On Fri, Sep 15, 2023 at 10:10:25AM +0800, Guo Ren wrote:
> On Thu, Sep 14, 2023 at 5:43 PM Leonardo Bras <leobras@redhat.com> wrote:
> >
> > On Thu, Sep 14, 2023 at 12:46:56PM +0800, Guo Ren wrote:
> > > On Thu, Sep 14, 2023 at 4:29 AM Leonardo Bras <leobras@redhat.com> wrote:
> > > >
> > > > On Sun, Sep 10, 2023 at 04:28:59AM -0400, guoren@kernel.org wrote:
> > > > > From: Guo Ren <guoren@linux.alibaba.com>
> > > > >
> > > > > The requirements of qspinlock have been documented by commit:
> > > > > a8ad07e5240c ("asm-generic: qspinlock: Indicate the use of mixed-size
> > > > > atomics").
> > > > >
> > > > > Although RISC-V ISA gives out a weaker forward guarantee LR/SC, which
> > > > > doesn't satisfy the requirements of qspinlock above, it won't prevent
> > > > > some riscv vendors from implementing a strong fwd guarantee LR/SC in
> > > > > microarchitecture to match xchg_tail requirement. T-HEAD C9xx processor
> > > > > is the one.
> > > > >
> > > > > We've tested the patch on SOPHGO sg2042 & th1520 and passed the stress
> > > > > test on Fedora & Ubuntu & OpenEuler ... Here is the performance
> > > > > comparison between qspinlock and ticket_lock on sg2042 (64 cores):
> > > > >
> > > > > sysbench test=threads threads=32 yields=100 lock=8 (+13.8%):
> > > > >   queued_spinlock 0.5109/0.00
> > > > >   ticket_spinlock 0.5814/0.00
> > > > >
> > > > > perf futex/hash (+6.7%):
> > > > >   queued_spinlock 1444393 operations/sec (+- 0.09%)
> > > > >   ticket_spinlock 1353215 operations/sec (+- 0.15%)
> > > > >
> > > > > perf futex/wake-parallel (+8.6%):
> > > > >   queued_spinlock (waking 1/64 threads) in 0.0253 ms (+-2.90%)
> > > > >   ticket_spinlock (waking 1/64 threads) in 0.0275 ms (+-3.12%)
> > > > >
> > > > > perf futex/requeue (+4.2%):
> > > > >   queued_spinlock Requeued 64 of 64 threads in 0.0785 ms (+-0.55%)
> > > > >   ticket_spinlock Requeued 64 of 64 threads in 0.0818 ms (+-4.12%)
> > > > >
> > > > > System Benchmarks (+6.4%)
> > > > >   queued_spinlock:
> > > > >     System Benchmarks Index Values               BASELINE       RESULT    INDEX
> > > > >     Dhrystone 2 using register variables         116700.0  628613745.4  53865.8
> > > > >     Double-Precision Whetstone                       55.0     182422.8  33167.8
> > > > >     Execl Throughput                                 43.0      13116.6   3050.4
> > > > >     File Copy 1024 bufsize 2000 maxblocks          3960.0    7762306.2  19601.8
> > > > >     File Copy 256 bufsize 500 maxblocks            1655.0    3417556.8  20649.9
> > > > >     File Copy 4096 bufsize 8000 maxblocks          5800.0    7427995.7  12806.9
> > > > >     Pipe Throughput                               12440.0   23058600.5  18535.9
> > > > >     Pipe-based Context Switching                   4000.0    2835617.7   7089.0
> > > > >     Process Creation                                126.0      12537.3    995.0
> > > > >     Shell Scripts (1 concurrent)                     42.4      57057.4  13456.9
> > > > >     Shell Scripts (8 concurrent)                      6.0       7367.1  12278.5
> > > > >     System Call Overhead                          15000.0   33308301.3  22205.5
> > > > >                                                                        ========
> > > > >     System Benchmarks Index Score                                       12426.1
> > > > >
> > > > >   ticket_spinlock:
> > > > >     System Benchmarks Index Values               BASELINE       RESULT    INDEX
> > > > >     Dhrystone 2 using register variables         116700.0  626541701.9  53688.2
> > > > >     Double-Precision Whetstone                       55.0     181921.0  33076.5
> > > > >     Execl Throughput                                 43.0      12625.1   2936.1
> > > > >     File Copy 1024 bufsize 2000 maxblocks          3960.0    6553792.9  16550.0
> > > > >     File Copy 256 bufsize 500 maxblocks            1655.0    3189231.6  19270.3
> > > > >     File Copy 4096 bufsize 8000 maxblocks          5800.0    7221277.0  12450.5
> > > > >     Pipe Throughput                               12440.0   20594018.7  16554.7
> > > > >     Pipe-based Context Switching                   4000.0    2571117.7   6427.8
> > > > >     Process Creation                                126.0      10798.4    857.0
> > > > >     Shell Scripts (1 concurrent)                     42.4      57227.5  13497.1
> > > > >     Shell Scripts (8 concurrent)                      6.0       7329.2  12215.3
> > > > >     System Call Overhead                          15000.0   30766778.4  20511.2
> > > > >                                                                        ========
> > > > >     System Benchmarks Index Score                                       11670.7
> > > > >
> > > > > The qspinlock has a significant improvement on SOPHGO SG2042 64
> > > > > cores platform than the ticket_lock.
> > > > >
> > > > > Signed-off-by: Guo Ren <guoren@kernel.org>
> > > > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > > > > ---
> > > > >  arch/riscv/Kconfig                | 16 ++++++++++++++++
> > > > >  arch/riscv/include/asm/Kbuild     |  3 ++-
> > > > >  arch/riscv/include/asm/spinlock.h | 17 +++++++++++++++++
> > > > >  3 files changed, 35 insertions(+), 1 deletion(-)
> > > > >  create mode 100644 arch/riscv/include/asm/spinlock.h
> > > > >
> > > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > > > index 2c346fe169c1..7f39bfc75744 100644
> > > > > --- a/arch/riscv/Kconfig
> > > > > +++ b/arch/riscv/Kconfig
> > > > > @@ -471,6 +471,22 @@ config NODES_SHIFT
> > > > >         Specify the maximum number of NUMA Nodes available on the target
> > > > >         system.  Increases memory reserved to accommodate various tables.
> > > > >
> > > > > +choice
> > > > > +     prompt "RISC-V spinlock type"
> > > > > +     default RISCV_TICKET_SPINLOCKS
> > > > > +
> > > > > +config RISCV_TICKET_SPINLOCKS
> > > > > +     bool "Using ticket spinlock"
> > > > > +
> > > > > +config RISCV_QUEUED_SPINLOCKS
> > > > > +     bool "Using queued spinlock"
> > > > > +     depends on SMP && MMU
> > > > > +     select ARCH_USE_QUEUED_SPINLOCKS
> > > > > +     help
> > > > > +       Make sure your micro arch LL/SC has a strong forward progress guarantee.
> > > > > +       Otherwise, stay at ticket-lock.
> > > > > +endchoice
> > > > > +
> > > > >  config RISCV_ALTERNATIVE
> > > > >       bool
> > > > >       depends on !XIP_KERNEL
> > > > > diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
> > > > > index 504f8b7e72d4..a0dc85e4a754 100644
> > > > > --- a/arch/riscv/include/asm/Kbuild
> > > > > +++ b/arch/riscv/include/asm/Kbuild
> > > > > @@ -2,10 +2,11 @@
> > > > >  generic-y += early_ioremap.h
> > > > >  generic-y += flat.h
> > > > >  generic-y += kvm_para.h
> > > > > +generic-y += mcs_spinlock.h
> > > > >  generic-y += parport.h
> > > > > -generic-y += spinlock.h
> > > >
> > > > IIUC here you take the asm-generic/spinlock.h (which defines arch_spin_*())
> > > > and include the asm-generic headers of mcs_spinlock and qspinlock.
> > > >
> > > > In this case, the qspinlock.h will provide the arch_spin_*() interfaces,
> > > > which seems the oposite of the above description (ticket spinlocks being
> > > > the standard).
> > > >
> > > > Shouldn't ticket-spinlock.h also get included here?
> > > > (Also, I am probably missing something, as I dont' see the use of
> > > > mcs_spinlock here.)
> > > No, because asm-generic/spinlock.h:
> > > ...
> > > #include <asm-generic/ticket_spinlock.h>
> > > ...
> > >
> >
> > But aren't you removing asm-generic/spinlock.h below ?
> > -generic-y += spinlock.h
> Yes, current is:
> 
> arch/riscv/include/asm/spinlock.h -> include/asm-generic/spinlock.h ->
> include/asm-generic/ticket_spinlock.h

I did a little reading on how generic-y works (which I was unaware):

"If an architecture uses a verbatim copy of a header from 
include/asm-generic then this is listed in the file 
arch/$(SRCARCH)/include/asm/Kbuild [...] During the prepare phase of the 
build a wrapper include file is generated in the directory [...]"

Oh, so you are removing the asm-generic/spinlock.h because it's link 
was replaced by a new asm/spinlock.h. 

You add qspinlock.h to generic-y because it's new in riscv, and add 
mcs_spinlock.h because it's needed by qspinlock.h. 

Ok, it makes sense now.

Sorry about this noise.
I was unaware of how generic-y worked, and (wrongly) 
assumed it was about including headers automatically in the build.


> 
> +#ifdef CONFIG_QUEUED_SPINLOCKS
> +#include <asm/qspinlock.h>
> +#include <asm/qrwlock.h>
> +#else
> +#include <asm-generic/spinlock.h>
> +#endif
> 
> So, you want me:
> +#ifdef CONFIG_QUEUED_SPINLOCKS
> +#include <asm/qspinlock.h>
> +#else
> +#include <asm-generic/ticket_spinlock.h>
> +#endif
> 
> +#include <asm/qrwlock.h>
> 
> Right?

No, I didn't mean that.
I was just worried about the arch_spin_*() interfaces, but they should be 
fine.

BTW, according to kernel doc on generic-y, shouldn't be a better idea to 
add 'ticket_spinlock.h' to generic-y, and include above as 
asm/ticket_spinlock.h? 

Or is generic-y reserved only for stuff which is indirectly included by 
other headers? 

Thanks!
Leo

> 
> >
> > > >
> > > > >  generic-y += spinlock_types.h
> > > > >  generic-y += qrwlock.h
> > > > >  generic-y += qrwlock_types.h
> > > > > +generic-y += qspinlock.h
> > > > >  generic-y += user.h
> > > > >  generic-y += vmlinux.lds.h
> > > > > diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
> > > > > new file mode 100644
> > > > > index 000000000000..c644a92d4548
> > > > > --- /dev/null
> > > > > +++ b/arch/riscv/include/asm/spinlock.h
> > > > > @@ -0,0 +1,17 @@
> > > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > > +
> > > > > +#ifndef __ASM_RISCV_SPINLOCK_H
> > > > > +#define __ASM_RISCV_SPINLOCK_H
> > > > > +
> > > > > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > > > > +#define _Q_PENDING_LOOPS     (1 << 9)
> > > > > +#endif
> > > >
> > > > Any reason the above define couldn't be merged on the ifdef below?
> > > Easy for the next patch to modify. See Waiman's comment:
> > >
> > > https://lore.kernel.org/linux-riscv/4cc7113a-0e4e-763a-cba2-7963bcd26c7a@redhat.com/
> > >
> > > > diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
> > > > index c644a92d4548..9eb3ad31e564 100644
> > > > --- a/arch/riscv/include/asm/spinlock.h
> > > > +++ b/arch/riscv/include/asm/spinlock.h
> > > > @@ -7,11 +7,94 @@
> > > >   #define _Q_PENDING_LOOPS (1 << 9)
> > > >   #endif
> > > >
> > >
> > > I see why you separated the _Q_PENDING_LOOPS out.
> > >
> >
> > I see, should be fine then.
> >
> > Thanks!
> > Leo
> >
> > >
> > > >
> > > > > +
> > > > > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > > > > +#include <asm/qspinlock.h>
> > > > > +#include <asm/qrwlock.h>
> > > > > +#else
> > > > > +#include <asm-generic/spinlock.h>
> > > > > +#endif
> > > > > +
> > > > > +#endif /* __ASM_RISCV_SPINLOCK_H */
> > > > > --
> > > > > 2.36.1
> > > > >
> > > >
> > > > Thanks!
> > > > Leo
> > > >
> > >
> > >
> > > --
> > > Best Regards
> > >  Guo Ren
> > >
> >
> 
> 
> -- 
> Best Regards
>  Guo Ren
>
Guo Ren Sept. 17, 2023, 3:02 p.m. UTC | #6
On Fri, Sep 15, 2023 at 5:08 PM Leonardo Bras <leobras@redhat.com> wrote:
>
> On Fri, Sep 15, 2023 at 10:10:25AM +0800, Guo Ren wrote:
> > On Thu, Sep 14, 2023 at 5:43 PM Leonardo Bras <leobras@redhat.com> wrote:
> > >
> > > On Thu, Sep 14, 2023 at 12:46:56PM +0800, Guo Ren wrote:
> > > > On Thu, Sep 14, 2023 at 4:29 AM Leonardo Bras <leobras@redhat.com> wrote:
> > > > >
> > > > > On Sun, Sep 10, 2023 at 04:28:59AM -0400, guoren@kernel.org wrote:
> > > > > > From: Guo Ren <guoren@linux.alibaba.com>
> > > > > >
> > > > > > The requirements of qspinlock have been documented by commit:
> > > > > > a8ad07e5240c ("asm-generic: qspinlock: Indicate the use of mixed-size
> > > > > > atomics").
> > > > > >
> > > > > > Although RISC-V ISA gives out a weaker forward guarantee LR/SC, which
> > > > > > doesn't satisfy the requirements of qspinlock above, it won't prevent
> > > > > > some riscv vendors from implementing a strong fwd guarantee LR/SC in
> > > > > > microarchitecture to match xchg_tail requirement. T-HEAD C9xx processor
> > > > > > is the one.
> > > > > >
> > > > > > We've tested the patch on SOPHGO sg2042 & th1520 and passed the stress
> > > > > > test on Fedora & Ubuntu & OpenEuler ... Here is the performance
> > > > > > comparison between qspinlock and ticket_lock on sg2042 (64 cores):
> > > > > >
> > > > > > sysbench test=threads threads=32 yields=100 lock=8 (+13.8%):
> > > > > >   queued_spinlock 0.5109/0.00
> > > > > >   ticket_spinlock 0.5814/0.00
> > > > > >
> > > > > > perf futex/hash (+6.7%):
> > > > > >   queued_spinlock 1444393 operations/sec (+- 0.09%)
> > > > > >   ticket_spinlock 1353215 operations/sec (+- 0.15%)
> > > > > >
> > > > > > perf futex/wake-parallel (+8.6%):
> > > > > >   queued_spinlock (waking 1/64 threads) in 0.0253 ms (+-2.90%)
> > > > > >   ticket_spinlock (waking 1/64 threads) in 0.0275 ms (+-3.12%)
> > > > > >
> > > > > > perf futex/requeue (+4.2%):
> > > > > >   queued_spinlock Requeued 64 of 64 threads in 0.0785 ms (+-0.55%)
> > > > > >   ticket_spinlock Requeued 64 of 64 threads in 0.0818 ms (+-4.12%)
> > > > > >
> > > > > > System Benchmarks (+6.4%)
> > > > > >   queued_spinlock:
> > > > > >     System Benchmarks Index Values               BASELINE       RESULT    INDEX
> > > > > >     Dhrystone 2 using register variables         116700.0  628613745.4  53865.8
> > > > > >     Double-Precision Whetstone                       55.0     182422.8  33167.8
> > > > > >     Execl Throughput                                 43.0      13116.6   3050.4
> > > > > >     File Copy 1024 bufsize 2000 maxblocks          3960.0    7762306.2  19601.8
> > > > > >     File Copy 256 bufsize 500 maxblocks            1655.0    3417556.8  20649.9
> > > > > >     File Copy 4096 bufsize 8000 maxblocks          5800.0    7427995.7  12806.9
> > > > > >     Pipe Throughput                               12440.0   23058600.5  18535.9
> > > > > >     Pipe-based Context Switching                   4000.0    2835617.7   7089.0
> > > > > >     Process Creation                                126.0      12537.3    995.0
> > > > > >     Shell Scripts (1 concurrent)                     42.4      57057.4  13456.9
> > > > > >     Shell Scripts (8 concurrent)                      6.0       7367.1  12278.5
> > > > > >     System Call Overhead                          15000.0   33308301.3  22205.5
> > > > > >                                                                        ========
> > > > > >     System Benchmarks Index Score                                       12426.1
> > > > > >
> > > > > >   ticket_spinlock:
> > > > > >     System Benchmarks Index Values               BASELINE       RESULT    INDEX
> > > > > >     Dhrystone 2 using register variables         116700.0  626541701.9  53688.2
> > > > > >     Double-Precision Whetstone                       55.0     181921.0  33076.5
> > > > > >     Execl Throughput                                 43.0      12625.1   2936.1
> > > > > >     File Copy 1024 bufsize 2000 maxblocks          3960.0    6553792.9  16550.0
> > > > > >     File Copy 256 bufsize 500 maxblocks            1655.0    3189231.6  19270.3
> > > > > >     File Copy 4096 bufsize 8000 maxblocks          5800.0    7221277.0  12450.5
> > > > > >     Pipe Throughput                               12440.0   20594018.7  16554.7
> > > > > >     Pipe-based Context Switching                   4000.0    2571117.7   6427.8
> > > > > >     Process Creation                                126.0      10798.4    857.0
> > > > > >     Shell Scripts (1 concurrent)                     42.4      57227.5  13497.1
> > > > > >     Shell Scripts (8 concurrent)                      6.0       7329.2  12215.3
> > > > > >     System Call Overhead                          15000.0   30766778.4  20511.2
> > > > > >                                                                        ========
> > > > > >     System Benchmarks Index Score                                       11670.7
> > > > > >
> > > > > > The qspinlock has a significant improvement on SOPHGO SG2042 64
> > > > > > cores platform than the ticket_lock.
> > > > > >
> > > > > > Signed-off-by: Guo Ren <guoren@kernel.org>
> > > > > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > > > > > ---
> > > > > >  arch/riscv/Kconfig                | 16 ++++++++++++++++
> > > > > >  arch/riscv/include/asm/Kbuild     |  3 ++-
> > > > > >  arch/riscv/include/asm/spinlock.h | 17 +++++++++++++++++
> > > > > >  3 files changed, 35 insertions(+), 1 deletion(-)
> > > > > >  create mode 100644 arch/riscv/include/asm/spinlock.h
> > > > > >
> > > > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > > > > index 2c346fe169c1..7f39bfc75744 100644
> > > > > > --- a/arch/riscv/Kconfig
> > > > > > +++ b/arch/riscv/Kconfig
> > > > > > @@ -471,6 +471,22 @@ config NODES_SHIFT
> > > > > >         Specify the maximum number of NUMA Nodes available on the target
> > > > > >         system.  Increases memory reserved to accommodate various tables.
> > > > > >
> > > > > > +choice
> > > > > > +     prompt "RISC-V spinlock type"
> > > > > > +     default RISCV_TICKET_SPINLOCKS
> > > > > > +
> > > > > > +config RISCV_TICKET_SPINLOCKS
> > > > > > +     bool "Using ticket spinlock"
> > > > > > +
> > > > > > +config RISCV_QUEUED_SPINLOCKS
> > > > > > +     bool "Using queued spinlock"
> > > > > > +     depends on SMP && MMU
> > > > > > +     select ARCH_USE_QUEUED_SPINLOCKS
> > > > > > +     help
> > > > > > +       Make sure your micro arch LL/SC has a strong forward progress guarantee.
> > > > > > +       Otherwise, stay at ticket-lock.
> > > > > > +endchoice
> > > > > > +
> > > > > >  config RISCV_ALTERNATIVE
> > > > > >       bool
> > > > > >       depends on !XIP_KERNEL
> > > > > > diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
> > > > > > index 504f8b7e72d4..a0dc85e4a754 100644
> > > > > > --- a/arch/riscv/include/asm/Kbuild
> > > > > > +++ b/arch/riscv/include/asm/Kbuild
> > > > > > @@ -2,10 +2,11 @@
> > > > > >  generic-y += early_ioremap.h
> > > > > >  generic-y += flat.h
> > > > > >  generic-y += kvm_para.h
> > > > > > +generic-y += mcs_spinlock.h
> > > > > >  generic-y += parport.h
> > > > > > -generic-y += spinlock.h
> > > > >
> > > > > IIUC here you take the asm-generic/spinlock.h (which defines arch_spin_*())
> > > > > and include the asm-generic headers of mcs_spinlock and qspinlock.
> > > > >
> > > > > In this case, the qspinlock.h will provide the arch_spin_*() interfaces,
> > > > > which seems the oposite of the above description (ticket spinlocks being
> > > > > the standard).
> > > > >
> > > > > Shouldn't ticket-spinlock.h also get included here?
> > > > > (Also, I am probably missing something, as I dont' see the use of
> > > > > mcs_spinlock here.)
> > > > No, because asm-generic/spinlock.h:
> > > > ...
> > > > #include <asm-generic/ticket_spinlock.h>
> > > > ...
> > > >
> > >
> > > But aren't you removing asm-generic/spinlock.h below ?
> > > -generic-y += spinlock.h
> > Yes, current is:
> >
> > arch/riscv/include/asm/spinlock.h -> include/asm-generic/spinlock.h ->
> > include/asm-generic/ticket_spinlock.h
>
> I did a little reading on how generic-y works (which I was unaware):
>
> "If an architecture uses a verbatim copy of a header from
> include/asm-generic then this is listed in the file
> arch/$(SRCARCH)/include/asm/Kbuild [...] During the prepare phase of the
> build a wrapper include file is generated in the directory [...]"
>
> Oh, so you are removing the asm-generic/spinlock.h because it's link
> was replaced by a new asm/spinlock.h.
>
> You add qspinlock.h to generic-y because it's new in riscv, and add
> mcs_spinlock.h because it's needed by qspinlock.h.
>
> Ok, it makes sense now.
>
> Sorry about this noise.
> I was unaware of how generic-y worked, and (wrongly)
> assumed it was about including headers automatically in the build.
>
>
> >
> > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > +#include <asm/qspinlock.h>
> > +#include <asm/qrwlock.h>
> > +#else
> > +#include <asm-generic/spinlock.h>
> > +#endif
> >
> > So, you want me:
> > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > +#include <asm/qspinlock.h>
> > +#else
> > +#include <asm-generic/ticket_spinlock.h>
> > +#endif
> >
> > +#include <asm/qrwlock.h>
> >
> > Right?
>
> No, I didn't mean that.
> I was just worried about the arch_spin_*() interfaces, but they should be
> fine.
>
> BTW, according to kernel doc on generic-y, shouldn't be a better idea to
> add 'ticket_spinlock.h' to generic-y, and include above as
> asm/ticket_spinlock.h?
>
> Or is generic-y reserved only for stuff which is indirectly included by
> other headers?
It's okay to add generic-y for ticket_spinlock.h, and I'm okay with
the following:

+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#else
+#include <asm/ticket_spinlock.h>
+#endif

+#include <asm/qrwlock.h>

>
> Thanks!
> Leo
>
> >
> > >
> > > > >
> > > > > >  generic-y += spinlock_types.h
> > > > > >  generic-y += qrwlock.h
> > > > > >  generic-y += qrwlock_types.h
> > > > > > +generic-y += qspinlock.h
> > > > > >  generic-y += user.h
> > > > > >  generic-y += vmlinux.lds.h
> > > > > > diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
> > > > > > new file mode 100644
> > > > > > index 000000000000..c644a92d4548
> > > > > > --- /dev/null
> > > > > > +++ b/arch/riscv/include/asm/spinlock.h
> > > > > > @@ -0,0 +1,17 @@
> > > > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > > > +
> > > > > > +#ifndef __ASM_RISCV_SPINLOCK_H
> > > > > > +#define __ASM_RISCV_SPINLOCK_H
> > > > > > +
> > > > > > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > > > > > +#define _Q_PENDING_LOOPS     (1 << 9)
> > > > > > +#endif
> > > > >
> > > > > Any reason the above define couldn't be merged on the ifdef below?
> > > > Easy for the next patch to modify. See Waiman's comment:
> > > >
> > > > https://lore.kernel.org/linux-riscv/4cc7113a-0e4e-763a-cba2-7963bcd26c7a@redhat.com/
> > > >
> > > > > diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
> > > > > index c644a92d4548..9eb3ad31e564 100644
> > > > > --- a/arch/riscv/include/asm/spinlock.h
> > > > > +++ b/arch/riscv/include/asm/spinlock.h
> > > > > @@ -7,11 +7,94 @@
> > > > >   #define _Q_PENDING_LOOPS (1 << 9)
> > > > >   #endif
> > > > >
> > > >
> > > > I see why you separated the _Q_PENDING_LOOPS out.
> > > >
> > >
> > > I see, should be fine then.
> > >
> > > Thanks!
> > > Leo
> > >
> > > >
> > > > >
> > > > > > +
> > > > > > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > > > > > +#include <asm/qspinlock.h>
> > > > > > +#include <asm/qrwlock.h>
> > > > > > +#else
> > > > > > +#include <asm-generic/spinlock.h>
> > > > > > +#endif
> > > > > > +
> > > > > > +#endif /* __ASM_RISCV_SPINLOCK_H */
> > > > > > --
> > > > > > 2.36.1
> > > > > >
> > > > >
> > > > > Thanks!
> > > > > Leo
> > > > >
> > > >
> > > >
> > > > --
> > > > Best Regards
> > > >  Guo Ren
> > > >
> > >
> >
> >
> > --
> > Best Regards
> >  Guo Ren
> >
>
Leonardo Bras Sept. 19, 2023, 5:20 a.m. UTC | #7
On Sun, Sep 17, 2023 at 11:02:47PM +0800, Guo Ren wrote:
> On Fri, Sep 15, 2023 at 5:08 PM Leonardo Bras <leobras@redhat.com> wrote:
> >
> > On Fri, Sep 15, 2023 at 10:10:25AM +0800, Guo Ren wrote:
> > > On Thu, Sep 14, 2023 at 5:43 PM Leonardo Bras <leobras@redhat.com> wrote:
> > > >
> > > > On Thu, Sep 14, 2023 at 12:46:56PM +0800, Guo Ren wrote:
> > > > > On Thu, Sep 14, 2023 at 4:29 AM Leonardo Bras <leobras@redhat.com> wrote:
> > > > > >
> > > > > > On Sun, Sep 10, 2023 at 04:28:59AM -0400, guoren@kernel.org wrote:
> > > > > > > From: Guo Ren <guoren@linux.alibaba.com>
> > > > > > >
> > > > > > > The requirements of qspinlock have been documented by commit:
> > > > > > > a8ad07e5240c ("asm-generic: qspinlock: Indicate the use of mixed-size
> > > > > > > atomics").
> > > > > > >
> > > > > > > Although RISC-V ISA gives out a weaker forward guarantee LR/SC, which
> > > > > > > doesn't satisfy the requirements of qspinlock above, it won't prevent
> > > > > > > some riscv vendors from implementing a strong fwd guarantee LR/SC in
> > > > > > > microarchitecture to match xchg_tail requirement. T-HEAD C9xx processor
> > > > > > > is the one.
> > > > > > >
> > > > > > > We've tested the patch on SOPHGO sg2042 & th1520 and passed the stress
> > > > > > > test on Fedora & Ubuntu & OpenEuler ... Here is the performance
> > > > > > > comparison between qspinlock and ticket_lock on sg2042 (64 cores):
> > > > > > >
> > > > > > > sysbench test=threads threads=32 yields=100 lock=8 (+13.8%):
> > > > > > >   queued_spinlock 0.5109/0.00
> > > > > > >   ticket_spinlock 0.5814/0.00
> > > > > > >
> > > > > > > perf futex/hash (+6.7%):
> > > > > > >   queued_spinlock 1444393 operations/sec (+- 0.09%)
> > > > > > >   ticket_spinlock 1353215 operations/sec (+- 0.15%)
> > > > > > >
> > > > > > > perf futex/wake-parallel (+8.6%):
> > > > > > >   queued_spinlock (waking 1/64 threads) in 0.0253 ms (+-2.90%)
> > > > > > >   ticket_spinlock (waking 1/64 threads) in 0.0275 ms (+-3.12%)
> > > > > > >
> > > > > > > perf futex/requeue (+4.2%):
> > > > > > >   queued_spinlock Requeued 64 of 64 threads in 0.0785 ms (+-0.55%)
> > > > > > >   ticket_spinlock Requeued 64 of 64 threads in 0.0818 ms (+-4.12%)
> > > > > > >
> > > > > > > System Benchmarks (+6.4%)
> > > > > > >   queued_spinlock:
> > > > > > >     System Benchmarks Index Values               BASELINE       RESULT    INDEX
> > > > > > >     Dhrystone 2 using register variables         116700.0  628613745.4  53865.8
> > > > > > >     Double-Precision Whetstone                       55.0     182422.8  33167.8
> > > > > > >     Execl Throughput                                 43.0      13116.6   3050.4
> > > > > > >     File Copy 1024 bufsize 2000 maxblocks          3960.0    7762306.2  19601.8
> > > > > > >     File Copy 256 bufsize 500 maxblocks            1655.0    3417556.8  20649.9
> > > > > > >     File Copy 4096 bufsize 8000 maxblocks          5800.0    7427995.7  12806.9
> > > > > > >     Pipe Throughput                               12440.0   23058600.5  18535.9
> > > > > > >     Pipe-based Context Switching                   4000.0    2835617.7   7089.0
> > > > > > >     Process Creation                                126.0      12537.3    995.0
> > > > > > >     Shell Scripts (1 concurrent)                     42.4      57057.4  13456.9
> > > > > > >     Shell Scripts (8 concurrent)                      6.0       7367.1  12278.5
> > > > > > >     System Call Overhead                          15000.0   33308301.3  22205.5
> > > > > > >                                                                        ========
> > > > > > >     System Benchmarks Index Score                                       12426.1
> > > > > > >
> > > > > > >   ticket_spinlock:
> > > > > > >     System Benchmarks Index Values               BASELINE       RESULT    INDEX
> > > > > > >     Dhrystone 2 using register variables         116700.0  626541701.9  53688.2
> > > > > > >     Double-Precision Whetstone                       55.0     181921.0  33076.5
> > > > > > >     Execl Throughput                                 43.0      12625.1   2936.1
> > > > > > >     File Copy 1024 bufsize 2000 maxblocks          3960.0    6553792.9  16550.0
> > > > > > >     File Copy 256 bufsize 500 maxblocks            1655.0    3189231.6  19270.3
> > > > > > >     File Copy 4096 bufsize 8000 maxblocks          5800.0    7221277.0  12450.5
> > > > > > >     Pipe Throughput                               12440.0   20594018.7  16554.7
> > > > > > >     Pipe-based Context Switching                   4000.0    2571117.7   6427.8
> > > > > > >     Process Creation                                126.0      10798.4    857.0
> > > > > > >     Shell Scripts (1 concurrent)                     42.4      57227.5  13497.1
> > > > > > >     Shell Scripts (8 concurrent)                      6.0       7329.2  12215.3
> > > > > > >     System Call Overhead                          15000.0   30766778.4  20511.2
> > > > > > >                                                                        ========
> > > > > > >     System Benchmarks Index Score                                       11670.7
> > > > > > >
> > > > > > > The qspinlock has a significant improvement on SOPHGO SG2042 64
> > > > > > > cores platform than the ticket_lock.
> > > > > > >
> > > > > > > Signed-off-by: Guo Ren <guoren@kernel.org>
> > > > > > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > > > > > > ---
> > > > > > >  arch/riscv/Kconfig                | 16 ++++++++++++++++
> > > > > > >  arch/riscv/include/asm/Kbuild     |  3 ++-
> > > > > > >  arch/riscv/include/asm/spinlock.h | 17 +++++++++++++++++
> > > > > > >  3 files changed, 35 insertions(+), 1 deletion(-)
> > > > > > >  create mode 100644 arch/riscv/include/asm/spinlock.h
> > > > > > >
> > > > > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > > > > > index 2c346fe169c1..7f39bfc75744 100644
> > > > > > > --- a/arch/riscv/Kconfig
> > > > > > > +++ b/arch/riscv/Kconfig
> > > > > > > @@ -471,6 +471,22 @@ config NODES_SHIFT
> > > > > > >         Specify the maximum number of NUMA Nodes available on the target
> > > > > > >         system.  Increases memory reserved to accommodate various tables.
> > > > > > >
> > > > > > > +choice
> > > > > > > +     prompt "RISC-V spinlock type"
> > > > > > > +     default RISCV_TICKET_SPINLOCKS
> > > > > > > +
> > > > > > > +config RISCV_TICKET_SPINLOCKS
> > > > > > > +     bool "Using ticket spinlock"
> > > > > > > +
> > > > > > > +config RISCV_QUEUED_SPINLOCKS
> > > > > > > +     bool "Using queued spinlock"
> > > > > > > +     depends on SMP && MMU
> > > > > > > +     select ARCH_USE_QUEUED_SPINLOCKS
> > > > > > > +     help
> > > > > > > +       Make sure your micro arch LL/SC has a strong forward progress guarantee.
> > > > > > > +       Otherwise, stay at ticket-lock.
> > > > > > > +endchoice
> > > > > > > +
> > > > > > >  config RISCV_ALTERNATIVE
> > > > > > >       bool
> > > > > > >       depends on !XIP_KERNEL
> > > > > > > diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
> > > > > > > index 504f8b7e72d4..a0dc85e4a754 100644
> > > > > > > --- a/arch/riscv/include/asm/Kbuild
> > > > > > > +++ b/arch/riscv/include/asm/Kbuild
> > > > > > > @@ -2,10 +2,11 @@
> > > > > > >  generic-y += early_ioremap.h
> > > > > > >  generic-y += flat.h
> > > > > > >  generic-y += kvm_para.h
> > > > > > > +generic-y += mcs_spinlock.h
> > > > > > >  generic-y += parport.h
> > > > > > > -generic-y += spinlock.h
> > > > > >
> > > > > > IIUC here you take the asm-generic/spinlock.h (which defines arch_spin_*())
> > > > > > and include the asm-generic headers of mcs_spinlock and qspinlock.
> > > > > >
> > > > > > In this case, the qspinlock.h will provide the arch_spin_*() interfaces,
> > > > > > which seems the oposite of the above description (ticket spinlocks being
> > > > > > the standard).
> > > > > >
> > > > > > Shouldn't ticket-spinlock.h also get included here?
> > > > > > (Also, I am probably missing something, as I dont' see the use of
> > > > > > mcs_spinlock here.)
> > > > > No, because asm-generic/spinlock.h:
> > > > > ...
> > > > > #include <asm-generic/ticket_spinlock.h>
> > > > > ...
> > > > >
> > > >
> > > > But aren't you removing asm-generic/spinlock.h below ?
> > > > -generic-y += spinlock.h
> > > Yes, current is:
> > >
> > > arch/riscv/include/asm/spinlock.h -> include/asm-generic/spinlock.h ->
> > > include/asm-generic/ticket_spinlock.h
> >
> > I did a little reading on how generic-y works (which I was unaware):
> >
> > "If an architecture uses a verbatim copy of a header from
> > include/asm-generic then this is listed in the file
> > arch/$(SRCARCH)/include/asm/Kbuild [...] During the prepare phase of the
> > build a wrapper include file is generated in the directory [...]"
> >
> > Oh, so you are removing the asm-generic/spinlock.h because it's link
> > was replaced by a new asm/spinlock.h.
> >
> > You add qspinlock.h to generic-y because it's new in riscv, and add
> > mcs_spinlock.h because it's needed by qspinlock.h.
> >
> > Ok, it makes sense now.
> >
> > Sorry about this noise.
> > I was unaware of how generic-y worked, and (wrongly)
> > assumed it was about including headers automatically in the build.
> >
> >
> > >
> > > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > > +#include <asm/qspinlock.h>
> > > +#include <asm/qrwlock.h>
> > > +#else
> > > +#include <asm-generic/spinlock.h>
> > > +#endif
> > >
> > > So, you want me:
> > > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > > +#include <asm/qspinlock.h>
> > > +#else
> > > +#include <asm-generic/ticket_spinlock.h>
> > > +#endif
> > >
> > > +#include <asm/qrwlock.h>
> > >
> > > Right?
> >
> > No, I didn't mean that.
> > I was just worried about the arch_spin_*() interfaces, but they should be
> > fine.
> >
> > BTW, according to kernel doc on generic-y, shouldn't be a better idea to
> > add 'ticket_spinlock.h' to generic-y, and include above as
> > asm/ticket_spinlock.h?
> >
> > Or is generic-y reserved only for stuff which is indirectly included by
> > other headers?
> It's okay to add generic-y for ticket_spinlock.h, and I'm okay with
> the following:
> 
> +#ifdef CONFIG_QUEUED_SPINLOCKS
> +#include <asm/qspinlock.h>
> +#else
> +#include <asm/ticket_spinlock.h>
> +#endif
> 
> +#include <asm/qrwlock.h>

It does look more intuitive, so I am glad it works for you :)

FWIW:
Reviewed-by: Leonardo Bras <leobras@redhat.com>

Thanks!
Leo

> 
> >
> > Thanks!
> > Leo
> >
> > >
> > > >
> > > > > >
> > > > > > >  generic-y += spinlock_types.h
> > > > > > >  generic-y += qrwlock.h
> > > > > > >  generic-y += qrwlock_types.h
> > > > > > > +generic-y += qspinlock.h
> > > > > > >  generic-y += user.h
> > > > > > >  generic-y += vmlinux.lds.h
> > > > > > > diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
> > > > > > > new file mode 100644
> > > > > > > index 000000000000..c644a92d4548
> > > > > > > --- /dev/null
> > > > > > > +++ b/arch/riscv/include/asm/spinlock.h
> > > > > > > @@ -0,0 +1,17 @@
> > > > > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > > > > +
> > > > > > > +#ifndef __ASM_RISCV_SPINLOCK_H
> > > > > > > +#define __ASM_RISCV_SPINLOCK_H
> > > > > > > +
> > > > > > > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > > > > > > +#define _Q_PENDING_LOOPS     (1 << 9)
> > > > > > > +#endif
> > > > > >
> > > > > > Any reason the above define couldn't be merged on the ifdef below?
> > > > > Easy for the next patch to modify. See Waiman's comment:
> > > > >
> > > > > https://lore.kernel.org/linux-riscv/4cc7113a-0e4e-763a-cba2-7963bcd26c7a@redhat.com/
> > > > >
> > > > > > diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
> > > > > > index c644a92d4548..9eb3ad31e564 100644
> > > > > > --- a/arch/riscv/include/asm/spinlock.h
> > > > > > +++ b/arch/riscv/include/asm/spinlock.h
> > > > > > @@ -7,11 +7,94 @@
> > > > > >   #define _Q_PENDING_LOOPS (1 << 9)
> > > > > >   #endif
> > > > > >
> > > > >
> > > > > I see why you separated the _Q_PENDING_LOOPS out.
> > > > >
> > > >
> > > > I see, should be fine then.
> > > >
> > > > Thanks!
> > > > Leo
> > > >
> > > > >
> > > > > >
> > > > > > > +
> > > > > > > +#ifdef CONFIG_QUEUED_SPINLOCKS
> > > > > > > +#include <asm/qspinlock.h>
> > > > > > > +#include <asm/qrwlock.h>
> > > > > > > +#else
> > > > > > > +#include <asm-generic/spinlock.h>
> > > > > > > +#endif
> > > > > > > +
> > > > > > > +#endif /* __ASM_RISCV_SPINLOCK_H */
> > > > > > > --
> > > > > > > 2.36.1
> > > > > > >
> > > > > >
> > > > > > Thanks!
> > > > > > Leo
> > > > > >
> > > > >
> > > > >
> > > > > --
> > > > > Best Regards
> > > > >  Guo Ren
> > > > >
> > > >
> > >
> > >
> > > --
> > > Best Regards
> > >  Guo Ren
> > >
> >
> 
> 
> -- 
> Best Regards
>  Guo Ren
>
diff mbox series

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 2c346fe169c1..7f39bfc75744 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -471,6 +471,22 @@  config NODES_SHIFT
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system.  Increases memory reserved to accommodate various tables.
 
+choice
+	prompt "RISC-V spinlock type"
+	default RISCV_TICKET_SPINLOCKS
+
+config RISCV_TICKET_SPINLOCKS
+	bool "Using ticket spinlock"
+
+config RISCV_QUEUED_SPINLOCKS
+	bool "Using queued spinlock"
+	depends on SMP && MMU
+	select ARCH_USE_QUEUED_SPINLOCKS
+	help
+	  Make sure your micro arch LL/SC has a strong forward progress guarantee.
+	  Otherwise, stay at ticket-lock.
+endchoice
+
 config RISCV_ALTERNATIVE
 	bool
 	depends on !XIP_KERNEL
diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index 504f8b7e72d4..a0dc85e4a754 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -2,10 +2,11 @@ 
 generic-y += early_ioremap.h
 generic-y += flat.h
 generic-y += kvm_para.h
+generic-y += mcs_spinlock.h
 generic-y += parport.h
-generic-y += spinlock.h
 generic-y += spinlock_types.h
 generic-y += qrwlock.h
 generic-y += qrwlock_types.h
+generic-y += qspinlock.h
 generic-y += user.h
 generic-y += vmlinux.lds.h
diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
new file mode 100644
index 000000000000..c644a92d4548
--- /dev/null
+++ b/arch/riscv/include/asm/spinlock.h
@@ -0,0 +1,17 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_RISCV_SPINLOCK_H
+#define __ASM_RISCV_SPINLOCK_H
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#define _Q_PENDING_LOOPS	(1 << 9)
+#endif
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#include <asm/qrwlock.h>
+#else
+#include <asm-generic/spinlock.h>
+#endif
+
+#endif /* __ASM_RISCV_SPINLOCK_H */