diff mbox series

[v3,2/2] riscv: Optimize bitops with Zbb extension

Message ID 20230926094655.3102758-3-xiao.w.wang@intel.com (mailing list archive)
State Superseded
Headers show
Series riscv: Optimize bitops with Zbb extension | expand

Checks

Context Check Description
conchuod/tree_selection fail Failed to apply to next/pending-fixes, riscv/for-next or riscv/master
conchuod/vmtest-fixes-PR fail merge-conflict

Commit Message

Wang, Xiao W Sept. 26, 2023, 9:46 a.m. UTC
This patch leverages the alternative mechanism to dynamically optimize
bitops (including __ffs, __fls, ffs, fls) with Zbb instructions. When
Zbb ext is not supported by the runtime CPU, legacy implementation is
used. If Zbb is supported, then the optimized variants will be selected
via alternative patching.

The legacy bitops support is taken from the generic C implementation as
fallback.

If the parameter is a build-time constant, we leverage compiler builtin to
calculate the result directly, this approach is inspired by x86 bitops
implementation.

EFI stub runs before the kernel, so alternative mechanism should not be
used there, this patch introduces a macro NO_ALTERNATIVE for this purpose.

Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
---
 arch/riscv/include/asm/bitops.h       | 266 +++++++++++++++++++++++++-
 drivers/firmware/efi/libstub/Makefile |   2 +-
 2 files changed, 264 insertions(+), 4 deletions(-)

Comments

Charlie Jenkins Oct. 27, 2023, 6:24 a.m. UTC | #1
On Tue, Sep 26, 2023 at 05:46:55PM +0800, Xiao Wang wrote:
> This patch leverages the alternative mechanism to dynamically optimize
> bitops (including __ffs, __fls, ffs, fls) with Zbb instructions. When
> Zbb ext is not supported by the runtime CPU, legacy implementation is
> used. If Zbb is supported, then the optimized variants will be selected
> via alternative patching.
> 
> The legacy bitops support is taken from the generic C implementation as
> fallback.
> 
> If the parameter is a build-time constant, we leverage compiler builtin to
> calculate the result directly, this approach is inspired by x86 bitops
> implementation.
> 
> EFI stub runs before the kernel, so alternative mechanism should not be
> used there, this patch introduces a macro NO_ALTERNATIVE for this purpose.
> 
> Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
> ---
>  arch/riscv/include/asm/bitops.h       | 266 +++++++++++++++++++++++++-
>  drivers/firmware/efi/libstub/Makefile |   2 +-
>  2 files changed, 264 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
> index 3540b690944b..c97e774cb647 100644
> --- a/arch/riscv/include/asm/bitops.h
> +++ b/arch/riscv/include/asm/bitops.h
> @@ -15,13 +15,273 @@
>  #include <asm/barrier.h>
>  #include <asm/bitsperlong.h>
>  
> +#if !defined(CONFIG_RISCV_ISA_ZBB) || defined(NO_ALTERNATIVE)
>  #include <asm-generic/bitops/__ffs.h>
> -#include <asm-generic/bitops/ffz.h>
> -#include <asm-generic/bitops/fls.h>
>  #include <asm-generic/bitops/__fls.h>
> +#include <asm-generic/bitops/ffs.h>
> +#include <asm-generic/bitops/fls.h>
> +
> +#else
> +#include <asm/alternative-macros.h>
> +#include <asm/hwcap.h>
> +
> +#if (BITS_PER_LONG == 64)
> +#define CTZW	"ctzw "
> +#define CLZW	"clzw "
> +#elif (BITS_PER_LONG == 32)
> +#define CTZW	"ctz "
> +#define CLZW	"clz "
> +#else
> +#error "Unexpected BITS_PER_LONG"
> +#endif
> +
> +static __always_inline unsigned long variable__ffs(unsigned long word)
> +{
> +	int num;
> +
> +	asm_volatile_goto(
> +		ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1)
> +		: : : : legacy);
> +
> +	asm volatile (
> +		".option push\n"
> +		".option arch,+zbb\n"
> +		"ctz %0, %1\n"
> +		".option pop\n"
> +		: "=r" (word) : "r" (word) :);
> +
> +	return word;

That's so clean!

> +
> +legacy:
> +	num = 0;
> +#if BITS_PER_LONG == 64
> +	if ((word & 0xffffffff) == 0) {
> +		num += 32;
> +		word >>= 32;
> +	}
> +#endif
> +	if ((word & 0xffff) == 0) {
> +		num += 16;
> +		word >>= 16;
> +	}
> +	if ((word & 0xff) == 0) {
> +		num += 8;
> +		word >>= 8;
> +	}
> +	if ((word & 0xf) == 0) {
> +		num += 4;
> +		word >>= 4;
> +	}
> +	if ((word & 0x3) == 0) {
> +		num += 2;
> +		word >>= 2;
> +	}
> +	if ((word & 0x1) == 0)
> +		num += 1;
> +	return num;
> +}
> +
> +/**
> + * __ffs - find first set bit in a long word
> + * @word: The word to search
> + *
> + * Undefined if no set bit exists, so code should check against 0 first.
> + */
> +#define __ffs(word)				\
> +	(__builtin_constant_p(word) ?		\
> +	 (unsigned long)__builtin_ctzl(word) :	\
> +	 variable__ffs(word))
> +
> +static __always_inline unsigned long variable__fls(unsigned long word)
> +{
> +	int num;
> +
> +	asm_volatile_goto(
> +		ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1)
> +		: : : : legacy);
> +
> +	asm volatile (
> +		".option push\n"
> +		".option arch,+zbb\n"
> +		"clz %0, %1\n"
> +		".option pop\n"
> +		: "=r" (word) : "r" (word) :);
> +
> +	return BITS_PER_LONG - 1 - word;
> +
> +legacy:
> +	num = BITS_PER_LONG - 1;
> +#if BITS_PER_LONG == 64
> +	if (!(word & (~0ul << 32))) {
> +		num -= 32;
> +		word <<= 32;
> +	}
> +#endif
> +	if (!(word & (~0ul << (BITS_PER_LONG-16)))) {
> +		num -= 16;
> +		word <<= 16;
> +	}
> +	if (!(word & (~0ul << (BITS_PER_LONG-8)))) {
> +		num -= 8;
> +		word <<= 8;
> +	}
> +	if (!(word & (~0ul << (BITS_PER_LONG-4)))) {
> +		num -= 4;
> +		word <<= 4;
> +	}
> +	if (!(word & (~0ul << (BITS_PER_LONG-2)))) {
> +		num -= 2;
> +		word <<= 2;
> +	}
> +	if (!(word & (~0ul << (BITS_PER_LONG-1))))
> +		num -= 1;
> +	return num;
> +}
> +
> +/**
> + * __fls - find last set bit in a long word
> + * @word: the word to search
> + *
> + * Undefined if no set bit exists, so code should check against 0 first.
> + */
> +#define __fls(word)							\
> +	(__builtin_constant_p(word) ?					\
> +	 (unsigned long)(BITS_PER_LONG - 1 - __builtin_clzl(word)) :	\
> +	 variable__fls(word))
> +
> +static __always_inline int variable_ffs(int x)
> +{
> +	int r;
> +
> +	asm_volatile_goto(
> +		ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1)
> +		: : : : legacy);
> +
> +	asm volatile (
> +		".option push\n"
> +		".option arch,+zbb\n"
> +		"bnez %1, 1f\n"
> +		"li %0, 0\n"
> +		"j 2f\n"
> +		"1:\n"
> +		CTZW "%0, %1\n"
> +		"addi %0, %0, 1\n"
> +		"2:\n"
> +		".option pop\n"
> +		: "=r" (r) : "r" (x) :);
> +
> +	return r;
> +

I generally like to move as much code out of asm as possible. You are
also able to remove one of the branches if you rearrange as follows:

if (!x)
	return 0;

asm volatile (
	".option push\n"
        ".option arch,+zbb\n"
        CTZW "%0, %1\n"
        ".option pop\n"
        : "=r" (r) : "r" (x) :);

return r + 1;

You could then also extract out the "if (!x)" at the beginning of the
legacy section at put it at the top of the function.

> +legacy:
> +	r = 1;
> +	if (!x)
> +		return 0;
> +	if (!(x & 0xffff)) {
> +		x >>= 16;
> +		r += 16;
> +	}
> +	if (!(x & 0xff)) {
> +		x >>= 8;
> +		r += 8;
> +	}
> +	if (!(x & 0xf)) {
> +		x >>= 4;
> +		r += 4;
> +	}
> +	if (!(x & 3)) {
> +		x >>= 2;
> +		r += 2;
> +	}
> +	if (!(x & 1)) {
> +		x >>= 1;
> +		r += 1;
> +	}
> +	return r;
> +}
> +
> +/**
> + * ffs - find first set bit in a word
> + * @x: the word to search
> + *
> + * This is defined the same way as the libc and compiler builtin ffs routines.
> + *
> + * ffs(value) returns 0 if value is 0 or the position of the first set bit if
> + * value is nonzero. The first (least significant) bit is at position 1.
> + */
> +#define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x))
> +
> +static __always_inline int variable_fls(unsigned int x)
> +{
> +	int r;
> +
> +	asm_volatile_goto(
> +		ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1)
> +		: : : : legacy);
> +
> +	asm volatile (
> +		".option push\n"
> +		".option arch,+zbb\n"
> +		"bnez %1, 1f\n"
> +		"li %0, 0\n"
> +		"j 2f\n"
> +		"1:\n"
> +		CLZW "%0, %1\n"
> +		"neg %0, %0\n"
> +		"addi %0, %0, 32\n"
> +		"2:\n"
> +		".option pop\n"
> +		: "=r" (r) : "r" (x) :);
> +
> +	return r;
> +

Same comment as above but with appropriate changes.

> +legacy:
> +	r = 32;
> +	if (!x)
> +		return 0;
> +	if (!(x & 0xffff0000u)) {
> +		x <<= 16;
> +		r -= 16;
> +	}
> +	if (!(x & 0xff000000u)) {
> +		x <<= 8;
> +		r -= 8;
> +	}
> +	if (!(x & 0xf0000000u)) {
> +		x <<= 4;
> +		r -= 4;
> +	}
> +	if (!(x & 0xc0000000u)) {
> +		x <<= 2;
> +		r -= 2;
> +	}
> +	if (!(x & 0x80000000u)) {
> +		x <<= 1;
> +		r -= 1;
> +	}
> +	return r;
> +}
> +
> +/**
> + * fls - find last set bit in a word
> + * @x: the word to search
> + *
> + * This is defined in a similar way as ffs, but returns the position of the most
> + * significant set bit.
> + *
> + * fls(value) returns 0 if value is 0 or the position of the last set bit if
> + * value is nonzero. The last (most significant) bit is at position 32.
> + */
> +#define fls(x)								\
> +	(__builtin_constant_p(x) ?					\
> +	 (int)(((x) != 0) ?						\
> +	  (sizeof(unsigned int) * 8 - __builtin_clz(x)) : 0) :		\
> +	 variable_fls(x))
> +
> +#endif

As a nit, it's nice when large #ifdef blocks have a comment like

/* !defined(CONFIG_RISCV_ISA_ZBB) || defined(NO_ALTERNATIVE) */

Overall great changes, just that small optimization to get rid of the
jump.

- Charlie

> +
> +#include <asm-generic/bitops/ffz.h>
>  #include <asm-generic/bitops/fls64.h>
>  #include <asm-generic/bitops/sched.h>
> -#include <asm-generic/bitops/ffs.h>
>  
>  #include <asm-generic/bitops/hweight.h>
>  
> diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
> index a1157c2a7170..d68cacd4e3af 100644
> --- a/drivers/firmware/efi/libstub/Makefile
> +++ b/drivers/firmware/efi/libstub/Makefile
> @@ -28,7 +28,7 @@ cflags-$(CONFIG_ARM)		+= -DEFI_HAVE_STRLEN -DEFI_HAVE_STRNLEN \
>  				   -DEFI_HAVE_MEMCHR -DEFI_HAVE_STRRCHR \
>  				   -DEFI_HAVE_STRCMP -fno-builtin -fpic \
>  				   $(call cc-option,-mno-single-pic-base)
> -cflags-$(CONFIG_RISCV)		+= -fpic
> +cflags-$(CONFIG_RISCV)		+= -fpic -DNO_ALTERNATIVE
>  cflags-$(CONFIG_LOONGARCH)	+= -fpie
>  
>  cflags-$(CONFIG_EFI_PARAMS_FROM_FDT)	+= -I$(srctree)/scripts/dtc/libfdt
> -- 
> 2.25.1
> 
> 
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Wang, Xiao W Oct. 30, 2023, 2:12 a.m. UTC | #2
> -----Original Message-----
> From: Charlie Jenkins <charlie@rivosinc.com>
> Sent: Friday, October 27, 2023 2:25 PM
> To: Wang, Xiao W <xiao.w.wang@intel.com>
> Cc: paul.walmsley@sifive.com; palmer@dabbelt.com;
> aou@eecs.berkeley.edu; ardb@kernel.org; anup@brainfault.org; Li, Haicheng
> <haicheng.li@intel.com>; ajones@ventanamicro.com; Liu, Yujie
> <yujie.liu@intel.com>; linux-riscv@lists.infradead.org; linux-
> efi@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3 2/2] riscv: Optimize bitops with Zbb extension
> 
> On Tue, Sep 26, 2023 at 05:46:55PM +0800, Xiao Wang wrote:
> > This patch leverages the alternative mechanism to dynamically optimize
> > bitops (including __ffs, __fls, ffs, fls) with Zbb instructions. When
> > Zbb ext is not supported by the runtime CPU, legacy implementation is
> > used. If Zbb is supported, then the optimized variants will be selected
> > via alternative patching.
> >
> > The legacy bitops support is taken from the generic C implementation as
> > fallback.
> >
> > If the parameter is a build-time constant, we leverage compiler builtin to
> > calculate the result directly, this approach is inspired by x86 bitops
> > implementation.
> >
> > EFI stub runs before the kernel, so alternative mechanism should not be
> > used there, this patch introduces a macro NO_ALTERNATIVE for this
> purpose.
> >
> > Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
> > ---
> >  arch/riscv/include/asm/bitops.h       | 266 +++++++++++++++++++++++++-
> >  drivers/firmware/efi/libstub/Makefile |   2 +-
> >  2 files changed, 264 insertions(+), 4 deletions(-)
> >
> > diff --git a/arch/riscv/include/asm/bitops.h
> b/arch/riscv/include/asm/bitops.h
> > index 3540b690944b..c97e774cb647 100644
> > --- a/arch/riscv/include/asm/bitops.h
> > +++ b/arch/riscv/include/asm/bitops.h
> > @@ -15,13 +15,273 @@
> >  #include <asm/barrier.h>
> >  #include <asm/bitsperlong.h>
> >
> > +#if !defined(CONFIG_RISCV_ISA_ZBB) || defined(NO_ALTERNATIVE)
> >  #include <asm-generic/bitops/__ffs.h>
> > -#include <asm-generic/bitops/ffz.h>
> > -#include <asm-generic/bitops/fls.h>
> >  #include <asm-generic/bitops/__fls.h>
> > +#include <asm-generic/bitops/ffs.h>
> > +#include <asm-generic/bitops/fls.h>
> > +
> > +#else
> > +#include <asm/alternative-macros.h>
> > +#include <asm/hwcap.h>
> > +
> > +#if (BITS_PER_LONG == 64)
> > +#define CTZW	"ctzw "
> > +#define CLZW	"clzw "
> > +#elif (BITS_PER_LONG == 32)
> > +#define CTZW	"ctz "
> > +#define CLZW	"clz "
> > +#else
> > +#error "Unexpected BITS_PER_LONG"
> > +#endif
> > +
> > +static __always_inline unsigned long variable__ffs(unsigned long word)
> > +{
> > +	int num;
> > +
> > +	asm_volatile_goto(
> > +		ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1)
> > +		: : : : legacy);
> > +
> > +	asm volatile (
> > +		".option push\n"
> > +		".option arch,+zbb\n"
> > +		"ctz %0, %1\n"
> > +		".option pop\n"
> > +		: "=r" (word) : "r" (word) :);
> > +
> > +	return word;
> 
> That's so clean!
> 
> > +
> > +legacy:
> > +	num = 0;
> > +#if BITS_PER_LONG == 64
> > +	if ((word & 0xffffffff) == 0) {
> > +		num += 32;
> > +		word >>= 32;
> > +	}
> > +#endif
> > +	if ((word & 0xffff) == 0) {
> > +		num += 16;
> > +		word >>= 16;
> > +	}
> > +	if ((word & 0xff) == 0) {
> > +		num += 8;
> > +		word >>= 8;
> > +	}
> > +	if ((word & 0xf) == 0) {
> > +		num += 4;
> > +		word >>= 4;
> > +	}
> > +	if ((word & 0x3) == 0) {
> > +		num += 2;
> > +		word >>= 2;
> > +	}
> > +	if ((word & 0x1) == 0)
> > +		num += 1;
> > +	return num;
> > +}
> > +
> > +/**
> > + * __ffs - find first set bit in a long word
> > + * @word: The word to search
> > + *
> > + * Undefined if no set bit exists, so code should check against 0 first.
> > + */
> > +#define __ffs(word)				\
> > +	(__builtin_constant_p(word) ?		\
> > +	 (unsigned long)__builtin_ctzl(word) :	\
> > +	 variable__ffs(word))
> > +
> > +static __always_inline unsigned long variable__fls(unsigned long word)
> > +{
> > +	int num;
> > +
> > +	asm_volatile_goto(
> > +		ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1)
> > +		: : : : legacy);
> > +
> > +	asm volatile (
> > +		".option push\n"
> > +		".option arch,+zbb\n"
> > +		"clz %0, %1\n"
> > +		".option pop\n"
> > +		: "=r" (word) : "r" (word) :);
> > +
> > +	return BITS_PER_LONG - 1 - word;
> > +
> > +legacy:
> > +	num = BITS_PER_LONG - 1;
> > +#if BITS_PER_LONG == 64
> > +	if (!(word & (~0ul << 32))) {
> > +		num -= 32;
> > +		word <<= 32;
> > +	}
> > +#endif
> > +	if (!(word & (~0ul << (BITS_PER_LONG-16)))) {
> > +		num -= 16;
> > +		word <<= 16;
> > +	}
> > +	if (!(word & (~0ul << (BITS_PER_LONG-8)))) {
> > +		num -= 8;
> > +		word <<= 8;
> > +	}
> > +	if (!(word & (~0ul << (BITS_PER_LONG-4)))) {
> > +		num -= 4;
> > +		word <<= 4;
> > +	}
> > +	if (!(word & (~0ul << (BITS_PER_LONG-2)))) {
> > +		num -= 2;
> > +		word <<= 2;
> > +	}
> > +	if (!(word & (~0ul << (BITS_PER_LONG-1))))
> > +		num -= 1;
> > +	return num;
> > +}
> > +
> > +/**
> > + * __fls - find last set bit in a long word
> > + * @word: the word to search
> > + *
> > + * Undefined if no set bit exists, so code should check against 0 first.
> > + */
> > +#define __fls(word)							\
> > +	(__builtin_constant_p(word) ?					\
> > +	 (unsigned long)(BITS_PER_LONG - 1 - __builtin_clzl(word)) :	\
> > +	 variable__fls(word))
> > +
> > +static __always_inline int variable_ffs(int x)
> > +{
> > +	int r;
> > +
> > +	asm_volatile_goto(
> > +		ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1)
> > +		: : : : legacy);
> > +
> > +	asm volatile (
> > +		".option push\n"
> > +		".option arch,+zbb\n"
> > +		"bnez %1, 1f\n"
> > +		"li %0, 0\n"
> > +		"j 2f\n"
> > +		"1:\n"
> > +		CTZW "%0, %1\n"
> > +		"addi %0, %0, 1\n"
> > +		"2:\n"
> > +		".option pop\n"
> > +		: "=r" (r) : "r" (x) :);
> > +
> > +	return r;
> > +
> 
> I generally like to move as much code out of asm as possible. You are
> also able to remove one of the branches if you rearrange as follows:
> 
> if (!x)
> 	return 0;
> 
> asm volatile (
> 	".option push\n"
>         ".option arch,+zbb\n"
>         CTZW "%0, %1\n"
>         ".option pop\n"
>         : "=r" (r) : "r" (x) :);
> 
> return r + 1;
> 
> You could then also extract out the "if (!x)" at the beginning of the
> legacy section at put it at the top of the function.
> 

Agree, less asm code will be more reader friendly. And your sample code looks
nice, thanks for the suggestion. Will do the adjustment in next version.

> > +legacy:
> > +	r = 1;
> > +	if (!x)
> > +		return 0;
> > +	if (!(x & 0xffff)) {
> > +		x >>= 16;
> > +		r += 16;
> > +	}
> > +	if (!(x & 0xff)) {
> > +		x >>= 8;
> > +		r += 8;
> > +	}
> > +	if (!(x & 0xf)) {
> > +		x >>= 4;
> > +		r += 4;
> > +	}
> > +	if (!(x & 3)) {
> > +		x >>= 2;
> > +		r += 2;
> > +	}
> > +	if (!(x & 1)) {
> > +		x >>= 1;
> > +		r += 1;
> > +	}
> > +	return r;
> > +}
> > +
> > +/**
> > + * ffs - find first set bit in a word
> > + * @x: the word to search
> > + *
> > + * This is defined the same way as the libc and compiler builtin ffs routines.
> > + *
> > + * ffs(value) returns 0 if value is 0 or the position of the first set bit if
> > + * value is nonzero. The first (least significant) bit is at position 1.
> > + */
> > +#define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x))
> > +
> > +static __always_inline int variable_fls(unsigned int x)
> > +{
> > +	int r;
> > +
> > +	asm_volatile_goto(
> > +		ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1)
> > +		: : : : legacy);
> > +
> > +	asm volatile (
> > +		".option push\n"
> > +		".option arch,+zbb\n"
> > +		"bnez %1, 1f\n"
> > +		"li %0, 0\n"
> > +		"j 2f\n"
> > +		"1:\n"
> > +		CLZW "%0, %1\n"
> > +		"neg %0, %0\n"
> > +		"addi %0, %0, 32\n"
> > +		"2:\n"
> > +		".option pop\n"
> > +		: "=r" (r) : "r" (x) :);
> > +
> > +	return r;
> > +
> 
> Same comment as above but with appropriate changes.

Will refine it in next version.

> 
> > +legacy:
> > +	r = 32;
> > +	if (!x)
> > +		return 0;
> > +	if (!(x & 0xffff0000u)) {
> > +		x <<= 16;
> > +		r -= 16;
> > +	}
> > +	if (!(x & 0xff000000u)) {
> > +		x <<= 8;
> > +		r -= 8;
> > +	}
> > +	if (!(x & 0xf0000000u)) {
> > +		x <<= 4;
> > +		r -= 4;
> > +	}
> > +	if (!(x & 0xc0000000u)) {
> > +		x <<= 2;
> > +		r -= 2;
> > +	}
> > +	if (!(x & 0x80000000u)) {
> > +		x <<= 1;
> > +		r -= 1;
> > +	}
> > +	return r;
> > +}
> > +
> > +/**
> > + * fls - find last set bit in a word
> > + * @x: the word to search
> > + *
> > + * This is defined in a similar way as ffs, but returns the position of the most
> > + * significant set bit.
> > + *
> > + * fls(value) returns 0 if value is 0 or the position of the last set bit if
> > + * value is nonzero. The last (most significant) bit is at position 32.
> > + */
> > +#define fls(x)
> 	\
> > +	(__builtin_constant_p(x) ?					\
> > +	 (int)(((x) != 0) ?						\
> > +	  (sizeof(unsigned int) * 8 - __builtin_clz(x)) : 0) :		\
> > +	 variable_fls(x))
> > +
> > +#endif
> 
> As a nit, it's nice when large #ifdef blocks have a comment like
> 
> /* !defined(CONFIG_RISCV_ISA_ZBB) || defined(NO_ALTERNATIVE) */
> 

Yes, I will add it in next version. Thanks for the comments.

BRs,
Xiao

> Overall great changes, just that small optimization to get rid of the
> jump.
> 
> - Charlie
> 
> > +
> > +#include <asm-generic/bitops/ffz.h>
> >  #include <asm-generic/bitops/fls64.h>
> >  #include <asm-generic/bitops/sched.h>
> > -#include <asm-generic/bitops/ffs.h>
> >
> >  #include <asm-generic/bitops/hweight.h>
> >
diff mbox series

Patch

diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
index 3540b690944b..c97e774cb647 100644
--- a/arch/riscv/include/asm/bitops.h
+++ b/arch/riscv/include/asm/bitops.h
@@ -15,13 +15,273 @@ 
 #include <asm/barrier.h>
 #include <asm/bitsperlong.h>
 
+#if !defined(CONFIG_RISCV_ISA_ZBB) || defined(NO_ALTERNATIVE)
 #include <asm-generic/bitops/__ffs.h>
-#include <asm-generic/bitops/ffz.h>
-#include <asm-generic/bitops/fls.h>
 #include <asm-generic/bitops/__fls.h>
+#include <asm-generic/bitops/ffs.h>
+#include <asm-generic/bitops/fls.h>
+
+#else
+#include <asm/alternative-macros.h>
+#include <asm/hwcap.h>
+
+#if (BITS_PER_LONG == 64)
+#define CTZW	"ctzw "
+#define CLZW	"clzw "
+#elif (BITS_PER_LONG == 32)
+#define CTZW	"ctz "
+#define CLZW	"clz "
+#else
+#error "Unexpected BITS_PER_LONG"
+#endif
+
+static __always_inline unsigned long variable__ffs(unsigned long word)
+{
+	int num;
+
+	asm_volatile_goto(
+		ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1)
+		: : : : legacy);
+
+	asm volatile (
+		".option push\n"
+		".option arch,+zbb\n"
+		"ctz %0, %1\n"
+		".option pop\n"
+		: "=r" (word) : "r" (word) :);
+
+	return word;
+
+legacy:
+	num = 0;
+#if BITS_PER_LONG == 64
+	if ((word & 0xffffffff) == 0) {
+		num += 32;
+		word >>= 32;
+	}
+#endif
+	if ((word & 0xffff) == 0) {
+		num += 16;
+		word >>= 16;
+	}
+	if ((word & 0xff) == 0) {
+		num += 8;
+		word >>= 8;
+	}
+	if ((word & 0xf) == 0) {
+		num += 4;
+		word >>= 4;
+	}
+	if ((word & 0x3) == 0) {
+		num += 2;
+		word >>= 2;
+	}
+	if ((word & 0x1) == 0)
+		num += 1;
+	return num;
+}
+
+/**
+ * __ffs - find first set bit in a long word
+ * @word: The word to search
+ *
+ * Undefined if no set bit exists, so code should check against 0 first.
+ */
+#define __ffs(word)				\
+	(__builtin_constant_p(word) ?		\
+	 (unsigned long)__builtin_ctzl(word) :	\
+	 variable__ffs(word))
+
+static __always_inline unsigned long variable__fls(unsigned long word)
+{
+	int num;
+
+	asm_volatile_goto(
+		ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1)
+		: : : : legacy);
+
+	asm volatile (
+		".option push\n"
+		".option arch,+zbb\n"
+		"clz %0, %1\n"
+		".option pop\n"
+		: "=r" (word) : "r" (word) :);
+
+	return BITS_PER_LONG - 1 - word;
+
+legacy:
+	num = BITS_PER_LONG - 1;
+#if BITS_PER_LONG == 64
+	if (!(word & (~0ul << 32))) {
+		num -= 32;
+		word <<= 32;
+	}
+#endif
+	if (!(word & (~0ul << (BITS_PER_LONG-16)))) {
+		num -= 16;
+		word <<= 16;
+	}
+	if (!(word & (~0ul << (BITS_PER_LONG-8)))) {
+		num -= 8;
+		word <<= 8;
+	}
+	if (!(word & (~0ul << (BITS_PER_LONG-4)))) {
+		num -= 4;
+		word <<= 4;
+	}
+	if (!(word & (~0ul << (BITS_PER_LONG-2)))) {
+		num -= 2;
+		word <<= 2;
+	}
+	if (!(word & (~0ul << (BITS_PER_LONG-1))))
+		num -= 1;
+	return num;
+}
+
+/**
+ * __fls - find last set bit in a long word
+ * @word: the word to search
+ *
+ * Undefined if no set bit exists, so code should check against 0 first.
+ */
+#define __fls(word)							\
+	(__builtin_constant_p(word) ?					\
+	 (unsigned long)(BITS_PER_LONG - 1 - __builtin_clzl(word)) :	\
+	 variable__fls(word))
+
+static __always_inline int variable_ffs(int x)
+{
+	int r;
+
+	asm_volatile_goto(
+		ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1)
+		: : : : legacy);
+
+	asm volatile (
+		".option push\n"
+		".option arch,+zbb\n"
+		"bnez %1, 1f\n"
+		"li %0, 0\n"
+		"j 2f\n"
+		"1:\n"
+		CTZW "%0, %1\n"
+		"addi %0, %0, 1\n"
+		"2:\n"
+		".option pop\n"
+		: "=r" (r) : "r" (x) :);
+
+	return r;
+
+legacy:
+	r = 1;
+	if (!x)
+		return 0;
+	if (!(x & 0xffff)) {
+		x >>= 16;
+		r += 16;
+	}
+	if (!(x & 0xff)) {
+		x >>= 8;
+		r += 8;
+	}
+	if (!(x & 0xf)) {
+		x >>= 4;
+		r += 4;
+	}
+	if (!(x & 3)) {
+		x >>= 2;
+		r += 2;
+	}
+	if (!(x & 1)) {
+		x >>= 1;
+		r += 1;
+	}
+	return r;
+}
+
+/**
+ * ffs - find first set bit in a word
+ * @x: the word to search
+ *
+ * This is defined the same way as the libc and compiler builtin ffs routines.
+ *
+ * ffs(value) returns 0 if value is 0 or the position of the first set bit if
+ * value is nonzero. The first (least significant) bit is at position 1.
+ */
+#define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x))
+
+static __always_inline int variable_fls(unsigned int x)
+{
+	int r;
+
+	asm_volatile_goto(
+		ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1)
+		: : : : legacy);
+
+	asm volatile (
+		".option push\n"
+		".option arch,+zbb\n"
+		"bnez %1, 1f\n"
+		"li %0, 0\n"
+		"j 2f\n"
+		"1:\n"
+		CLZW "%0, %1\n"
+		"neg %0, %0\n"
+		"addi %0, %0, 32\n"
+		"2:\n"
+		".option pop\n"
+		: "=r" (r) : "r" (x) :);
+
+	return r;
+
+legacy:
+	r = 32;
+	if (!x)
+		return 0;
+	if (!(x & 0xffff0000u)) {
+		x <<= 16;
+		r -= 16;
+	}
+	if (!(x & 0xff000000u)) {
+		x <<= 8;
+		r -= 8;
+	}
+	if (!(x & 0xf0000000u)) {
+		x <<= 4;
+		r -= 4;
+	}
+	if (!(x & 0xc0000000u)) {
+		x <<= 2;
+		r -= 2;
+	}
+	if (!(x & 0x80000000u)) {
+		x <<= 1;
+		r -= 1;
+	}
+	return r;
+}
+
+/**
+ * fls - find last set bit in a word
+ * @x: the word to search
+ *
+ * This is defined in a similar way as ffs, but returns the position of the most
+ * significant set bit.
+ *
+ * fls(value) returns 0 if value is 0 or the position of the last set bit if
+ * value is nonzero. The last (most significant) bit is at position 32.
+ */
+#define fls(x)								\
+	(__builtin_constant_p(x) ?					\
+	 (int)(((x) != 0) ?						\
+	  (sizeof(unsigned int) * 8 - __builtin_clz(x)) : 0) :		\
+	 variable_fls(x))
+
+#endif
+
+#include <asm-generic/bitops/ffz.h>
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/sched.h>
-#include <asm-generic/bitops/ffs.h>
 
 #include <asm-generic/bitops/hweight.h>
 
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index a1157c2a7170..d68cacd4e3af 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -28,7 +28,7 @@  cflags-$(CONFIG_ARM)		+= -DEFI_HAVE_STRLEN -DEFI_HAVE_STRNLEN \
 				   -DEFI_HAVE_MEMCHR -DEFI_HAVE_STRRCHR \
 				   -DEFI_HAVE_STRCMP -fno-builtin -fpic \
 				   $(call cc-option,-mno-single-pic-base)
-cflags-$(CONFIG_RISCV)		+= -fpic
+cflags-$(CONFIG_RISCV)		+= -fpic -DNO_ALTERNATIVE
 cflags-$(CONFIG_LOONGARCH)	+= -fpie
 
 cflags-$(CONFIG_EFI_PARAMS_FROM_FDT)	+= -I$(srctree)/scripts/dtc/libfdt