Message ID | 1581580379-54109-3-git-send-email-robert.hu@linux.intel.com (mailing list archive)
---|---
State | New, archived
Series | Add AVX512F optimization option and buffer_zero_avx512()
On 13/02/20 08:52, Robert Hoo wrote:
> +
> +}
> +#pragma GCC pop_options
> +#endif
> +
> +
>  /* Note that for test_buffer_is_zero_next_accel, the most preferred
>   * ISA must have the least significant bit.
>   */
> -#define CACHE_AVX2    1
> -#define CACHE_SSE4    2
> -#define CACHE_SSE2    4
> +#define CACHE_AVX512F 1
> +#define CACHE_AVX2    2
> +#define CACHE_SSE4    4
> +#define CACHE_SSE2    6

This should be 8, not 6.

Paolo

>
>  /* Make sure that these variables are appropriately initialized when
>   * SSE2 is enabled on the compiler command-line, but the compiler is
> @@ -226,6 +268,11 @@ static void init_accel(unsigned cache)
>          fn = buffer_zero_avx2;
>      }
>  #endif
> +#ifdef CONFIG_AVX512F_OPT
> +    if (cache & CACHE_AVX512F) {
> +        fn = buffer_zero_avx512;
> +    }
> +#endif
>      buffer_accel = fn;
>  }
>
> @@ -255,6 +302,9 @@ static void __attribute__((constructor)) init_cpuid_cache(void)
>              if ((bv & 6) == 6 && (b & bit_AVX2)) {
>                  cache |= CACHE_AVX2;
>              }
> +            if ((bv & 6) == 6 && (b & bit_AVX512F)) {
> +                cache |= CACHE_AVX512F;
> +            }
>          }
On Thu, 2020-02-13 at 11:30 +0100, Paolo Bonzini wrote:
> On 13/02/20 08:52, Robert Hoo wrote:
> > +
> > +}
> > +#pragma GCC pop_options
> > +#endif
> > +
> > +
> >  /* Note that for test_buffer_is_zero_next_accel, the most preferred
> >   * ISA must have the least significant bit.
> >   */
> > -#define CACHE_AVX2    1
> > -#define CACHE_SSE4    2
> > -#define CACHE_SSE2    4
> > +#define CACHE_AVX512F 1
> > +#define CACHE_AVX2    2
> > +#define CACHE_SSE4    4
> > +#define CACHE_SSE2    6
>
> This should be 8, not 6.
>
> Paolo

Thanks Paolo, going to fix it in v2.

> >
> >  /* Make sure that these variables are appropriately initialized when
> >   * SSE2 is enabled on the compiler command-line, but the compiler is
> > @@ -226,6 +268,11 @@ static void init_accel(unsigned cache)
> >          fn = buffer_zero_avx2;
> >      }
> >  #endif
> > +#ifdef CONFIG_AVX512F_OPT
> > +    if (cache & CACHE_AVX512F) {
> > +        fn = buffer_zero_avx512;
> > +    }
> > +#endif
> >      buffer_accel = fn;
> >  }
> >
> > @@ -255,6 +302,9 @@ static void __attribute__((constructor)) init_cpuid_cache(void)
> >              if ((bv & 6) == 6 && (b & bit_AVX2)) {
> >                  cache |= CACHE_AVX2;
> >              }
> > +            if ((bv & 6) == 6 && (b & bit_AVX512F)) {
> > +                cache |= CACHE_AVX512F;
> > +            }
> >          }
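(For context: the CACHE_* constants are single-bit flags that init_cpuid_cache() ORs together and init_accel() tests individually, so each must be a distinct power of two. 6 is binary 110 and would alias the CACHE_AVX2 and CACHE_SSE4 bits. The corrected block, per Paolo's comment, would read:

    /* Each flag must be a distinct bit; 6 would alias CACHE_AVX2 | CACHE_SSE4. */
    #define CACHE_AVX512F 1
    #define CACHE_AVX2    2
    #define CACHE_SSE4    4
    #define CACHE_SSE2    8
)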
On 2/12/20 11:52 PM, Robert Hoo wrote:
> And initialize buffer_is_zero() with it, when Intel AVX512F is
> available on host.
>
> This function utilizes Intel AVX512 fundamental instructions which
> perform over previous AVX2 instructions.

Is it not still true that any AVX512 insn will cause the entire cpu package,
not just the current core, to drop frequency by 20%?

As far as I know one should only use the 512-bit instructions when you can
overcome that frequency drop, which seems unlikely in this case.  That said...

> +    if (unlikely(len < 64)) { /*buff less than 512 bits, unlikely*/
> +        return buffer_zero_int(buf, len);
> +    }

First, len < 64 has been eliminated already in select_accel_fn.
Second, len < 256 is not handled properly by the code below...

> +    /* Begin with an unaligned head of 64 bytes.  */
> +    t = _mm512_loadu_si512(buf);
> +    p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
> +    e = (__m512i *)(((uintptr_t)buf + len) & -64);
> +
> +    /* Loop over 64-byte aligned blocks of 256.  */
> +    while (p < e) {
> +        __builtin_prefetch(p);
> +        if (unlikely(_mm512_test_epi64_mask(t, t))) {
> +            return false;
> +        }
> +        t = p[-4] | p[-3] | p[-2] | p[-1];
> +        p += 4;
> +    }
> +
> +    t |= _mm512_loadu_si512(buf + len - 4 * 64);
> +    t |= _mm512_loadu_si512(buf + len - 3 * 64);
> +    t |= _mm512_loadu_si512(buf + len - 2 * 64);
> +    t |= _mm512_loadu_si512(buf + len - 1 * 64);

... because this final sequence loads 256 bytes.

Rather than make a second test vs 256 in buffer_zero_avx512, I wonder if it
would be better to have select_accel_fn do the job.  Have a global variable
buffer_accel_size alongside buffer_accel so there's only one branch
(mis)predict to worry about.

FWIW, something that the compiler should do, but doesn't currently, is use
vpternlogq to perform a 3-input OR.  Something like

    /* 0xfe -> orABC */
    t = _mm512_ternarylogic_epi64(t, p[-4], p[-3], 0xfe);
    t = _mm512_ternarylogic_epi64(t, p[-2], p[-1], 0xfe);

r~
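(For illustration, the patch's main loop rewritten with the suggested 3-input OR would look roughly like this; a sketch only, assuming the same t/p/e setup as the posted patch. The 0xfe immediate encodes the truth table A | B | C, so each intrinsic folds two more vectors into the accumulator:

    /* Loop over 64-byte aligned blocks of 256, folding four vectors
       into t with two vpternlogq (3-input OR) operations. */
    while (p < e) {
        __builtin_prefetch(p);
        if (unlikely(_mm512_test_epi64_mask(t, t))) {
            return false;
        }
        /* t = t | p[-4] | p[-3], then t = t | p[-2] | p[-1] */
        t = _mm512_ternarylogic_epi64(t, p[-4], p[-3], 0xfe);
        t = _mm512_ternarylogic_epi64(t, p[-2], p[-1], 0xfe);
        p += 4;
    }
)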
Thanks Richard :-) Sorry for the late reply.

On Thu, 2020-02-13 at 10:20 -0800, Richard Henderson wrote:
> On 2/12/20 11:52 PM, Robert Hoo wrote:
> > And initialize buffer_is_zero() with it, when Intel AVX512F is
> > available on host.
> >
> > This function utilizes Intel AVX512 fundamental instructions which
> > perform over previous AVX2 instructions.
>
> Is it not still true that any AVX512 insn will cause the entire cpu
> package, not just the current core, to drop frequency by 20%?
>
> As far as I know one should only use the 512-bit instructions when
> you can overcome that frequency drop, which seems unlikely in this
> case.  That said...

I don't think so. AVX512 has been applied in various places.

> > +    if (unlikely(len < 64)) { /*buff less than 512 bits, unlikely*/
> > +        return buffer_zero_int(buf, len);
> > +    }
>
> First, len < 64 has been eliminated already in select_accel_fn.
> Second, len < 256 is not handled properly by the code below...

Right. I'm going to fix this in v2.

> > +    /* Begin with an unaligned head of 64 bytes.  */
> > +    t = _mm512_loadu_si512(buf);
> > +    p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
> > +    e = (__m512i *)(((uintptr_t)buf + len) & -64);
> > +
> > +    /* Loop over 64-byte aligned blocks of 256.  */
> > +    while (p < e) {
> > +        __builtin_prefetch(p);
> > +        if (unlikely(_mm512_test_epi64_mask(t, t))) {
> > +            return false;
> > +        }
> > +        t = p[-4] | p[-3] | p[-2] | p[-1];
> > +        p += 4;
> > +    }
> > +
> > +    t |= _mm512_loadu_si512(buf + len - 4 * 64);
> > +    t |= _mm512_loadu_si512(buf + len - 3 * 64);
> > +    t |= _mm512_loadu_si512(buf + len - 2 * 64);
> > +    t |= _mm512_loadu_si512(buf + len - 1 * 64);
>
> ... because this final sequence loads 256 bytes.
>
> Rather than make a second test vs 256 in buffer_zero_avx512, I wonder
> if it would be better to have select_accel_fn do the job.  Have a
> global variable buffer_accel_size alongside buffer_accel so there's
> only one branch (mis)predict to worry about.

Thanks Richard, very enlightening!

Inspired by your suggestion, I'm thinking of going further: use an
immediate rather than a global variable, which saves one memory(/cache)
access:

#ifdef CONFIG_AVX512F_OPT
#define OPTIMIZE_LEN 256
#else
#define OPTIMIZE_LEN 64
#endif

> FWIW, something that the compiler should do, but doesn't currently,
> is use vpternlogq to perform a 3-input OR.  Something like
>
>     /* 0xfe -> orABC */
>     t = _mm512_ternarylogic_epi64(t, p[-4], p[-3], 0xfe);
>     t = _mm512_ternarylogic_epi64(t, p[-2], p[-1], 0xfe);

Very enlightening. Yes, it seems the compiler doesn't do this. I tried
using it explicitly; however, more instructions are generated, and the
unit test shows it performs worse than the conventional code. Let me
keep the conventional code for the moment; I will ask around and dig
further outside this patch.

> r~
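(Concretely, the proposal amounts to something like the sketch below. The shape of select_accel_fn() here is assumed, based only on Richard's note that it already eliminates len < 64; the change is that the hard-coded constant becomes a compile-time macro:

    #ifdef CONFIG_AVX512F_OPT
    #define OPTIMIZE_LEN 256
    #else
    #define OPTIMIZE_LEN 64
    #endif

    static bool select_accel_fn(const void *buf, size_t len)
    {
        /* Short buffers take the integer fallback, so the vector
           implementation never sees a length below OPTIMIZE_LEN. */
        if (likely(len >= OPTIMIZE_LEN)) {
            return buffer_accel(buf, len);
        }
        return buffer_zero_int(buf, len);
    }
)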
On 2/23/20 11:07 PM, Robert Hoo wrote:
> Inspired by your suggestion, I'm thinking of going further: use an
> immediate rather than a global variable, which saves one
> memory(/cache) access:
>
> #ifdef CONFIG_AVX512F_OPT
> #define OPTIMIZE_LEN 256
> #else
> #define OPTIMIZE_LEN 64
> #endif

With that, the testing in tests/test-bufferiszero.c, looping through the
implementations, is invalidated.  Because once you start compiling for
avx512, you're no longer testing sse2 et al with the same inputs.

IF we want to change the length to suit avx512, we would want to change
it unconditionally.  And then you could also tidy up avx2 to avoid the
extra comparisons there.

r~
On Mon, 2020-02-24 at 08:13 -0800, Richard Henderson wrote:
> On 2/23/20 11:07 PM, Robert Hoo wrote:
> > Inspired by your suggestion, I'm thinking of going further: use an
> > immediate rather than a global variable, which saves one
> > memory(/cache) access:
> >
> > #ifdef CONFIG_AVX512F_OPT
> > #define OPTIMIZE_LEN 256
> > #else
> > #define OPTIMIZE_LEN 64
> > #endif
>
> With that, the testing in tests/test-bufferiszero.c, looping through
> the implementations, is invalidated.  Because once you start
> compiling for avx512, you're no longer testing sse2 et al with the
> same inputs.

Right, thanks for pointing that out; I hadn't noticed. More precisely,
it would mean no longer testing sse2 et al with < 256 lengths.

> IF we want to change the length to suit avx512, we would want to
> change it unconditionally.  And then you could also tidy up avx2 to
> avoid the extra comparisons there.

Considering the length's dependency on sse2/sse4/avx2/avx512 and their
algorithms, as well as possible future changes and additions, I'd
rather roll back to your original suggestion: use a companion variable
with each accel_fn(). How do you like it?

> r~
On 2/24/20 11:34 PM, Robert Hoo wrote:
> Considering the length's dependency on sse2/sse4/avx2/avx512 and their
> algorithms, as well as possible future changes and additions, I'd
> rather roll back to your original suggestion: use a companion variable
> with each accel_fn(). How do you like it?

How do I like it?  With a modification to init_accel() so that the
function and the minimum length are selected at the same time.

r~
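(A minimal sketch of that shape, with a hypothetical companion variable. The name length_to_accel, the 256-byte AVX512 minimum, and the buffer_zero_sse2/buffer_zero_sse4 helper names are illustrative, inferred from the naming pattern of buffer_zero_avx2, not taken from the posted patch:

    static bool (*buffer_accel)(const void *, size_t);
    static unsigned length_to_accel = 64;   /* hypothetical companion variable */

    static void init_accel(unsigned cache)
    {
        bool (*fn)(const void *, size_t) = buffer_zero_int;
        unsigned len = 64;

        if (cache & CACHE_SSE2) {
            fn = buffer_zero_sse2;
        }
    #ifdef CONFIG_AVX2_OPT
        if (cache & CACHE_SSE4) {
            fn = buffer_zero_sse4;
        }
        if (cache & CACHE_AVX2) {
            fn = buffer_zero_avx2;
        }
    #endif
    #ifdef CONFIG_AVX512F_OPT
        if (cache & CACHE_AVX512F) {
            fn = buffer_zero_avx512;
            len = 256;   /* the avx512 tail reads 4 x 64 bytes */
        }
    #endif
        buffer_accel = fn;
        length_to_accel = len;
    }

select_accel_fn() would then compare len against length_to_accel instead of a constant, preserving the single branch while letting each implementation declare its own minimum.)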
And initialize buffer_is_zero() with it, when Intel AVX512F is
available on host.

This function utilizes Intel AVX512 fundamental instructions, which
outperform the previous AVX2 instructions.

Signed-off-by: Robert Hoo <robert.hu@linux.intel.com>
---
 include/qemu/cpuid.h |  3 +++
 util/bufferiszero.c  | 56 +++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/include/qemu/cpuid.h b/include/qemu/cpuid.h
index 6930170..09fc245 100644
--- a/include/qemu/cpuid.h
+++ b/include/qemu/cpuid.h
@@ -45,6 +45,9 @@
 #ifndef bit_AVX2
 #define bit_AVX2        (1 << 5)
 #endif
+#ifndef bit_AVX512F
+#define bit_AVX512F     (1 << 16)
+#endif
 #ifndef bit_BMI2
 #define bit_BMI2        (1 << 8)
 #endif
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index bfb2605..cbb854a 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -187,12 +187,54 @@ buffer_zero_avx2(const void *buf, size_t len)
 #pragma GCC pop_options
 #endif /* CONFIG_AVX2_OPT */
 
+#ifdef CONFIG_AVX512F_OPT
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#include <immintrin.h>
+
+static bool
+buffer_zero_avx512(const void *buf, size_t len)
+{
+    __m512i t;
+    __m512i *p, *e;
+
+    if (unlikely(len < 64)) { /*buff less than 512 bits, unlikely*/
+        return buffer_zero_int(buf, len);
+    }
+    /* Begin with an unaligned head of 64 bytes.  */
+    t = _mm512_loadu_si512(buf);
+    p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
+    e = (__m512i *)(((uintptr_t)buf + len) & -64);
+
+    /* Loop over 64-byte aligned blocks of 256.  */
+    while (p < e) {
+        __builtin_prefetch(p);
+        if (unlikely(_mm512_test_epi64_mask(t, t))) {
+            return false;
+        }
+        t = p[-4] | p[-3] | p[-2] | p[-1];
+        p += 4;
+    }
+
+    t |= _mm512_loadu_si512(buf + len - 4 * 64);
+    t |= _mm512_loadu_si512(buf + len - 3 * 64);
+    t |= _mm512_loadu_si512(buf + len - 2 * 64);
+    t |= _mm512_loadu_si512(buf + len - 1 * 64);
+
+    return !_mm512_test_epi64_mask(t, t);
+
+}
+#pragma GCC pop_options
+#endif
+
+
 /* Note that for test_buffer_is_zero_next_accel, the most preferred
  * ISA must have the least significant bit.
  */
-#define CACHE_AVX2    1
-#define CACHE_SSE4    2
-#define CACHE_SSE2    4
+#define CACHE_AVX512F 1
+#define CACHE_AVX2    2
+#define CACHE_SSE4    4
+#define CACHE_SSE2    6
 
 /* Make sure that these variables are appropriately initialized when
  * SSE2 is enabled on the compiler command-line, but the compiler is
@@ -226,6 +268,11 @@ static void init_accel(unsigned cache)
         fn = buffer_zero_avx2;
     }
 #endif
+#ifdef CONFIG_AVX512F_OPT
+    if (cache & CACHE_AVX512F) {
+        fn = buffer_zero_avx512;
+    }
+#endif
     buffer_accel = fn;
 }
 
@@ -255,6 +302,9 @@ static void __attribute__((constructor)) init_cpuid_cache(void)
             if ((bv & 6) == 6 && (b & bit_AVX2)) {
                 cache |= CACHE_AVX2;
             }
+            if ((bv & 6) == 6 && (b & bit_AVX512F)) {
+                cache |= CACHE_AVX512F;
+            }
         }
     }
     cpuid_cache = cache;