| Message ID | 1453280742-20718-3-git-send-email-liang.z.li@intel.com (mailing list archive) |
|---|---|
| State | New, archived |
On 20/01/2016 10:05, Liang Li wrote:
> buffer_find_nonzero_offset() is a hot function during live migration.
> Now it use SSE2 instructions for optimization. For platform supports
> AVX2 instructions, use AVX2 instructions for optimization can help
> to improve the performance about 30% comparing to SSE2.
>
> Zero page check can be faster with this optimization, the test result
> shows that for an 8GiB RAM idle guest just boots, this patch can help
> to shorten the total live migration time about 6%.
>
> This patch use the ifunc mechanism to select the proper function when
> running, for platform supports AVX2, execute the AVX2 instructions,
> else, execute the original instructions.
>
> Signed-off-by: Liang Li <liang.z.li@intel.com>

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>

> ---
>  include/qemu-common.h |   8 +---
>  util/cutils.c         | 118 ++++++++++++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 115 insertions(+), 11 deletions(-)
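The ifunc mechanism the commit message refers to lets a single exported symbol be bound to one of several implementations when the dynamic linker resolves it, so the runtime CPU check is paid once rather than on every call. The following is a minimal, self-contained sketch of that pattern, not taken from the patch; the names `add`, `add_fancy`, `add_generic` and `resolve_add` are invented for illustration, and it assumes GCC on an ELF/x86 target.

```c
#include <stdbool.h>
#include <stdio.h>
#include <cpuid.h>

/* Two interchangeable implementations of the same hypothetical API. */
static int add_generic(int a, int b) { return a + b; }
static int add_fancy(int a, int b)   { return a + b; /* pretend this one is vectorized */ }

/* Runtime feature test: does the CPU advertise AVX2 (CPUID leaf 7, EBX)? */
static bool have_avx2(void)
{
    unsigned int a, b, c, d;

    if (__get_cpuid_max(0, NULL) < 7) {
        return false;
    }
    __cpuid_count(7, 0, a, b, c, d);
    return b & bit_AVX2;
}

typedef int (*add_fn)(int, int);

/* The resolver runs once, when the symbol is bound, and returns the
 * function that the public symbol will point to from then on. */
static add_fn resolve_add(void)
{
    return have_avx2() ? add_fancy : add_generic;
}

/* Public symbol; the linker routes it through the resolver above. */
int add(int a, int b) __attribute__((ifunc("resolve_add")));

int main(void)
{
    printf("%d\n", add(2, 3));
    return 0;
}
```

Because the dispatch cost vanishes after symbol binding, the patch can keep exporting buffer_find_nonzero_offset() as an ordinary function while still selecting the AVX2 body at run time. The full diff, as posted, is below.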
```diff
diff --git a/include/qemu-common.h b/include/qemu-common.h
index 22b010c..f4c8c24 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -483,13 +483,7 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
 #endif
 
 #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
-static inline bool
-can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
-{
-    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
-                   * sizeof(VECTYPE)) == 0
-            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
-}
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
 size_t buffer_find_nonzero_offset(const void *buf, size_t len);
 
 /*
diff --git a/util/cutils.c b/util/cutils.c
index cfeb848..5c8ee5c 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -161,6 +161,14 @@ int qemu_fdatasync(int fd)
 #endif
 }
 
+static bool
+can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
+}
+
 /*
  * Searches for an area with non-zero content in a buffer
  *
@@ -169,8 +177,8 @@ int qemu_fdatasync(int fd)
  * and addr must be a multiple of sizeof(VECTYPE) due to
  * restriction of optimizations in this function.
  *
- * can_use_buffer_find_nonzero_offset() can be used to check
- * these requirements.
+ * can_use_buffer_find_nonzero_offset_inner() can be used to
+ * check these requirements.
  *
  * The return value is the offset of the non-zero area rounded
  * down to a multiple of sizeof(VECTYPE) for the first
@@ -181,13 +189,13 @@
  * If the buffer is all zero the return value is equal to len.
  */
 
-size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
 {
     const VECTYPE *p = buf;
     const VECTYPE zero = (VECTYPE){0};
     size_t i;
 
-    assert(can_use_buffer_find_nonzero_offset(buf, len));
+    assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
 
     if (!len) {
         return 0;
@@ -216,6 +224,108 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t len)
     return i * sizeof(VECTYPE);
 }
 
+#ifdef CONFIG_AVX2_OPT
+#pragma GCC push_options
+#pragma GCC target("avx2")
+#include <cpuid.h>
+#include <immintrin.h>
+
+#define AVX2_VECTYPE __m256i
+#define AVX2_SPLAT(p) _mm256_set1_epi8(*(p))
+#define AVX2_ALL_EQ(v1, v2) \
+    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
+#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
+
+static bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(AVX2_VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
+}
+
+static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    const AVX2_VECTYPE *p = buf;
+    const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
+    size_t i;
+
+    assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
+
+    if (!len) {
+        return 0;
+    }
+
+    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
+        if (!AVX2_ALL_EQ(p[i], zero)) {
+            return i * sizeof(AVX2_VECTYPE);
+        }
+    }
+
+    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
+         i < len / sizeof(AVX2_VECTYPE);
+         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+        AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
+        AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
+        AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
+        AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
+        AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
+        AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
+        if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
+            break;
+        }
+    }
+
+    return i * sizeof(AVX2_VECTYPE);
+}
+
+static bool avx2_support(void)
+{
+    int a, b, c, d;
+
+    if (__get_cpuid_max(0, NULL) < 7) {
+        return false;
+    }
+
+    __cpuid_count(7, 0, a, b, c, d);
+
+    return b & bit_AVX2;
+}
+
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \
+    __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc")));
+size_t buffer_find_nonzero_offset(const void *buf, size_t len) \
+    __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc")));
+
+static void *buffer_find_nonzero_offset_ifunc(void)
+{
+    typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
+        buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner;
+
+    return func;
+}
+
+static void *can_use_buffer_find_nonzero_offset_ifunc(void)
+{
+    typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
+        can_use_buffer_find_nonzero_offset_avx2 :
+        can_use_buffer_find_nonzero_offset_inner;
+
+    return func;
+}
+#pragma GCC pop_options
+#else
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return can_use_buffer_find_nonzero_offset_inner(buf, len);
+}
+
+size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return buffer_find_nonzero_offset_inner(buf, len);
+}
+#endif
+
 /*
  * Checks if a buffer is all zeroes
  *
```
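The core of the AVX2 path is the AVX2_ALL_EQ macro: a 256-bit byte-wise compare followed by a movemask, which yields 0xFFFFFFFF only when all 32 byte lanes matched. The snippet below is a rough, standalone illustration of that idiom, not QEMU code; the helper name `block32_is_zero` is invented, and it assumes a compiler invoked with `-mavx2` on an AVX2-capable CPU.

```c
#include <immintrin.h>
#include <stdbool.h>
#include <stdio.h>

/* Returns true if the 32-byte block at p is all zero, using the same
 * cmpeq + movemask idiom as AVX2_ALL_EQ in the patch. */
static bool block32_is_zero(const void *p)
{
    __m256i v   = _mm256_loadu_si256((const __m256i *)p);
    __m256i cmp = _mm256_cmpeq_epi8(v, _mm256_setzero_si256());

    /* movemask gathers one bit per byte lane: 0xFFFFFFFF means every
     * byte compared equal to zero. */
    return (unsigned)_mm256_movemask_epi8(cmp) == 0xFFFFFFFFu;
}

int main(void)
{
    unsigned char buf[32] = { 0 };

    printf("%d\n", block32_is_zero(buf));   /* 1: all zero */
    buf[17] = 0x5a;
    printf("%d\n", block32_is_zero(buf));   /* 0: nonzero byte found */
    return 0;
}
```

The patch amortizes this further by OR-ing eight 32-byte vectors together before a single compare, so only one cmpeq/movemask pair is issued per 256-byte chunk.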
buffer_find_nonzero_offset() is a hot function during live migration. It currently uses SSE2 instructions. On platforms that support AVX2, using AVX2 instructions instead improves its performance by about 30% compared to SSE2.

This makes the zero-page check faster: test results show that for an idle guest with 8 GiB of RAM that has just booted, the patch shortens total live migration time by about 6%.

The patch uses the ifunc mechanism to select the proper implementation at run time: on platforms that support AVX2 the AVX2 version is executed, otherwise the original code is executed.

Signed-off-by: Liang Li <liang.z.li@intel.com>
---
 include/qemu-common.h |   8 +---
 util/cutils.c         | 118 ++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 115 insertions(+), 11 deletions(-)
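For context, this is roughly how a caller such as the migration zero-page check is expected to use the two exported functions. The helper name `page_is_zero`, the page-size parameter, and the byte-wise fallback loop are illustrative assumptions, not part of the patch; only the two declarations match qemu-common.h.

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Public API from util/cutils.c, as declared in qemu-common.h. */
bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
size_t buffer_find_nonzero_offset(const void *buf, size_t len);

/* Illustrative helper: a page counts as zero when no nonzero byte is
 * found, i.e. the returned offset equals the length (the function
 * returns len for an all-zero buffer). */
static bool page_is_zero(const void *page, size_t page_size)
{
    if (can_use_buffer_find_nonzero_offset(page, page_size)) {
        return buffer_find_nonzero_offset(page, page_size) == page_size;
    }

    /* Fallback for unaligned or oddly sized buffers (sketch only). */
    const uint8_t *p = page;
    for (size_t i = 0; i < page_size; i++) {
        if (p[i]) {
            return false;
        }
    }
    return true;
}
```

Whether the AVX2 or the original SSE2 body runs behind these calls is decided once by the ifunc resolver, so callers like the sketch above need no changes to benefit from the optimization.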