Message ID | 1460023087-31509-2-git-send-email-vijayak@caviumnetworks.com (mailing list archive)
State      | New, archived
> +#elif defined __aarch64__
> +#include "arm_neon.h"
> +
> +#define NEON_VECTYPE uint64x2_t
> +#define NEON_LOAD_N_ORR(v1, v2) (vld1q_u64(&v1) | vld1q_u64(&v2))

Why is the load and orr necessary? Is ((v1) | (v2)) enough?

> +#define NEON_ORR(v1, v2) ((v1) | (v2))
> +#define NEON_NOT_EQ_ZERO(v1) \
> +    ((vgetq_lane_u64(v1, 0) != 0) || (vgetq_lane_u64(v1, 1) != 0))
> +
> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16

Unless you have numbers saying that a 16-unroll is better than an 8-unroll
(and then you should put those in the commit message), you do not need to
duplicate code, just add aarch64 definitions for the existing code.

---

I've now read the rest of the patches, and you're adding prefetch code
that is ARM-specific. Please provide numbers separately for each patch,
not just in the cover letter. The cover letter is lost when the patch is
committed, while the commit messages remain.

On top of this, "With these changes, total migration time comes down from
10 seconds to 2.5 seconds" is not a reproducible experiment. What was the
RAM size? Was the guest just booted and idle, or was there a workload?
What was the host?

Thanks,

Paolo

> +/*
> + * Zero page/buffer checking using SIMD(Neon)
> + */
> +
> +static bool
> +can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
> +                   * sizeof(NEON_VECTYPE)) == 0
> +            && ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
> +}
> +
> +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +    size_t i;
> +    NEON_VECTYPE qword0, qword1, qword2, qword3, qword4, qword5, qword6;
> +    uint64_t const *data = buf;
> +
> +    if (!len) {
> +        return 0;
> +    }
> +
> +    assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
> +    len /= sizeof(unsigned long);
> +
> +    for (i = 0; i < len; i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON) {
> +        qword0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
> +        qword1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
> +        qword2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
> +        qword3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
> +        qword4 = NEON_ORR(qword0, qword1);
> +        qword5 = NEON_ORR(qword2, qword3);
> +        qword6 = NEON_ORR(qword4, qword5);
> +
> +        if (NEON_NOT_EQ_ZERO(qword6)) {
> +            break;
> +        }
> +    }
> +
> +    return i * sizeof(unsigned long);
> +}
> +
> +static inline bool neon_support(void)
> +{
> +    /*
> +     * Check if neon feature is supported.
> +     * By default neon is supported for aarch64.
> +     */
> +    return true;

Then everything below this function is not necessary.

Paolo

> +}
> +
> +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    return neon_support() ? can_use_buffer_find_nonzero_offset_neon(buf, len) :
> +           can_use_buffer_find_nonzero_offset_inner(buf, len);
> +}
> +
> +size_t buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    return neon_support() ? buffer_find_nonzero_offset_neon(buf, len) :
> +           buffer_find_nonzero_offset_inner(buf, len);
> +}
>  #else
>  bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
>  {
> --
> 1.7.9.5
>
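A minimal sketch of the reuse Paolo is suggesting, assuming the VECTYPE/ALL_EQ/VEC_OR macro scheme that the existing x86/Altivec blocks in util/cutils.c of this era use; the macro names here mirror those blocks and are not part of this patch:

    /* Sketch: aarch64 definitions for the existing generic code,
     * instead of a parallel NEON_* copy of the whole loop. */
    #ifdef __aarch64__
    #include <arm_neon.h>
    #include <stdbool.h>
    #include <stddef.h>

    #define VECTYPE        uint64x2_t
    #define VEC_OR(v1, v2) ((v1) | (v2))
    /* True when every lane of v1 equals the corresponding lane of v2. */
    #define ALL_EQ(v1, v2) \
        ((vgetq_lane_u64(v1, 0) == vgetq_lane_u64(v2, 0)) && \
         (vgetq_lane_u64(v1, 1) == vgetq_lane_u64(v2, 1)))

    /* With only these definitions, a generic loop of the existing
     * shape works unchanged, e.g.: */
    static bool vec_range_is_zero(const VECTYPE *p, size_t nvec)
    {
        const VECTYPE zero = vdupq_n_u64(0);
        VECTYPE acc = zero;
        size_t i;

        for (i = 0; i < nvec; i++) {
            acc = VEC_OR(acc, p[i]);    /* accumulate any set bits */
        }
        return ALL_EQ(acc, zero);
    }
    #endif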
On 7 April 2016 at 11:30, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>> +#elif defined __aarch64__
>> +#include "arm_neon.h"
>> +
>> +#define NEON_VECTYPE uint64x2_t
>> +#define NEON_LOAD_N_ORR(v1, v2) (vld1q_u64(&v1) | vld1q_u64(&v2))
>
> Why is the load and orr necessary? Is ((v1) | (v2)) enough?
>
>> +#define NEON_ORR(v1, v2) ((v1) | (v2))
>> +#define NEON_NOT_EQ_ZERO(v1) \
>> +    ((vgetq_lane_u64(v1, 0) != 0) || (vgetq_lane_u64(v1, 1) != 0))
>> +
>> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
>
> Unless you have numbers saying that a 16-unroll is better than an 8-unroll
> (and then you should put those in the commit message), you do not need to
> duplicate code, just add aarch64 definitions for the existing code.

This pure-neon code is also not doing the initial short-loop to test
for non-zero buffers, which means it's not an apples-to-apples
comparison. It seems unlikely that workload balances are going to be
different on ARM vs x86 such that it's worth doing the small loop on
one but not the other.

(This is also why it's helpful to explain your benchmarking method --
the short loop will slow things down for some cases like "large and
untouched RAM", but be faster again for cases like "large RAM of which
most pages have been dirtied".)

thanks
-- PMM
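The "initial short-loop" PMM refers to is a prefix check the existing generic code performs before committing to the unrolled main loop. A simplified, hedged illustration of the trade-off (plain uint64_t rather than vectors; the function and parameter names are invented for the example):

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative two-phase scan: a short prefix check gives an early
     * out for buffers that are non-zero right at the start, so only
     * buffers that look mostly zero pay for the expensive main loop. */
    static size_t find_nonzero_two_phase(const uint64_t *p, size_t nwords,
                                         size_t prefix)
    {
        size_t i;

        /* Phase 1: cheap early-out; wins when most pages are dirtied. */
        for (i = 0; i < prefix && i < nwords; i++) {
            if (p[i]) {
                return i * sizeof(uint64_t);
            }
        }
        /* Phase 2: the unrolled/vectorized loop would run from here;
         * it is the fast path for large, untouched RAM. */
        for (; i < nwords; i++) {
            if (p[i]) {
                break;
            }
        }
        return i * sizeof(uint64_t);
    }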
On 7 April 2016 at 10:58, <vijayak@caviumnetworks.com> wrote:
> From: Vijay <vijayak@cavium.com>
>
> Use Neon instructions to perform zero checking of
> buffer. This helps in reducing downtime during
> live migration.
>
> Signed-off-by: Vijaya Kumar K <vijayak@caviumnetworks.com>
> Signed-off-by: Suresh <ksuresh@caviumnetworks.com>
> ---
>  util/cutils.c |   74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 74 insertions(+)
>
> diff --git a/util/cutils.c b/util/cutils.c
> index 43d1afb..bb61c91 100644
> --- a/util/cutils.c
> +++ b/util/cutils.c
> @@ -352,6 +352,80 @@ static void *can_use_buffer_find_nonzero_offset_ifunc(void)
>      return func;
>  }
>  #pragma GCC pop_options
> +
> +#elif defined __aarch64__
> +#include "arm_neon.h"
> +
> +#define NEON_VECTYPE uint64x2_t
> +#define NEON_LOAD_N_ORR(v1, v2) (vld1q_u64(&v1) | vld1q_u64(&v2))
> +#define NEON_ORR(v1, v2) ((v1) | (v2))
> +#define NEON_NOT_EQ_ZERO(v1) \
> +    ((vgetq_lane_u64(v1, 0) != 0) || (vgetq_lane_u64(v1, 1) != 0))
> +
> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16

This says 16 lots of loads of uint64x2_t...

> +    for (i = 0; i < len; i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON) {
> +        qword0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
> +        qword1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
> +        qword2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
> +        qword3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
> +        qword4 = NEON_ORR(qword0, qword1);
> +        qword5 = NEON_ORR(qword2, qword3);
> +        qword6 = NEON_ORR(qword4, qword5);

...but the loop is only loading 8 lots of uint64x2_t.

thanks
-- PMM
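To make the arithmetic concrete: each iteration performs 8 vld1q_u64 loads of 16 bytes each (128 bytes), and since len has been divided by sizeof(unsigned long), the i += 16 step also advances 16 eight-byte words, i.e. 128 bytes. Yet can_use_..._neon demands len be a multiple of 16 * sizeof(uint64x2_t) = 256 bytes, twice what the loop actually needs. A hedged sketch of a loop whose unroll factor matches its loads (illustrative names, not the author's respin):

    #include <arm_neon.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical fix: the unroll factor counts what the loop actually
     * loads -- 8 uint64x2_t vectors (16 uint64_t, 128 bytes) per pass. */
    #define NEON_UNROLL_VECTORS 8

    static size_t find_nonzero_neon_fixed(const uint64_t *data, size_t nwords)
    {
        const size_t words_per_iter = NEON_UNROLL_VECTORS * 2; /* 2 u64/vec */
        size_t i;

        for (i = 0; i + words_per_iter <= nwords; i += words_per_iter) {
            uint64x2_t acc = vld1q_u64(&data[i]) | vld1q_u64(&data[i + 2]);
            acc |= vld1q_u64(&data[i + 4])  | vld1q_u64(&data[i + 6]);
            acc |= vld1q_u64(&data[i + 8])  | vld1q_u64(&data[i + 10]);
            acc |= vld1q_u64(&data[i + 12]) | vld1q_u64(&data[i + 14]);
            if (vgetq_lane_u64(acc, 0) || vgetq_lane_u64(acc, 1)) {
                break;      /* a non-zero word lies in this 128-byte chunk */
            }
        }
        return i * sizeof(uint64_t);
    }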
On 04/07/2016 02:58 AM, vijayak@caviumnetworks.com wrote:
> +#elif defined __aarch64__
> +#include "arm_neon.h"

A better test is __NEON__, which asserts that neon is available at
compile time (which will be true basically always for aarch64), and
then you don't need a runtime test for neon. You also get support for
armv7 with neon.

> +#define NEON_VECTYPE uint64x2_t
> +#define NEON_LOAD_N_ORR(v1, v2) (vld1q_u64(&v1) | vld1q_u64(&v2))
> +#define NEON_ORR(v1, v2) ((v1) | (v2))
> +#define NEON_NOT_EQ_ZERO(v1) \
> +    ((vgetq_lane_u64(v1, 0) != 0) || (vgetq_lane_u64(v1, 1) != 0))

FWIW, I think that vmaxvq_u32 would be a better reduction for aarch64.
Extracting the individual lanes isn't as efficient as one would like.
For armv7, folding via vget_lane_u64(vget_high_u64(v1) | vget_low_u64(v1), 0)
is probably best.

r~
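Spelled out, the reduction RTH suggests could look like the sketch below. vmaxvq_u32 is an AArch64-only horizontal max across the four 32-bit lanes, so its result is non-zero exactly when any bit of the vector is set:

    #include <arm_neon.h>
    #include <stdbool.h>

    /* Sketch of the suggested reduction: reinterpret the 128-bit vector
     * as four u32 lanes and take their horizontal max in a single
     * instruction, instead of extracting both 64-bit lanes and OR-ing
     * them on the integer side. */
    static inline bool vec_nonzero(uint64x2_t v)
    {
        return vmaxvq_u32(vreinterpretq_u32_u64(v)) != 0;
    }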
On 9 April 2016 at 23:45, Richard Henderson <rth@twiddle.net> wrote:
> On 04/07/2016 02:58 AM, vijayak@caviumnetworks.com wrote:
>>
>> +#elif defined __aarch64__
>> +#include "arm_neon.h"
>
> A better test is __NEON__, which asserts that neon is available at compile
> time (which will be true basically always for aarch64), and then you don't
> need a runtime test for neon.

You don't need a runtime test for neon on aarch64 anyway, because it
will always be present.

> You also get support for armv7 with neon.

But if you do care about armv7 then you do need a runtime test,
because the de facto standard compile options are for armhf, which has
FP but doesn't assume Neon.

Personally I think we should not worry about armv7 here, because it's
not actually a likely virtualization server platform, and we shouldn't
include code in QEMU we're not even compile testing. So I think
__aarch64__ here is fine.

thanks
-- PMM
diff --git a/util/cutils.c b/util/cutils.c
index 43d1afb..bb61c91 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -352,6 +352,80 @@ static void *can_use_buffer_find_nonzero_offset_ifunc(void)
     return func;
 }
 #pragma GCC pop_options
+
+#elif defined __aarch64__
+#include "arm_neon.h"
+
+#define NEON_VECTYPE uint64x2_t
+#define NEON_LOAD_N_ORR(v1, v2) (vld1q_u64(&v1) | vld1q_u64(&v2))
+#define NEON_ORR(v1, v2) ((v1) | (v2))
+#define NEON_NOT_EQ_ZERO(v1) \
+    ((vgetq_lane_u64(v1, 0) != 0) || (vgetq_lane_u64(v1, 1) != 0))
+
+#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
+
+/*
+ * Zero page/buffer checking using SIMD(Neon)
+ */
+
+static bool
+can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
+                   * sizeof(NEON_VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
+}
+
+static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
+{
+    size_t i;
+    NEON_VECTYPE qword0, qword1, qword2, qword3, qword4, qword5, qword6;
+    uint64_t const *data = buf;
+
+    if (!len) {
+        return 0;
+    }
+
+    assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
+    len /= sizeof(unsigned long);
+
+    for (i = 0; i < len; i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON) {
+        qword0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
+        qword1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
+        qword2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
+        qword3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
+        qword4 = NEON_ORR(qword0, qword1);
+        qword5 = NEON_ORR(qword2, qword3);
+        qword6 = NEON_ORR(qword4, qword5);
+
+        if (NEON_NOT_EQ_ZERO(qword6)) {
+            break;
+        }
+    }
+
+    return i * sizeof(unsigned long);
+}
+
+static inline bool neon_support(void)
+{
+    /*
+     * Check if neon feature is supported.
+     * By default neon is supported for aarch64.
+     */
+    return true;
+}
+
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return neon_support() ? can_use_buffer_find_nonzero_offset_neon(buf, len) :
+           can_use_buffer_find_nonzero_offset_inner(buf, len);
+}
+
+size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return neon_support() ? buffer_find_nonzero_offset_neon(buf, len) :
+           buffer_find_nonzero_offset_inner(buf, len);
+}
 #else
 bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
 {
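For context, a hedged sketch of how this pair of helpers is typically consumed (an illustrative caller, not the actual QEMU call site): check eligibility first, use the vector scan when size and alignment permit, and fall back to a plain byte scan otherwise.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Prototypes as defined in util/cutils.c by this patch. */
    bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
    size_t buffer_find_nonzero_offset(const void *buf, size_t len);

    /* Illustrative caller: the vector scan returns len when no non-zero
     * chunk was found, so equality means the whole buffer is zero. */
    static bool buffer_is_zero_sketch(const void *buf, size_t len)
    {
        const uint8_t *p = buf;
        size_t off;

        if (can_use_buffer_find_nonzero_offset(buf, len)) {
            return buffer_find_nonzero_offset(buf, len) == len;
        }
        for (off = 0; off < len; off++) {
            if (p[off]) {
                return false;
            }
        }
        return true;
    }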