| Message ID | 1459777195-7907-3-git-send-email-vijayak@caviumnetworks.com (mailing list archive) |
|---|---|
| State | New, archived |
On 4 April 2016 at 14:39, <vijayak@caviumnetworks.com> wrote:
> From: Vijay <vijayak@cavium.com>
>
> Use Neon instructions to perform zero checking of
> buffer. This helps in reducing downtime during
> live migration.
>
> Signed-off-by: Vijaya Kumar K <vijayak@caviumnetworks.com>
> ---
>  util/cutils.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 81 insertions(+)
>
> diff --git a/util/cutils.c b/util/cutils.c
> index 43d1afb..d343b9a 100644
> --- a/util/cutils.c
> +++ b/util/cutils.c
> @@ -352,6 +352,87 @@ static void *can_use_buffer_find_nonzero_offset_ifunc(void)
>      return func;
>  }
>  #pragma GCC pop_options
> +
> +#elif defined __aarch64__
> +#include "arm_neon.h"

Can we rely on all compilers having this, or do we need to
test in configure?

> +
> +#define NEON_VECTYPE uint64x2_t
> +#define NEON_LOAD_N_ORR(v1, v2) vorrq_u64(vld1q_u64(&v1), vld1q_u64(&v2))
> +#define NEON_ORR(v1, v2) vorrq_u64(v1, v2)
> +#define NEON_EQ_ZERO(v1) \
> +    ((vgetq_lane_u64(vceqzq_u64(v1), 0) == 0) || \
> +     (vgetq_lane_u64(vceqzq_u64(v1), 1)) == 0)

The intrinsics are a bit confusing, but shouldn't we be
testing that both lanes of v1 are 0, rather than whether
either of them is? (so "&&", not "||").

> +
> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
> +
> +/*
> + * Zero page/buffer checking using SIMD(Neon)
> + */
> +
> +static bool
> +can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
> +                   * sizeof(NEON_VECTYPE)) == 0
> +            && ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
> +}
> +
> +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +    size_t i;
> +    NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6;
> +    NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14;
> +    uint64_t const *data = buf;
> +
> +    assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
> +    len /= sizeof(unsigned long);
> +
> +    for (i = 0; i < len; i += 32) {
> +        d0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
> +        d1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
> +        d2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
> +        d3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
> +        d4 = NEON_ORR(d0, d1);
> +        d5 = NEON_ORR(d2, d3);
> +        d6 = NEON_ORR(d4, d5);
> +
> +        d7 = NEON_LOAD_N_ORR(data[i + 16], data[i + 18]);
> +        d8 = NEON_LOAD_N_ORR(data[i + 20], data[i + 22]);
> +        d9 = NEON_LOAD_N_ORR(data[i + 24], data[i + 26]);
> +        d10 = NEON_LOAD_N_ORR(data[i + 28], data[i + 30]);
> +        d11 = NEON_ORR(d7, d8);
> +        d12 = NEON_ORR(d9, d10);
> +        d13 = NEON_ORR(d11, d12);
> +
> +        d14 = NEON_ORR(d6, d13);
> +        if (NEON_EQ_ZERO(d14)) {
> +            break;
> +        }
> +    }

Both the other optimised find_nonzero implementations in this
file have two loops, not just one. Is it OK that this
implementation has only a single loop?

Paolo: do you know why we have two loops in the other
implementations?

> +
> +    return i * sizeof(unsigned long);
> +}
> +
> +static inline bool neon_support(void)
> +{
> +    /*
> +     * Check if neon feature is supported.
> +     * By default neon is supported for aarch64.
> +     */
> +    return true;
> +}

There doesn't seem much point in this. We can assume Neon exists
on any CPU we're going to run on (it's part of the ABI, the kernel
assumes it, etc etc). So you can just implement the functions without
the indirection functions below.

> +
> +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    return neon_support() ? can_use_buffer_find_nonzero_offset_neon(buf, len) :
> +                            can_use_buffer_find_nonzero_offset_inner(buf, len);
> +}
> +
> +size_t buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    return neon_support() ? buffer_find_nonzero_offset_neon(buf, len) :
> +                            buffer_find_nonzero_offset_inner(buf, len);
> +}
>  #else
>  bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
>  {
> --

thanks
-- PMM
On 05/04/2016 16:36, Peter Maydell wrote:
>> > +
>> > +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
>> > +
>> > +/*
>> > + * Zero page/buffer checking using SIMD(Neon)
>> > + */
>> > +
>> > +static bool
>> > +can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
>> > +{
>> > +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
>> > +                   * sizeof(NEON_VECTYPE)) == 0
>> > +            && ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
>> > +}
>> > +
>> > +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
>> > +{
>> > +    size_t i;
>> > +    NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6;
>> > +    NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14;
>> > +    uint64_t const *data = buf;
>> > +
>> > +    assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
>> > +    len /= sizeof(unsigned long);
>> > +
>> > +    for (i = 0; i < len; i += 32) {
>> > +        d0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
>> > +        d1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
>> > +        d2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
>> > +        d3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
>> > +        d4 = NEON_ORR(d0, d1);
>> > +        d5 = NEON_ORR(d2, d3);
>> > +        d6 = NEON_ORR(d4, d5);
>> > +
>> > +        d7 = NEON_LOAD_N_ORR(data[i + 16], data[i + 18]);
>> > +        d8 = NEON_LOAD_N_ORR(data[i + 20], data[i + 22]);
>> > +        d9 = NEON_LOAD_N_ORR(data[i + 24], data[i + 26]);
>> > +        d10 = NEON_LOAD_N_ORR(data[i + 28], data[i + 30]);
>> > +        d11 = NEON_ORR(d7, d8);
>> > +        d12 = NEON_ORR(d9, d10);
>> > +        d13 = NEON_ORR(d11, d12);
>> > +
>> > +        d14 = NEON_ORR(d6, d13);
>> > +        if (NEON_EQ_ZERO(d14)) {
>> > +            break;
>> > +        }
>> > +    }
> Both the other optimised find_nonzero implementations in this
> file have two loops, not just one. Is it OK that this
> implementation has only a single loop?
>
> Paolo: do you know why we have two loops in the other
> implementations?

Because usually the first one or two iterations are enough to exit the
function if the page is nonzero. It's measurably slower to go through
the unrolled loop in that case.

On the other hand, once the first few iterations found only zero bytes,
the buffer is very likely entirely zero and the unrolled loop helps.

But in theory it should be enough to add a new #elif branch like this:

    #include "arm_neon.h"
    #define VECTYPE uint64x2_t
    #define VEC_OR(a, b) ((a) | (b))
    #define ALL_EQ(a, b) /* ??? :) */

around the /* vector definitions */ comment in util/cutils.c. GCC
should do everything else.

Paolo
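For illustration, the two-loop shape Paolo describes might look like the
sketch below. The function name, the early-exit count and the unroll width
are made up for the example, and this is not the actual util/cutils.c code
(buffer_find_nonzero_offset_inner() is the real generic path there):

```c
#include <stddef.h>
#include <stdio.h>

/* Sketch: bail out of nonzero pages quickly with a cheap first loop,
 * then switch to a wide unrolled loop once the page looks all-zero. */
static size_t find_nonzero_two_loops(const unsigned long *p, size_t words)
{
    size_t i = 0;

    /* First loop: a few single-word checks, so a page that starts with
     * nonzero data returns almost immediately. */
    for (; i < 8 && i < words; i++) {
        if (p[i]) {
            return i * sizeof(unsigned long);
        }
    }

    /* Second loop: probably an all-zero page, so OR eight words at a time
     * and test the accumulator only once per block. */
    for (; i + 8 <= words; i += 8) {
        unsigned long acc = p[i]     | p[i + 1] | p[i + 2] | p[i + 3]
                          | p[i + 4] | p[i + 5] | p[i + 6] | p[i + 7];
        if (acc) {
            break;
        }
    }
    return i * sizeof(unsigned long);
}

int main(void)
{
    unsigned long page[512] = { 0 };    /* 4 KiB of zeros on LP64 */

    page[300] = 1;
    printf("first nonzero block at byte offset %zu\n",
           find_nonzero_two_loops(page, 512));
    return 0;
}
```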
On 4 April 2016 at 14:39, <vijayak@caviumnetworks.com> wrote:
> From: Vijay <vijayak@cavium.com>
>
> Use Neon instructions to perform zero checking of
> buffer. This helps in reducing downtime during
> live migration.

One other comment I forgot:

> +#define NEON_VECTYPE uint64x2_t

This is a 128-bit type...

> +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +    size_t i;
> +    NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6;
> +    NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14;

...so it's a bit confusing to use d0, d1, etc, which implies a
64-bit value.

thanks
-- PMM
On 5 April 2016 at 16:21, Paolo Bonzini <pbonzini@redhat.com> wrote:
> But in theory it should be enough to add a new #elif branch like this:
>
>     #include "arm_neon.h"
>     #define VECTYPE uint64x2_t
>     #define VEC_OR(a, b) ((a) | (b))
>     #define ALL_EQ(a, b) /* ??? :) */

#define ALL_EQ(a, b) (vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) && \
                      vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1))

will do I think (probably suboptimal for a true vector compare but
works OK here as we're actually only interested in comparing against
constant zero; the compiler generates "load 64bit value from vector
register to integer; cbnz" for each half of the vector).

Worth benchmarking that (and the variant where we use the C code but
move the loop unrolling out to 16) against the handwritten intrinsics
version.

thanks
-- PMM
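Combining Paolo's suggested definitions with Peter's ALL_EQ, a small
standalone sanity check of that macro set could look like the program below.
It is a sketch only: in the real file these defines would sit in the new
aarch64 #elif branch of util/cutils.c rather than in a standalone program,
and it assumes an AArch64 host compiler whose vector extensions allow `|`
directly on uint64x2_t (which is what Paolo's VEC_OR relies on):

```c
#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

/* The macro set discussed above: '|' works directly on uint64x2_t via the
 * compiler's vector extensions, and ALL_EQ compares the two 64-bit lanes. */
#define VECTYPE        uint64x2_t
#define VEC_OR(v1, v2) ((v1) | (v2))
#define ALL_EQ(v1, v2) \
    (vgetq_lane_u64(v1, 0) == vgetq_lane_u64(v2, 0) && \
     vgetq_lane_u64(v1, 1) == vgetq_lane_u64(v2, 1))

int main(void)
{
    uint64_t zeros[2] = { 0, 0 };
    uint64_t mixed[2] = { 0, 42 };
    VECTYPE vzero = vld1q_u64(zeros);
    VECTYPE vmix = vld1q_u64(mixed);

    assert(ALL_EQ(VEC_OR(vzero, vzero), vzero));  /* all-zero data */
    assert(!ALL_EQ(VEC_OR(vzero, vmix), vzero));  /* a nonzero lane is caught */
    return 0;
}
```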
On Tue, Apr 5, 2016 at 8:06 PM, Peter Maydell <peter.maydell@linaro.org> wrote:
> On 4 April 2016 at 14:39, <vijayak@caviumnetworks.com> wrote:
>> From: Vijay <vijayak@cavium.com>
>>
>> Use Neon instructions to perform zero checking of
>> buffer. This helps in reducing downtime during
>> live migration.
>>
>> Signed-off-by: Vijaya Kumar K <vijayak@caviumnetworks.com>
>> ---
>>  util/cutils.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 81 insertions(+)
>>
>> diff --git a/util/cutils.c b/util/cutils.c
>> index 43d1afb..d343b9a 100644
>> --- a/util/cutils.c
>> +++ b/util/cutils.c
>> @@ -352,6 +352,87 @@ static void *can_use_buffer_find_nonzero_offset_ifunc(void)
>>      return func;
>>  }
>>  #pragma GCC pop_options
>> +
>> +#elif defined __aarch64__
>> +#include "arm_neon.h"
>
> Can we rely on all compilers having this, or do we need to
> test in configure?

GCC and armcc support the same intrinsics. Both need inclusion of
arm_neon.h.

>
>> +
>> +#define NEON_VECTYPE uint64x2_t
>> +#define NEON_LOAD_N_ORR(v1, v2) vorrq_u64(vld1q_u64(&v1), vld1q_u64(&v2))
>> +#define NEON_ORR(v1, v2) vorrq_u64(v1, v2)
>> +#define NEON_EQ_ZERO(v1) \
>> +    ((vgetq_lane_u64(vceqzq_u64(v1), 0) == 0) || \
>> +     (vgetq_lane_u64(vceqzq_u64(v1), 1)) == 0)
>
> The intrinsics are a bit confusing, but shouldn't we be
> testing that both lanes of v1 are 0, rather than whether
> either of them is? (so "&&", not "||").

The above check is correct. vceqzq() sets all bits to 1 if the value is 0,
so if one lane is 0, it means the buffer is non-zero. I think redefining
this macro as below would be better and would avoid vceqzq_u64():

#define NEON_NOT_EQ_ZERO(v1) \
    ((vgetq_lane_u64(v1, 0) != 0) || (vgetq_lane_u64(v1, 1)) != 0)

>
>> +
>> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
>> +
>> +/*
>> + * Zero page/buffer checking using SIMD(Neon)
>> + */
>> +
>> +static bool
>> +can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
>> +{
>> +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
>> +                   * sizeof(NEON_VECTYPE)) == 0
>> +            && ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
>> +}
>> +
>> +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
>> +{
>> +    size_t i;
>> +    NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6;
>> +    NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14;
>> +    uint64_t const *data = buf;
>> +
>> +    assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
>> +    len /= sizeof(unsigned long);
>> +
>> +    for (i = 0; i < len; i += 32) {
>> +        d0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
>> +        d1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
>> +        d2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
>> +        d3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
>> +        d4 = NEON_ORR(d0, d1);
>> +        d5 = NEON_ORR(d2, d3);
>> +        d6 = NEON_ORR(d4, d5);
>> +
>> +        d7 = NEON_LOAD_N_ORR(data[i + 16], data[i + 18]);
>> +        d8 = NEON_LOAD_N_ORR(data[i + 20], data[i + 22]);
>> +        d9 = NEON_LOAD_N_ORR(data[i + 24], data[i + 26]);
>> +        d10 = NEON_LOAD_N_ORR(data[i + 28], data[i + 30]);
>> +        d11 = NEON_ORR(d7, d8);
>> +        d12 = NEON_ORR(d9, d10);
>> +        d13 = NEON_ORR(d11, d12);
>> +
>> +        d14 = NEON_ORR(d6, d13);
>> +        if (NEON_EQ_ZERO(d14)) {
>> +            break;
>> +        }
>> +    }
>
> Both the other optimised find_nonzero implementations in this
> file have two loops, not just one. Is it OK that this
> implementation has only a single loop?
>
> Paolo: do you know why we have two loops in the other
> implementations?

Paolo was right, as he mentioned in the previous email. But with two
loops I don't see much benefit, so I restricted it to one loop.

>
>> +
>> +    return i * sizeof(unsigned long);
>> +}
>> +
>> +static inline bool neon_support(void)
>> +{
>> +    /*
>> +     * Check if neon feature is supported.
>> +     * By default neon is supported for aarch64.
>> +     */
>> +    return true;
>> +}
>
> There doesn't seem much point in this. We can assume Neon exists
> on any CPU we're going to run on (it's part of the ABI, the kernel
> assumes it, etc etc). So you can just implement the functions without
> the indirection functions below.
>

Hmm. One reason was that compilation fails if we don't call the
can_use_buffer_find_nonzero_offset_inner() function from inside the
Neon implementation, so I added this, similar to the Intel AVX2 code.
I also thought that if any platform does not implement Neon, it can
simply change this function to skip the Neon path.

>> +
>> +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
>> +{
>> +    return neon_support() ? can_use_buffer_find_nonzero_offset_neon(buf, len) :
>> +                            can_use_buffer_find_nonzero_offset_inner(buf, len);
>> +}
>> +
>> +size_t buffer_find_nonzero_offset(const void *buf, size_t len)
>> +{
>> +    return neon_support() ? buffer_find_nonzero_offset_neon(buf, len) :
>> +                            buffer_find_nonzero_offset_inner(buf, len);
>> +}
>>  #else
>>  bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
>>  {
>> --
>
> thanks
> -- PMM
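To see the lane semantics Vijay describes in isolation, a tiny standalone
test like the one below shows that both forms report "some lane is non-zero".
The macro bodies are copied from the patch and from the proposed replacement;
the test program itself is illustrative and assumes an AArch64 host:

```c
#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

/* Macro from the patch: vceqzq_u64() turns a zero lane into all-ones, so a
 * zero lane in its result means the corresponding input lane was non-zero.
 * The expression is therefore true when some lane of v1 is non-zero. */
#define NEON_EQ_ZERO(v1) \
    ((vgetq_lane_u64(vceqzq_u64(v1), 0) == 0) || \
     (vgetq_lane_u64(vceqzq_u64(v1), 1)) == 0)

/* Vijay's proposed replacement: test the lanes directly. */
#define NEON_NOT_EQ_ZERO(v1) \
    ((vgetq_lane_u64(v1, 0) != 0) || (vgetq_lane_u64(v1, 1)) != 0)

int main(void)
{
    uint64_t all_zero[2] = { 0, 0 };
    uint64_t one_lane[2] = { 0, 42 };
    uint64x2_t vz = vld1q_u64(all_zero);
    uint64x2_t vn = vld1q_u64(one_lane);

    /* Both forms agree: false for an all-zero vector, true otherwise. */
    assert(!NEON_EQ_ZERO(vz) && !NEON_NOT_EQ_ZERO(vz));
    assert(NEON_EQ_ZERO(vn) && NEON_NOT_EQ_ZERO(vn));
    return 0;
}
```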
Hi Jed,

On Mon, Mar 20, 2017 at 2:35 AM, Wangjintang <wangjintang@huawei.com> wrote:
> Hi,
>
> We see that armv8's prefetch instruction decode have been skipped in
> qemu. For some user, they need prefetch instruction, for example, they
> use qemu to generate the instruction trace. We want to merge this patch
> to community, it's ok or not? Thanks.
>

Your patch seems to be missing. Can you retry with the content of the
patch pasted in the email?

Thanks,
-- Pranith
On 24 March 2017 at 06:14, Wangjintang <wangjintang@huawei.com> wrote:
> Hi Pranith,
>
> Thanks for your reply. patch as below, new added code default is off,
> please review.
> The major thinking is about translate Armv8's prefetch instruction into
> intermediate code, at the same time don't effect the rm/rn register.
>
>
> diff --git a/translate-a64.c b/translate-a64.c
> index 814f30f..86da8ee 100644
> --- a/translate-a64.c
> +++ b/translate-a64.c
> @@ -2061,7 +2061,11 @@ static void disas_ld_lit(DisasContext *s, uint32_t insn)
>      } else {
>          if (opc == 3) {
>              /* PRFM (literal) : prefetch */
> +            #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
> +            ;
> +            #else
>              return;
> +            #endif
>          }

No, these changes look wrong. PRFM instructions do not need to
do anything and should definitely not be emitting any intermediate
code. In particular if you let execution fall through and try
do_gpr_ld() then it will really do a load, which might cause
an exception -- this is specifically forbidden for PRFM.
Architecturally the ARM ARM says "it is valid for the PE to
treat any or all prefetch instructions as a NOP", which is
what QEMU does.

The existing code is correct. In general you should not
expect to be able to deduce the guest instructions from
the intermediate code representation.

thanks
-- PMM
Hi Peter,

More detailed illustration below.

> -----Original Message-----
> From: Peter Maydell [mailto:peter.maydell@linaro.org]
> Sent: Friday, March 24, 2017 6:06 PM
> To: Wangjintang
> Cc: Pranith Kumar; Shlomo Pongratz (A); Wanghaibin (Benjamin); qemu-arm;
> qemu-devel; Ori Chalak (A)
> Subject: Re: [Qemu-arm] [patch 1/1]about armv8's prefetch decode
>
> No, these changes look wrong. PRFM instructions do not need to
> do anything and should definitely not be emitting any intermediate
> code. In particular if you let execution fall through and try
> do_gpr_ld() then it will really do a load, which might cause
> an exception -- this is specifically forbidden for PRFM.
> Architecturally the ARM ARM says "it is valid for the PE to
> treat any or all prefetch instructions as a NOP", which is
> what QEMU does.
>
> The existing code is correct. In general you should not
> expect to be able to deduce the guest instructions from
> the intermediate code representation.

"it is valid for the PE to treat any or all prefetch instructions as a
NOP" is right from the software point of view. The patch treats the
prefetch as a load instruction while not affecting the rm/rt registers.
Only the PRFM instruction is emitted as intermediate code and does a
real load, so we can get the memory address associated with the
prefetch instruction. Because the rm/rt registers are not modified, the
application still runs correctly.

BTW, the newly added code is disabled by default, so it has no effect
on common users.

In our case, we need the full instruction trace and the memory
addresses accessed by ld/st instructions; the trace is the input for a
chip cycle-accurate model, similar to flexus + qemu.
The current code skips generating intermediate code for prefetch
instructions, so we can see the prefetch instruction but cannot get the
memory address it refers to. We have measured that the ratio of
prefetch instructions is about 2%~3% when running Dhrystone in system
mode. The ratio is high.

 ________________                      ________________
|                |                    |                |
|                |                    |                |
|      Qemu      |                    |      chip      |
|                | instruction trace  | cycle-accurate |
|                | -----------------> |     model      |
|                |    memory trace    |                |
|________________|                    |________________|

Ori Chalak explains this as below:

"Indeed, prefetch instruction affects only the micro architecture, and
hence not needed for running correctly the generated code. However, we
developed a performance simulator for a detailed ARMv8 CPU model, and
use Qemu to resolve the functionality. And for this purpose we need to
translate all instructions that may affect the pipeline behavior,
caches, etc. This is not the major usage of Qemu, however there may be
others doing this and it may help them.
http://www.linux-kvm.org/images/4/45/01x09-Christopher_Covington-Using_Upstream_QEMU_for_CASS.pdf"

Best Regards,
Wang jintang / Jed
Huawei Technologies Co., Ltd.
Email: wangjintang@huawei.com
http://www.huawei.com
On 25 March 2017 at 02:22, Wangjintang <wangjintang@huawei.com> wrote:
> The patch treats the prefetch as a load instruction while not affecting
> the rm/rt registers. Only the PRFM instruction is emitted as
> intermediate code and does a real load, so we can get the memory
> address associated with the prefetch instruction. Because the rm/rt
> registers are not modified, the application still runs correctly.

It will still fault if the address is not valid, which is not a
permitted behaviour.

> In our case, we need the full instruction trace and the memory
> addresses accessed by ld/st instructions; the trace is the input for a
> chip cycle-accurate model, similar to flexus + qemu.
> The current code skips generating intermediate code for prefetch
> instructions, so we can see the prefetch instruction but cannot get the
> memory address it refers to.

I understand the use case you would like, but if we want to support
that kind of thing we should do it with a much more significant and
consistent degree of support for tracing of guest code actions, not
with a small ad-hoc change that happens to fix the immediate thing
you're running into for your specific problem.

thanks
-- PMM
diff --git a/util/cutils.c b/util/cutils.c
index 43d1afb..d343b9a 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -352,6 +352,87 @@ static void *can_use_buffer_find_nonzero_offset_ifunc(void)
     return func;
 }
 #pragma GCC pop_options
+
+#elif defined __aarch64__
+#include "arm_neon.h"
+
+#define NEON_VECTYPE uint64x2_t
+#define NEON_LOAD_N_ORR(v1, v2) vorrq_u64(vld1q_u64(&v1), vld1q_u64(&v2))
+#define NEON_ORR(v1, v2) vorrq_u64(v1, v2)
+#define NEON_EQ_ZERO(v1) \
+    ((vgetq_lane_u64(vceqzq_u64(v1), 0) == 0) || \
+     (vgetq_lane_u64(vceqzq_u64(v1), 1)) == 0)
+
+#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
+
+/*
+ * Zero page/buffer checking using SIMD(Neon)
+ */
+
+static bool
+can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
+                   * sizeof(NEON_VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
+}
+
+static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
+{
+    size_t i;
+    NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6;
+    NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14;
+    uint64_t const *data = buf;
+
+    assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
+    len /= sizeof(unsigned long);
+
+    for (i = 0; i < len; i += 32) {
+        d0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
+        d1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
+        d2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
+        d3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
+        d4 = NEON_ORR(d0, d1);
+        d5 = NEON_ORR(d2, d3);
+        d6 = NEON_ORR(d4, d5);
+
+        d7 = NEON_LOAD_N_ORR(data[i + 16], data[i + 18]);
+        d8 = NEON_LOAD_N_ORR(data[i + 20], data[i + 22]);
+        d9 = NEON_LOAD_N_ORR(data[i + 24], data[i + 26]);
+        d10 = NEON_LOAD_N_ORR(data[i + 28], data[i + 30]);
+        d11 = NEON_ORR(d7, d8);
+        d12 = NEON_ORR(d9, d10);
+        d13 = NEON_ORR(d11, d12);
+
+        d14 = NEON_ORR(d6, d13);
+        if (NEON_EQ_ZERO(d14)) {
+            break;
+        }
+    }
+
+    return i * sizeof(unsigned long);
+}
+
+static inline bool neon_support(void)
+{
+    /*
+     * Check if neon feature is supported.
+     * By default neon is supported for aarch64.
+     */
+    return true;
+}
+
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return neon_support() ? can_use_buffer_find_nonzero_offset_neon(buf, len) :
+                            can_use_buffer_find_nonzero_offset_inner(buf, len);
+}
+
+size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return neon_support() ? buffer_find_nonzero_offset_neon(buf, len) :
+                            buffer_find_nonzero_offset_inner(buf, len);
+}
 #else
 bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
 {
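As a starting point for the benchmarking Peter suggests above, a rough
standalone harness along these lines could compare a plain C unrolled scan
with a NEON-intrinsics scan over an all-zero page. The buffer size, iteration
count, file name and both scan loops are illustrative stand-ins rather than
the QEMU implementations; build on an AArch64 host with something like
`gcc -O3 zcheck_bench.c -o zcheck_bench`:

```c
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define BUF_SIZE   4096        /* one guest page */
#define ITERATIONS 200000

/* Plain C scan: OR 16 words per block, test the accumulator once per block. */
static size_t scan_c(const unsigned long *p, size_t words)
{
    size_t i;

    for (i = 0; i + 16 <= words; i += 16) {
        unsigned long acc = 0;
        for (size_t j = 0; j < 16; j++) {
            acc |= p[i + j];
        }
        if (acc) {
            break;
        }
    }
    return i * sizeof(unsigned long);
}

/* NEON scan in the spirit of the patch: OR two 128-bit loads per step. */
static size_t scan_neon(const uint64_t *p, size_t words)
{
    size_t i;

    for (i = 0; i + 4 <= words; i += 4) {
        uint64x2_t v = vorrq_u64(vld1q_u64(&p[i]), vld1q_u64(&p[i + 2]));
        if (vgetq_lane_u64(v, 0) || vgetq_lane_u64(v, 1)) {
            break;
        }
    }
    return i * sizeof(uint64_t);
}

static double elapsed_ms(struct timespec a, struct timespec b)
{
    return (b.tv_sec - a.tv_sec) * 1e3 + (b.tv_nsec - a.tv_nsec) / 1e6;
}

int main(void)
{
    void *buf = aligned_alloc(16, BUF_SIZE);
    struct timespec t0, t1;
    volatile size_t sink = 0;

    memset(buf, 0, BUF_SIZE);          /* all-zero page: the slow path */

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int n = 0; n < ITERATIONS; n++) {
        sink += scan_c(buf, BUF_SIZE / sizeof(unsigned long));
    }
    clock_gettime(CLOCK_MONOTONIC, &t1);
    printf("plain C unrolled x16 : %8.2f ms\n", elapsed_ms(t0, t1));

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int n = 0; n < ITERATIONS; n++) {
        sink += scan_neon(buf, BUF_SIZE / sizeof(uint64_t));
    }
    clock_gettime(CLOCK_MONOTONIC, &t1);
    printf("NEON intrinsics      : %8.2f ms\n", elapsed_ms(t0, t1));

    free(buf);
    return (int)(sink & 1);
}
```

A second run with a byte poked into the start of the buffer would exercise the
early-exit case that motivates the two-loop structure discussed earlier.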