Message ID | 20200504192954.1387-1-luoyonggang@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | ppc: Use hard-float in ppc fp_hlper as early as possible. This would increase the performance better than enable hard-float it in soft-float.c; Just using fadd fsub fmul fdiv as a simple bench demo. With this patch, performance are increased 2x. and 1.3x | expand |
Bench result; orignal: -> FLOPS 3.00 GCC version: 4.3.3 Ops count: 1073217024 Time spent: 27.768 sec MFLOPS: 38.65 FLOPS 3.00 GCC version: 4.3.3 Ops count: 1073217024 Time spent: 28.359 sec MFLOPS: 37.84 soft-hard-float: GCC version: 4.3.3 Ops count: 1073217024 Time spent: 14.874 sec MFLOPS: 72.15 FLOPS 3.00 GCC version: 4.3.3 Ops count: 1073217024 Time spent: 14.249 sec MFLOPS: 75.32 direct-hard-float: -> FLOPS 3.00 GCC version: 4.3.3 Ops count: 1073217024 Time spent: 13.021 sec MFLOPS: 82.42 FLOPS 3.00 GCC version: 4.3.3 Ops count: 1073217024 Time spent: 12.472 sec MFLOPS: 86.05 FLOPS 3.00 GCC version: 4.3.3 Ops count: 1073217024 Time spent: 11.803 sec MFLOPS: 90.93 FLOPS 3.00 GCC version: 4.3.3 Ops count: 1073217024 Time spent: 11.945 sec MFLOPS: 89.85 bench program: ``` #include <stdio.h> #include <stdlib.h> #ifdef __vxworks #include <sys/resource.h> #include <vxworks.h> #include <timers.h> #include <time.h> #elif defined(_MSC_VER) #include <Windows.h> #include <time.h> #else #include <time.h> #endif /* cl -O2 test_flops.c gcc -O2 test_flops.c -o test_flops */ #ifndef DIM #define DIM 1024 const long long int nop = 1073217024; #else #define COUNT long long int nop = 0; #endif void printm(double A[DIM][DIM]) { int i,j; for (i=0; i<DIM; i++) { for (j=0; j<DIM; j++) printf("%6.3f", A[i][j]); printf("\n"); } } void initm(double A[DIM][DIM]) { int i,j; srand(38741); for (i = 0; i < DIM; i++) for (j = 0; j < DIM; j++) A[i][j] = (double)rand() / (double)RAND_MAX - 0.5; } void dge(double A[DIM][DIM]) { int i, j, k; double c; for (k = 1; k < DIM; k++) { for (i = k; i < DIM; i++) { c = A[i][k-1] / A[k-1][k-1]; #ifdef COUNT nop += 1; #endif for (j = 0; j < DIM; j++) { A[i][j] -= c * A[k-1][j]; #ifdef COUNT nop += 2; #endif } } } } double X[DIM][DIM]; /* * return a timestamp with sub-second precision * QueryPerformanceCounter and clock_gettime have an undefined starting point (null/zero) * and can wrap around, i.e. be nulled again. */ double get_seconds() { #ifdef _MSC_VER static LARGE_INTEGER frequency; if (frequency.QuadPart == 0) QueryPerformanceFrequency(&frequency); LARGE_INTEGER now; QueryPerformanceCounter(&now); return (now.QuadPart * 1.0) / frequency.QuadPart; #else struct timespec now; clock_gettime(CLOCK_REALTIME, &now); return now.tv_sec + now.tv_nsec * 1e-9; #endif } int main (int argc, char **argv) { double a = 1.0; double b = 2.0; double c = a + b; double t; int count = 1; int i; printf("FLOPS %.2lf\n", c); #ifdef _MSC_VER printf("MSC_VER version: %d\n", _MSC_VER); #else printf("GCC version: " __VERSION__ "\n"); #endif initm(X); t = get_seconds(); #ifndef __vxworks if (argc > 1) { sscanf(argv[1], "%d", &count); } #endif for (i = 0; i < count; i += 1) { dge(X); } t = get_seconds() - t; printf("Ops count: %llu\n", nop * count); printf("Time spent: %.3lf sec\n", t); printf("MFLOPS: %.2f\n", 1e-6 * nop * count / t ); #ifdef PRINTM printm(X); #endif return 0; } ``` On Tue, May 5, 2020 at 3:30 AM <luoyonggang@gmail.com> wrote: > From: Yonggang Luo <luoyonggang@gmail.com> > > Just post as an idea to improve PPC fp performance. > With this idea, we have no need to adjust the helper orders. > > Signed-off-by: Yonggang Luo <luoyonggang@gmail.com> > --- > target/ppc/fpu_helper.c | 44 +++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 44 insertions(+) > > diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c > index 2bd49a2cdf..79051e4540 100644 > --- a/target/ppc/fpu_helper.c > +++ b/target/ppc/fpu_helper.c > @@ -926,6 +926,17 @@ static void float_invalid_op_addsub(CPUPPCState *env, > bool set_fpcc, > /* fadd - fadd. */ > float64 helper_fadd(CPUPPCState *env, float64 arg1, float64 arg2) > { > + CPU_DoubleU u1, u2; > + > + u1.d = arg1; > + u2.d = arg2; > + CPU_DoubleU retDouble; > + retDouble.nd = u1.nd + u2.nd; > + if (likely(float64_is_zero_or_normal(retDouble.d))) > + { > + /* TODO: Handling inexact */ > + return retDouble.d; > + } > float64 ret = float64_add(arg1, arg2, &env->fp_status); > int status = get_float_exception_flags(&env->fp_status); > > @@ -941,6 +952,17 @@ float64 helper_fadd(CPUPPCState *env, float64 arg1, > float64 arg2) > /* fsub - fsub. */ > float64 helper_fsub(CPUPPCState *env, float64 arg1, float64 arg2) > { > + CPU_DoubleU u1, u2; > + > + u1.d = arg1; > + u2.d = arg2; > + CPU_DoubleU retDouble; > + retDouble.nd = u1.nd - u2.nd; > + if (likely(float64_is_zero_or_normal(retDouble.d))) > + { > + /* TODO: Handling inexact */ > + return retDouble.d; > + } > float64 ret = float64_sub(arg1, arg2, &env->fp_status); > int status = get_float_exception_flags(&env->fp_status); > > @@ -967,6 +989,17 @@ static void float_invalid_op_mul(CPUPPCState *env, > bool set_fprc, > /* fmul - fmul. */ > float64 helper_fmul(CPUPPCState *env, float64 arg1, float64 arg2) > { > + CPU_DoubleU u1, u2; > + > + u1.d = arg1; > + u2.d = arg2; > + CPU_DoubleU retDouble; > + retDouble.nd = u1.nd * u2.nd; > + if (likely(float64_is_zero_or_normal(retDouble.d))) > + { > + /* TODO: Handling inexact */ > + return retDouble.d; > + } > float64 ret = float64_mul(arg1, arg2, &env->fp_status); > int status = get_float_exception_flags(&env->fp_status); > > @@ -997,6 +1030,17 @@ static void float_invalid_op_div(CPUPPCState *env, > bool set_fprc, > /* fdiv - fdiv. */ > float64 helper_fdiv(CPUPPCState *env, float64 arg1, float64 arg2) > { > + CPU_DoubleU u1, u2; > + > + u1.d = arg1; > + u2.d = arg2; > + CPU_DoubleU retDouble; > + retDouble.nd = u1.nd / u2.nd; > + if (likely(float64_is_zero_or_normal(retDouble.d))) > + { > + /* TODO: Handling inexact */ > + return retDouble.d; > + } > float64 ret = float64_div(arg1, arg2, &env->fp_status); > int status = get_float_exception_flags(&env->fp_status); > > -- > 2.23.0.windows.1 > >
On 5/4/20 12:29 PM, luoyonggang@gmail.com wrote: > Re: [PATCH] ppc: Use hard-float in ppc fp_hlper as early as possible. This would increase the performance better than enable hard-float it in soft-float.c; Just using fadd fsub fmul fdiv as a simple bench demo. With this patch, performance are increased 2x. and 1.3x than the one enable hard-float in soft-float.c Both version are not considerate inexact fp exception yet. Use a return after the one sentence title to separate it from the body of the description. > float64 helper_fadd(CPUPPCState *env, float64 arg1, float64 arg2) > { > + CPU_DoubleU u1, u2; > + > + u1.d = arg1; > + u2.d = arg2; > + CPU_DoubleU retDouble; > + retDouble.nd = u1.nd + u2.nd; > + if (likely(float64_is_zero_or_normal(retDouble.d))) > + { > + /* TODO: Handling inexact */ > + return retDouble.d; > + } First, you need to verify that the current rounding mode is float_round_nearest_even. Otherwise you are actively computing wrong results for other rounding modes. Second, including zero result in your acceptance test misses out on underflow exceptions. Third, what is your plan for inexact? There's no point in continuing this thread unless you fill in the TODO a bit more. https://cafehayek.com/wp-content/uploads/2014/03/miracle_cartoon.jpg r~
пон, 4. мај 2020. у 21:31 <luoyonggang@gmail.com> је написао/ла: > > From: Yonggang Luo <luoyonggang@gmail.com> > > Just post as an idea to improve PPC fp performance. > With this idea, we have no need to adjust the helper orders. > > Signed-off-by: Yonggang Luo <luoyonggang@gmail.com> > --- > target/ppc/fpu_helper.c | 44 +++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 44 insertions(+) > > diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c > index 2bd49a2cdf..79051e4540 100644 > --- a/target/ppc/fpu_helper.c > +++ b/target/ppc/fpu_helper.c > @@ -926,6 +926,17 @@ static void float_invalid_op_addsub(CPUPPCState *env, bool set_fpcc, > /* fadd - fadd. */ > float64 helper_fadd(CPUPPCState *env, float64 arg1, float64 arg2) > { > + CPU_DoubleU u1, u2; > + > + u1.d = arg1; > + u2.d = arg2; > + CPU_DoubleU retDouble; > + retDouble.nd = u1.nd + u2.nd; Besides what Richard mentioned, you neglect here "flush-denormals-to-zero" property of FPUs. You implicitly assume that the host has the same behavior as the target (ppc). But that simply may not be the case, leading to the wrong result. Yours, Aleksandar > + if (likely(float64_is_zero_or_normal(retDouble.d))) > + { > + /* TODO: Handling inexact */ > + return retDouble.d; > + } > float64 ret = float64_add(arg1, arg2, &env->fp_status); > int status = get_float_exception_flags(&env->fp_status); > > @@ -941,6 +952,17 @@ float64 helper_fadd(CPUPPCState *env, float64 arg1, float64 arg2) > /* fsub - fsub. */ > float64 helper_fsub(CPUPPCState *env, float64 arg1, float64 arg2) > { > + CPU_DoubleU u1, u2; > + > + u1.d = arg1; > + u2.d = arg2; > + CPU_DoubleU retDouble; > + retDouble.nd = u1.nd - u2.nd; > + if (likely(float64_is_zero_or_normal(retDouble.d))) > + { > + /* TODO: Handling inexact */ > + return retDouble.d; > + } > float64 ret = float64_sub(arg1, arg2, &env->fp_status); > int status = get_float_exception_flags(&env->fp_status); > > @@ -967,6 +989,17 @@ static void float_invalid_op_mul(CPUPPCState *env, bool set_fprc, > /* fmul - fmul. */ > float64 helper_fmul(CPUPPCState *env, float64 arg1, float64 arg2) > { > + CPU_DoubleU u1, u2; > + > + u1.d = arg1; > + u2.d = arg2; > + CPU_DoubleU retDouble; > + retDouble.nd = u1.nd * u2.nd; > + if (likely(float64_is_zero_or_normal(retDouble.d))) > + { > + /* TODO: Handling inexact */ > + return retDouble.d; > + } > float64 ret = float64_mul(arg1, arg2, &env->fp_status); > int status = get_float_exception_flags(&env->fp_status); > > @@ -997,6 +1030,17 @@ static void float_invalid_op_div(CPUPPCState *env, bool set_fprc, > /* fdiv - fdiv. */ > float64 helper_fdiv(CPUPPCState *env, float64 arg1, float64 arg2) > { > + CPU_DoubleU u1, u2; > + > + u1.d = arg1; > + u2.d = arg2; > + CPU_DoubleU retDouble; > + retDouble.nd = u1.nd / u2.nd; > + if (likely(float64_is_zero_or_normal(retDouble.d))) > + { > + /* TODO: Handling inexact */ > + return retDouble.d; > + } > float64 ret = float64_div(arg1, arg2, &env->fp_status); > int status = get_float_exception_flags(&env->fp_status); > > -- > 2.23.0.windows.1 > >
diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index 2bd49a2cdf..79051e4540 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -926,6 +926,17 @@ static void float_invalid_op_addsub(CPUPPCState *env, bool set_fpcc, /* fadd - fadd. */ float64 helper_fadd(CPUPPCState *env, float64 arg1, float64 arg2) { + CPU_DoubleU u1, u2; + + u1.d = arg1; + u2.d = arg2; + CPU_DoubleU retDouble; + retDouble.nd = u1.nd + u2.nd; + if (likely(float64_is_zero_or_normal(retDouble.d))) + { + /* TODO: Handling inexact */ + return retDouble.d; + } float64 ret = float64_add(arg1, arg2, &env->fp_status); int status = get_float_exception_flags(&env->fp_status); @@ -941,6 +952,17 @@ float64 helper_fadd(CPUPPCState *env, float64 arg1, float64 arg2) /* fsub - fsub. */ float64 helper_fsub(CPUPPCState *env, float64 arg1, float64 arg2) { + CPU_DoubleU u1, u2; + + u1.d = arg1; + u2.d = arg2; + CPU_DoubleU retDouble; + retDouble.nd = u1.nd - u2.nd; + if (likely(float64_is_zero_or_normal(retDouble.d))) + { + /* TODO: Handling inexact */ + return retDouble.d; + } float64 ret = float64_sub(arg1, arg2, &env->fp_status); int status = get_float_exception_flags(&env->fp_status); @@ -967,6 +989,17 @@ static void float_invalid_op_mul(CPUPPCState *env, bool set_fprc, /* fmul - fmul. */ float64 helper_fmul(CPUPPCState *env, float64 arg1, float64 arg2) { + CPU_DoubleU u1, u2; + + u1.d = arg1; + u2.d = arg2; + CPU_DoubleU retDouble; + retDouble.nd = u1.nd * u2.nd; + if (likely(float64_is_zero_or_normal(retDouble.d))) + { + /* TODO: Handling inexact */ + return retDouble.d; + } float64 ret = float64_mul(arg1, arg2, &env->fp_status); int status = get_float_exception_flags(&env->fp_status); @@ -997,6 +1030,17 @@ static void float_invalid_op_div(CPUPPCState *env, bool set_fprc, /* fdiv - fdiv. */ float64 helper_fdiv(CPUPPCState *env, float64 arg1, float64 arg2) { + CPU_DoubleU u1, u2; + + u1.d = arg1; + u2.d = arg2; + CPU_DoubleU retDouble; + retDouble.nd = u1.nd / u2.nd; + if (likely(float64_is_zero_or_normal(retDouble.d))) + { + /* TODO: Handling inexact */ + return retDouble.d; + } float64 ret = float64_div(arg1, arg2, &env->fp_status); int status = get_float_exception_flags(&env->fp_status);