Message ID | 20180215115344.27625-1-tvrtko.ursulin@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Quoting Tvrtko Ursulin (2018-02-15 11:53:44) > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > A subtest to verify that the engine busyness is reported with expected > accuracy on platforms where the feature is available. > > We test three patterns: 2%, 50% and 98% load per engine. > > v2: > * Use spin batch instead of nop calibration. > * Various tweaks. > > v3: > * Change loops to be time based. > * Use __igt_spin_batch_new inside timing sensitive loops. > * Fixed PWM sleep handling. > > v4: > * Use restarting spin batch. > * Calibrate more carefully by looking at the real PWM loop. > > v5: > * Made standalone. > * Better info messages. > * Tweak sleep compensation. > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > --- > tests/perf_pmu.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++++------ > 1 file changed, 174 insertions(+), 18 deletions(-) > > diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c > index a7501ca5f7a4..fa9b54793a52 100644 > --- a/tests/perf_pmu.c > +++ b/tests/perf_pmu.c > @@ -35,6 +35,7 @@ > #include <dirent.h> > #include <time.h> > #include <poll.h> > +#include <sched.h> > > #include "igt.h" > #include "igt_core.h" > @@ -385,6 +386,22 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e, > gem_quiescent_gpu(gem_fd); > } > > +static void > +__submit_spin_batch(int gem_fd, igt_spin_t *spin, > + const struct intel_execution_engine2 *e) > +{ > + struct drm_i915_gem_exec_object2 obj = { > + .handle = spin->handle > + }; > + struct drm_i915_gem_execbuffer2 eb = { > + .buffer_count = 1, > + .buffers_ptr = to_user_pointer(&obj), > + .flags = e2ring(gem_fd, e), > + }; > + > + gem_execbuf(gem_fd, &eb); > +} > + > static void > most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e, > const unsigned int num_engines, unsigned int flags) > @@ -405,15 +422,7 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e, > if (e == e_) { > idle_idx = i; > } else if (spin) { > - struct drm_i915_gem_exec_object2 obj = { > - .handle = spin->handle > - }; > - struct drm_i915_gem_execbuffer2 eb = { > - .buffer_count = 1, > - .buffers_ptr = to_user_pointer(&obj), > - .flags = e2ring(gem_fd, e_), > - }; > - gem_execbuf(gem_fd, &eb); > + __submit_spin_batch(gem_fd, spin, e_); > } else { > spin = igt_spin_batch_new(gem_fd, 0, > e2ring(gem_fd, e_), 0); > @@ -469,15 +478,7 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines, > continue; > > if (spin) { > - struct drm_i915_gem_exec_object2 obj = { > - .handle = spin->handle > - }; > - struct drm_i915_gem_execbuffer2 eb = { > - .buffer_count = 1, > - .buffers_ptr = to_user_pointer(&obj), > - .flags = e2ring(gem_fd, e), > - }; > - gem_execbuf(gem_fd, &eb); > + __submit_spin_batch(gem_fd, spin, e); > } else { > spin = igt_spin_batch_new(gem_fd, 0, > e2ring(gem_fd, e), 0); > @@ -1390,6 +1391,150 @@ test_enable_race(int gem_fd, const struct intel_execution_engine2 *e) > gem_quiescent_gpu(gem_fd); > } > > +static double __error(double val, double ref) > +{ > + igt_assert(ref != 0.0); igt_assert(ref > 1e-5 /* smallval */) ? Pretty sure a negative ref is also cause for confusion :) > + return (100.0 * val / ref) - 100.0; > +} > + > +static void __rearm_spin_batch(igt_spin_t *spin) > +{ > + const uint32_t mi_arb_chk = 0x5 << 23; > + > + *spin->batch = mi_arb_chk; > + __sync_synchronize(); > +} > + > +#define div_round_up(a, b) (((a) + (b) - 1) / (b)) > + > +static void > +accuracy(int gem_fd, const struct intel_execution_engine2 *e, > + unsigned long target_busy_pct) > +{ > + const unsigned int min_test_loops = 7; > + const unsigned long min_test_us = 1e6; > + unsigned long busy_us = 2500; > + unsigned long idle_us = 100 * (busy_us - target_busy_pct * > + busy_us / 100) / target_busy_pct; > + unsigned long pwm_calibration_us; > + unsigned long test_us; > + double busy_r; > + uint64_t val[2]; > + uint64_t ts[2]; > + int fd; > + > + /* Sampling platforms cannot reach the high accuracy criteria. */ > + igt_require(intel_gen(intel_get_drm_devid(gem_fd)) >= 8); igt_require(gem_has_execlists(gem_fd)); > + > + while (idle_us < 2500) { > + busy_us *= 2; > + idle_us *= 2; > + } > + > + pwm_calibration_us = min_test_loops * (busy_us + idle_us); > + while (pwm_calibration_us < min_test_us) > + pwm_calibration_us += busy_us + idle_us; > + test_us = min_test_loops * (idle_us + busy_us); > + while (test_us < min_test_us) > + test_us += busy_us + idle_us; > + > + igt_info("calibration=%luus, test=%luus; busy=%luus, idle=%luus\n", > + pwm_calibration_us, test_us, busy_us, idle_us); Would also be nice to show the adjusted %%. > + assert_within_epsilon((double)busy_us / (busy_us + idle_us), > + (double)target_busy_pct / 100.0, tolerance); > + > + /* Emit PWM pattern on the engine from a child. */ > + igt_fork(child, 1) { > + struct sched_param rt = { .sched_priority = 99 }; > + const unsigned long timeout[] = { pwm_calibration_us * 1000, > + test_us * 2 * 1000 }; > + unsigned long sleep_busy = busy_us; > + unsigned long sleep_idle = idle_us; > + igt_spin_t *spin; > + > + /* We need the best sleep accuracy we can get. */ > + igt_require(sched_setscheduler(0, > + SCHED_FIFO | SCHED_RESET_ON_FORK, > + &rt) == 0); Can't use igt_require() or igt_assert() from children. So just igt_warn if not applied. Just SCHED_FIFO is enough as the child doesn't/shouldn't fork. > + > + /* Allocate our spin batch and idle it. */ > + spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0); > + igt_spin_batch_end(spin); > + gem_sync(gem_fd, spin->handle); > + > + /* 1st pass is calibration, second pass is the test. */ > + for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) { > + unsigned long busy_ns = 0, idle_ns = 0; > + struct timespec test_start = { }; > + unsigned long loops = 0; > + double err_busy, err_idle; > + > + igt_nsec_elapsed(&test_start); > + do { > + struct timespec t_busy = { }; > + > + igt_nsec_elapsed(&t_busy); > + > + /* Restart the spinbatch. */ > + __rearm_spin_batch(spin); > + __submit_spin_batch(gem_fd, spin, e); > + measured_usleep(sleep_busy); > + igt_spin_batch_end(spin); > + gem_sync(gem_fd, spin->handle); > + > + busy_ns += igt_nsec_elapsed(&t_busy); > + > + idle_ns += measured_usleep(sleep_idle); > + > + loops++; > + } while (igt_nsec_elapsed(&test_start) < timeout[pass]); > + > + busy_ns = div_round_up(busy_ns, loops); > + idle_ns = div_round_up(idle_ns, loops); > + > + err_busy = __error(busy_ns / 1000, busy_us); > + err_idle = __error(idle_ns / 1000, idle_us); > + > + igt_info("%u: busy %lu/%lu %.2f%%, idle %lu/%lu %.2f%%\n", > + pass, > + busy_ns / 1000, busy_us, err_busy, > + idle_ns / 1000, idle_us, err_idle); Ok, makes sense. > + > + if (pass == 0) { > + sleep_busy = (double)busy_us - > + (double)busy_us * err_busy / 100.0; > + sleep_idle = (double)idle_us - > + (double)idle_us * err_idle / 100.0; > + igt_info("calibrated sleeps: busy=%lu, idle=%lu\n", > + sleep_busy, sleep_idle); > + } > + } > + > + igt_spin_batch_free(gem_fd, spin); > + } > + > + /* Let the child run. */ > + usleep(pwm_calibration_us * 2); > + > + /* Collect engine busyness for an interesting part of child runtime. */ > + fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance)); > + val[0] = __pmu_read_single(fd, &ts[0]); > + usleep(test_us / 2); > + val[1] = __pmu_read_single(fd, &ts[1]); > + close(fd); > + > + igt_waitchildren(); > + > + busy_r = (double)(val[1] - val[0]) / (ts[1] - ts[0]); > + > + igt_info("error=%.2f%% (%.2f%% vs %lu%%)\n", > + __error(busy_r, target_busy_pct / 100.0), > + busy_r * 100.0, target_busy_pct); > + > + assert_within_epsilon(busy_r, (double)target_busy_pct / 100.0, 0.15); > +} A fine compromise! :) Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> -Chris
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c index a7501ca5f7a4..fa9b54793a52 100644 --- a/tests/perf_pmu.c +++ b/tests/perf_pmu.c @@ -35,6 +35,7 @@ #include <dirent.h> #include <time.h> #include <poll.h> +#include <sched.h> #include "igt.h" #include "igt_core.h" @@ -385,6 +386,22 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e, gem_quiescent_gpu(gem_fd); } +static void +__submit_spin_batch(int gem_fd, igt_spin_t *spin, + const struct intel_execution_engine2 *e) +{ + struct drm_i915_gem_exec_object2 obj = { + .handle = spin->handle + }; + struct drm_i915_gem_execbuffer2 eb = { + .buffer_count = 1, + .buffers_ptr = to_user_pointer(&obj), + .flags = e2ring(gem_fd, e), + }; + + gem_execbuf(gem_fd, &eb); +} + static void most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e, const unsigned int num_engines, unsigned int flags) @@ -405,15 +422,7 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e, if (e == e_) { idle_idx = i; } else if (spin) { - struct drm_i915_gem_exec_object2 obj = { - .handle = spin->handle - }; - struct drm_i915_gem_execbuffer2 eb = { - .buffer_count = 1, - .buffers_ptr = to_user_pointer(&obj), - .flags = e2ring(gem_fd, e_), - }; - gem_execbuf(gem_fd, &eb); + __submit_spin_batch(gem_fd, spin, e_); } else { spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e_), 0); @@ -469,15 +478,7 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines, continue; if (spin) { - struct drm_i915_gem_exec_object2 obj = { - .handle = spin->handle - }; - struct drm_i915_gem_execbuffer2 eb = { - .buffer_count = 1, - .buffers_ptr = to_user_pointer(&obj), - .flags = e2ring(gem_fd, e), - }; - gem_execbuf(gem_fd, &eb); + __submit_spin_batch(gem_fd, spin, e); } else { spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0); @@ -1390,6 +1391,150 @@ test_enable_race(int gem_fd, const struct intel_execution_engine2 *e) gem_quiescent_gpu(gem_fd); } +static double __error(double val, double ref) +{ + igt_assert(ref != 0.0); + return (100.0 * val / ref) - 100.0; +} + +static void __rearm_spin_batch(igt_spin_t *spin) +{ + const uint32_t mi_arb_chk = 0x5 << 23; + + *spin->batch = mi_arb_chk; + __sync_synchronize(); +} + +#define div_round_up(a, b) (((a) + (b) - 1) / (b)) + +static void +accuracy(int gem_fd, const struct intel_execution_engine2 *e, + unsigned long target_busy_pct) +{ + const unsigned int min_test_loops = 7; + const unsigned long min_test_us = 1e6; + unsigned long busy_us = 2500; + unsigned long idle_us = 100 * (busy_us - target_busy_pct * + busy_us / 100) / target_busy_pct; + unsigned long pwm_calibration_us; + unsigned long test_us; + double busy_r; + uint64_t val[2]; + uint64_t ts[2]; + int fd; + + /* Sampling platforms cannot reach the high accuracy criteria. */ + igt_require(intel_gen(intel_get_drm_devid(gem_fd)) >= 8); + + while (idle_us < 2500) { + busy_us *= 2; + idle_us *= 2; + } + + pwm_calibration_us = min_test_loops * (busy_us + idle_us); + while (pwm_calibration_us < min_test_us) + pwm_calibration_us += busy_us + idle_us; + test_us = min_test_loops * (idle_us + busy_us); + while (test_us < min_test_us) + test_us += busy_us + idle_us; + + igt_info("calibration=%luus, test=%luus; busy=%luus, idle=%luus\n", + pwm_calibration_us, test_us, busy_us, idle_us); + + assert_within_epsilon((double)busy_us / (busy_us + idle_us), + (double)target_busy_pct / 100.0, tolerance); + + /* Emit PWM pattern on the engine from a child. */ + igt_fork(child, 1) { + struct sched_param rt = { .sched_priority = 99 }; + const unsigned long timeout[] = { pwm_calibration_us * 1000, + test_us * 2 * 1000 }; + unsigned long sleep_busy = busy_us; + unsigned long sleep_idle = idle_us; + igt_spin_t *spin; + + /* We need the best sleep accuracy we can get. */ + igt_require(sched_setscheduler(0, + SCHED_FIFO | SCHED_RESET_ON_FORK, + &rt) == 0); + + /* Allocate our spin batch and idle it. */ + spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0); + igt_spin_batch_end(spin); + gem_sync(gem_fd, spin->handle); + + /* 1st pass is calibration, second pass is the test. */ + for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) { + unsigned long busy_ns = 0, idle_ns = 0; + struct timespec test_start = { }; + unsigned long loops = 0; + double err_busy, err_idle; + + igt_nsec_elapsed(&test_start); + do { + struct timespec t_busy = { }; + + igt_nsec_elapsed(&t_busy); + + /* Restart the spinbatch. */ + __rearm_spin_batch(spin); + __submit_spin_batch(gem_fd, spin, e); + measured_usleep(sleep_busy); + igt_spin_batch_end(spin); + gem_sync(gem_fd, spin->handle); + + busy_ns += igt_nsec_elapsed(&t_busy); + + idle_ns += measured_usleep(sleep_idle); + + loops++; + } while (igt_nsec_elapsed(&test_start) < timeout[pass]); + + busy_ns = div_round_up(busy_ns, loops); + idle_ns = div_round_up(idle_ns, loops); + + err_busy = __error(busy_ns / 1000, busy_us); + err_idle = __error(idle_ns / 1000, idle_us); + + igt_info("%u: busy %lu/%lu %.2f%%, idle %lu/%lu %.2f%%\n", + pass, + busy_ns / 1000, busy_us, err_busy, + idle_ns / 1000, idle_us, err_idle); + + if (pass == 0) { + sleep_busy = (double)busy_us - + (double)busy_us * err_busy / 100.0; + sleep_idle = (double)idle_us - + (double)idle_us * err_idle / 100.0; + igt_info("calibrated sleeps: busy=%lu, idle=%lu\n", + sleep_busy, sleep_idle); + } + } + + igt_spin_batch_free(gem_fd, spin); + } + + /* Let the child run. */ + usleep(pwm_calibration_us * 2); + + /* Collect engine busyness for an interesting part of child runtime. */ + fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance)); + val[0] = __pmu_read_single(fd, &ts[0]); + usleep(test_us / 2); + val[1] = __pmu_read_single(fd, &ts[1]); + close(fd); + + igt_waitchildren(); + + busy_r = (double)(val[1] - val[0]) / (ts[1] - ts[0]); + + igt_info("error=%.2f%% (%.2f%% vs %lu%%)\n", + __error(busy_r, target_busy_pct / 100.0), + busy_r * 100.0, target_busy_pct); + + assert_within_epsilon(busy_r, (double)target_busy_pct / 100.0, 0.15); +} + igt_main { const unsigned int num_other_metrics = @@ -1418,6 +1563,8 @@ igt_main invalid_init(); for_each_engine_class_instance(fd, e) { + const unsigned int pct[] = { 2, 50, 98 }; + /** * Test that a single engine metric can be initialized or it * is correctly rejected. @@ -1524,6 +1671,15 @@ igt_main */ igt_subtest_f("enable-race-%s", e->name) test_enable_race(fd, e); + + /** + * Check engine busyness accuracy is as expected. + */ + for (i = 0; i < ARRAY_SIZE(pct); i++) { + igt_subtest_f("busy-accuracy-%u-%s", + pct[i], e->name) + accuracy(fd, e, pct[i]); + } } /**