diff mbox series

drm/i915/gem: Allow users to disable waitboost

Message ID 20230920215624.3482244-1-vinay.belgaumkar@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915/gem: Allow users to disable waitboost | expand

Commit Message

Vinay Belgaumkar Sept. 20, 2023, 9:56 p.m. UTC
Provide a bit to disable waitboost while waiting on a gem object.
Waitboost results in increased power consumption by requesting RP0
while waiting for the request to complete. Add a bit in the gem_wait()
IOCTL where this can be disabled.

This is related to the libva API change here -
Link: https://github.com/XinfengZhang/libva/commit/3d90d18c67609a73121bb71b20ee4776b54b61a7

Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_wait.c | 9 ++++++---
 drivers/gpu/drm/i915/i915_request.c      | 3 ++-
 drivers/gpu/drm/i915/i915_request.h      | 1 +
 include/uapi/drm/i915_drm.h              | 1 +
 4 files changed, 10 insertions(+), 4 deletions(-)

Comments

Tvrtko Ursulin Sept. 21, 2023, 10:41 a.m. UTC | #1
On 20/09/2023 22:56, Vinay Belgaumkar wrote:
> Provide a bit to disable waitboost while waiting on a gem object.
> Waitboost results in increased power consumption by requesting RP0
> while waiting for the request to complete. Add a bit in the gem_wait()
> IOCTL where this can be disabled.
> 
> This is related to the libva API change here -
> Link: https://github.com/XinfengZhang/libva/commit/3d90d18c67609a73121bb71b20ee4776b54b61a7

This link does not appear to lead to userspace code using this uapi?

> 
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Signed-off-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_wait.c | 9 ++++++---
>   drivers/gpu/drm/i915/i915_request.c      | 3 ++-
>   drivers/gpu/drm/i915/i915_request.h      | 1 +
>   include/uapi/drm/i915_drm.h              | 1 +
>   4 files changed, 10 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_wait.c b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
> index d4b918fb11ce..955885ec859d 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_wait.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
> @@ -72,7 +72,8 @@ i915_gem_object_wait_reservation(struct dma_resv *resv,
>   	struct dma_fence *fence;
>   	long ret = timeout ?: 1;
>   
> -	i915_gem_object_boost(resv, flags);
> +	if (!(flags & I915_WAITBOOST_DISABLE))
> +		i915_gem_object_boost(resv, flags);
>   
>   	dma_resv_iter_begin(&cursor, resv,
>   			    dma_resv_usage_rw(flags & I915_WAIT_ALL));
> @@ -236,7 +237,7 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   	ktime_t start;
>   	long ret;
>   
> -	if (args->flags != 0)
> +	if (args->flags != 0 || args->flags != I915_GEM_WAITBOOST_DISABLE)
>   		return -EINVAL;
>   
>   	obj = i915_gem_object_lookup(file, args->bo_handle);
> @@ -248,7 +249,9 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   	ret = i915_gem_object_wait(obj,
>   				   I915_WAIT_INTERRUPTIBLE |
>   				   I915_WAIT_PRIORITY |
> -				   I915_WAIT_ALL,
> +				   I915_WAIT_ALL |
> +				   (args->flags & I915_GEM_WAITBOOST_DISABLE ?
> +				    I915_WAITBOOST_DISABLE : 0),
>   				   to_wait_timeout(args->timeout_ns));
>   
>   	if (args->timeout_ns > 0) {
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index f59081066a19..2957409b4b2a 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -2044,7 +2044,8 @@ long i915_request_wait_timeout(struct i915_request *rq,
>   	 * but at a cost of spending more power processing the workload
>   	 * (bad for battery).
>   	 */
> -	if (flags & I915_WAIT_PRIORITY && !i915_request_started(rq))
> +	if (!(flags & I915_WAITBOOST_DISABLE) && (flags & I915_WAIT_PRIORITY) &&
> +	    !i915_request_started(rq))
>   		intel_rps_boost(rq);
>   
>   	wait.tsk = current;
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 0ac55b2e4223..3cc00e8254dc 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -445,6 +445,7 @@ long i915_request_wait(struct i915_request *rq,
>   #define I915_WAIT_INTERRUPTIBLE	BIT(0)
>   #define I915_WAIT_PRIORITY	BIT(1) /* small priority bump for the request */
>   #define I915_WAIT_ALL		BIT(2) /* used by i915_gem_object_wait() */
> +#define I915_WAITBOOST_DISABLE	BIT(3) /* used by i915_gem_object_wait() */
>   
>   void i915_request_show(struct drm_printer *m,
>   		       const struct i915_request *rq,
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 7000e5910a1d..4adee70e39cf 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1928,6 +1928,7 @@ struct drm_i915_gem_wait {
>   	/** Handle of BO we shall wait on */
>   	__u32 bo_handle;
>   	__u32 flags;
> +#define I915_GEM_WAITBOOST_DISABLE      (1u<<0)

Probably would be good to avoid mentioning waitboost in the uapi since 
so far it wasn't an explicit feature/contract. Something like 
I915_GEM_WAIT_BACKGROUND_PRIORITY? Low priority?

I also wonder if there could be a possible angle to help Rob (+cc) 
upstream the syncobj/fence deadline code if our media driver might make 
use of that somehow.

Like if either we could wire up the deadline into GEM_WAIT (in a 
backward compatible manner), or if media could use sync fd wait instead. 
Assuming they have an out fence already, which may not be true.

Regards,

Tvrtko

>   	/** Number of nanoseconds to wait, Returns time remaining. */
>   	__s64 timeout_ns;
>   };
kernel test robot Sept. 26, 2023, 2:58 a.m. UTC | #2
Hello,

kernel test robot noticed a -3.2% regression of phoronix-test-suite.paraview.WaveletContour.1024x768.mipolys___sec on:


commit: 54fef7ea35dadd66193b98805b0bc42ef2b279db ("[PATCH] drm/i915/gem: Allow users to disable waitboost")
url: https://github.com/intel-lab-lkp/linux/commits/Vinay-Belgaumkar/drm-i915-gem-Allow-users-to-disable-waitboost/20230921-060357
base: git://anongit.freedesktop.org/drm-intel for-linux-next
patch link: https://lore.kernel.org/all/20230920215624.3482244-1-vinay.belgaumkar@intel.com/
patch subject: [PATCH] drm/i915/gem: Allow users to disable waitboost

testcase: phoronix-test-suite
test machine: 12 threads 1 sockets Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz (Coffee Lake) with 32G memory
parameters:

	need_x: true
	test: paraview-1.0.2
	option_a: Wavelet Contour
	option_b: 1024 x 768
	cpufreq_governor: performance


In addition to that, the commit also has significant impact on the following tests:

+------------------+----------------------------------------------------------------------------------------------------------------+
| testcase: change | phoronix-test-suite: phoronix-test-suite.x11perf.PutImageXY500x500Square.operations___second 12.8% improvement |
| test machine     | 12 threads 1 sockets Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz (Coffee Lake) with 32G memory                     |
| test parameters  | cpufreq_governor=performance                                                                                   |
|                  | need_x=true                                                                                                    |
|                  | option_a=PutImage XY 500x500 Square                                                                            |
|                  | test=x11perf-1.1.1                                                                                             |
+------------------+----------------------------------------------------------------------------------------------------------------+
| testcase: change | phoronix-test-suite: phoronix-test-suite.openarena.2560x1440.milliseconds -12.2% regression                    |
| test machine     | 12 threads 1 sockets Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz (Coffee Lake) with 32G memory                     |
| test parameters  | cpufreq_governor=performance                                                                                   |
|                  | need_x=true                                                                                                    |
|                  | option_a=2560 x 1440                                                                                           |
|                  | test=openarena-1.5.5                                                                                           |
+------------------+----------------------------------------------------------------------------------------------------------------+


If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202309261055.b74df987-oliver.sang@intel.com


Details are as below:
-------------------------------------------------------------------------------------------------->


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20230926/202309261055.b74df987-oliver.sang@intel.com

=========================================================================================
compiler/cpufreq_governor/kconfig/need_x/option_a/option_b/rootfs/tbox_group/test/testcase:
  gcc-12/performance/x86_64-rhel-8.3/true/Wavelet Contour/1024 x 768/debian-x86_64-phoronix/lkp-cfl-d2/paraview-1.0.2/phoronix-test-suite

commit: 
  16a9359401 ("drm/i915: Implement transcoder LRR for TGL+")
  54fef7ea35 ("drm/i915/gem: Allow users to disable waitboost")

16a9359401edcbc0 54fef7ea35dadd66193b98805b0 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
      0.05 ±  4%      +0.0        0.06 ±  2%  mpstat.cpu.all.soft%
     66.17 ± 60%    +145.6%     162.50 ± 33%  turbostat.C10
     28.61            -3.3%      27.68        phoronix-test-suite.paraview.WaveletContour.1024x768.frames___sec
    298.15            -3.2%     288.49        phoronix-test-suite.paraview.WaveletContour.1024x768.mipolys___sec
    535005            +8.6%     580810        phoronix-test-suite.time.minor_page_faults
      6278            +8.3%       6797        phoronix-test-suite.time.voluntary_context_switches
    801166            +5.6%     845675        proc-vmstat.numa_hit
    799382            +5.1%     840353        proc-vmstat.numa_local
     59648            +2.6%      61211        proc-vmstat.pgactivate
   1539307            +2.7%    1580759        proc-vmstat.pgalloc_normal
    734297            +6.8%     783862        proc-vmstat.pgfault
   1343231            +3.1%    1385353        proc-vmstat.pgfree
     39042            +3.7%      40480 ±  2%  proc-vmstat.pgreuse
 1.106e+08            +2.1%  1.129e+08        perf-stat.i.cache-references
 4.255e+09            +1.9%  4.336e+09        perf-stat.i.cpu-cycles
    147872            +2.4%     151392        perf-stat.i.dTLB-store-misses
    230242            +2.2%     235419        perf-stat.i.iTLB-load-misses
    569455            +2.4%     583234        perf-stat.i.iTLB-loads
      0.35            +1.9%       0.36        perf-stat.i.metric.GHz
     12547            +8.6%      13625        perf-stat.i.minor-faults
    609443 ±  2%      +4.0%     633739 ±  2%  perf-stat.i.node-loads
   1701083 ±  2%      +5.0%    1786794 ±  2%  perf-stat.i.node-stores
     12566            +8.6%      13644        perf-stat.i.page-faults
 1.085e+08            +2.1%  1.108e+08        perf-stat.ps.cache-references
 4.179e+09            +1.9%  4.259e+09        perf-stat.ps.cpu-cycles
    225995            +2.3%     231096        perf-stat.ps.iTLB-load-misses
    558694            +2.4%     572272        perf-stat.ps.iTLB-loads
     12320            +8.6%      13380        perf-stat.ps.minor-faults
    598059 ±  2%      +4.0%     621901 ±  2%  perf-stat.ps.node-loads
   1670891 ±  2%      +5.0%    1754927 ±  2%  perf-stat.ps.node-stores
     12339            +8.6%      13399        perf-stat.ps.page-faults
     14.86 ± 42%     -11.7        3.12 ±152%  perf-profile.calltrace.cycles-pp.cpu_startup_entry.start_secondary.secondary_startup_64_no_verify
     14.86 ± 42%     -11.7        3.12 ±152%  perf-profile.calltrace.cycles-pp.do_idle.cpu_startup_entry.start_secondary.secondary_startup_64_no_verify
     14.86 ± 42%     -11.7        3.12 ±152%  perf-profile.calltrace.cycles-pp.start_secondary.secondary_startup_64_no_verify
     14.86 ± 42%     -11.0        3.82 ±161%  perf-profile.calltrace.cycles-pp.secondary_startup_64_no_verify
     11.53 ± 67%     -10.1        1.39 ±223%  perf-profile.calltrace.cycles-pp.cpuidle_enter.cpuidle_idle_call.do_idle.cpu_startup_entry.start_secondary
     11.53 ± 67%      -9.4        2.08 ±223%  perf-profile.calltrace.cycles-pp.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call.do_idle.cpu_startup_entry
     11.53 ± 67%      -9.1        2.43 ±143%  perf-profile.calltrace.cycles-pp.cpuidle_idle_call.do_idle.cpu_startup_entry.start_secondary.secondary_startup_64_no_verify
      6.02 ± 95%      -5.3        0.70 ±223%  perf-profile.calltrace.cycles-pp.intel_idle.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call.do_idle
      5.51 ± 56%      -4.8        0.70 ±223%  perf-profile.calltrace.cycles-pp.intel_idle_ibrs.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call.do_idle
      7.02 ±128%      -2.6        4.44 ±147%  perf-profile.calltrace.cycles-pp.do_group_exit.get_signal.arch_do_signal_or_restart.exit_to_user_mode_loop.exit_to_user_mode_prepare
      7.02 ±128%      -2.6        4.44 ±147%  perf-profile.calltrace.cycles-pp.do_exit.do_group_exit.get_signal.arch_do_signal_or_restart.exit_to_user_mode_loop
      7.02 ±128%      -2.6        4.44 ±147%  perf-profile.calltrace.cycles-pp.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64.entry_SYSCALL_64_after_hwframe
      7.02 ±128%      -2.6        4.44 ±147%  perf-profile.calltrace.cycles-pp.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64.entry_SYSCALL_64_after_hwframe
      7.02 ±128%      -2.6        4.44 ±147%  perf-profile.calltrace.cycles-pp.arch_do_signal_or_restart.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64
      7.02 ±128%      -2.6        4.44 ±147%  perf-profile.calltrace.cycles-pp.get_signal.arch_do_signal_or_restart.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode
     14.86 ± 42%     -11.7        3.12 ±152%  perf-profile.children.cycles-pp.start_secondary
     14.86 ± 42%     -11.0        3.82 ±161%  perf-profile.children.cycles-pp.secondary_startup_64_no_verify
     14.86 ± 42%     -11.0        3.82 ±161%  perf-profile.children.cycles-pp.cpu_startup_entry
     14.86 ± 42%     -11.0        3.82 ±161%  perf-profile.children.cycles-pp.do_idle
     11.53 ± 67%      -9.4        2.08 ±223%  perf-profile.children.cycles-pp.cpuidle_enter
     11.53 ± 67%      -9.4        2.08 ±223%  perf-profile.children.cycles-pp.cpuidle_enter_state
     11.53 ± 67%      -8.4        3.12 ±152%  perf-profile.children.cycles-pp.cpuidle_idle_call
      6.02 ± 95%      -5.3        0.70 ±223%  perf-profile.children.cycles-pp.intel_idle
      5.51 ± 56%      -4.8        0.70 ±223%  perf-profile.children.cycles-pp.intel_idle_ibrs
      7.02 ±128%      -2.6        4.44 ±147%  perf-profile.children.cycles-pp.exit_to_user_mode_prepare
      7.02 ±128%      -2.6        4.44 ±147%  perf-profile.children.cycles-pp.exit_to_user_mode_loop
      7.02 ±128%      -2.6        4.44 ±147%  perf-profile.children.cycles-pp.arch_do_signal_or_restart
      7.02 ±128%      -2.6        4.44 ±147%  perf-profile.children.cycles-pp.get_signal
      6.02 ± 95%      -5.3        0.70 ±223%  perf-profile.self.cycles-pp.intel_idle
      5.51 ± 56%      -4.8        0.70 ±223%  perf-profile.self.cycles-pp.intel_idle_ibrs


***************************************************************************************************
lkp-cfl-d2: 12 threads 1 sockets Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz (Coffee Lake) with 32G memory
=========================================================================================
compiler/cpufreq_governor/kconfig/need_x/option_a/rootfs/tbox_group/test/testcase:
  gcc-12/performance/x86_64-rhel-8.3/true/PutImage XY 500x500 Square/debian-x86_64-phoronix/lkp-cfl-d2/x11perf-1.1.1/phoronix-test-suite

commit: 
  16a9359401 ("drm/i915: Implement transcoder LRR for TGL+")
  54fef7ea35 ("drm/i915/gem: Allow users to disable waitboost")

16a9359401edcbc0 54fef7ea35dadd66193b98805b0 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
     11950 ±  3%     +93.9%      23173 ±  3%  meminfo.Unevictable
      1.02 ±  3%      -0.6        0.43 ±  3%  mpstat.cpu.all.iowait%
     21273            -5.3%      20141        vmstat.system.in
      4887 ± 32%     -67.7%       1579 ± 32%  phoronix-test-suite.time.involuntary_context_switches
    147212           +26.1%     185677        phoronix-test-suite.time.minor_page_faults
     76.83           +10.6%      85.00        phoronix-test-suite.time.percent_of_cpu_this_job_got
     96.48 ±  4%     +15.0%     110.93 ±  2%  phoronix-test-suite.time.user_time
    106.83           +12.8%     120.50        phoronix-test-suite.x11perf.PutImageXY500x500Square.operations___second
      3.73 ±  3%      -0.6        3.12 ±  2%  turbostat.C1E%
     58316 ±  5%     -20.6%      46326 ±  3%  turbostat.C3
      0.86 ±  2%      -0.1        0.74 ±  3%  turbostat.C3%
      1.24 ±  8%     -18.9%       1.01 ± 10%  turbostat.CPU%c3
     38.67 ±  2%      +5.0       43.70 ±  2%  turbostat.CPUGFX%
     23.44            +4.0%      24.38        turbostat.CorWatt
      2.30 ±  4%     -71.6%       0.65 ±  2%  turbostat.GFXWatt
     26.19            -2.7%      25.49        turbostat.PkgWatt
      1.41            +2.4%       1.44        turbostat.RAMWatt
     61.83 ± 23%     -20.2       41.62 ± 31%  perf-profile.calltrace.cycles-pp.intel_idle_ibrs.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call.do_idle
     61.85 ± 23%     -20.2       41.64 ± 31%  perf-profile.children.cycles-pp.intel_idle_ibrs
      1.24 ± 59%      -1.0        0.23 ± 27%  perf-profile.children.cycles-pp.worker_thread
      0.84 ± 60%      -0.7        0.18 ± 48%  perf-profile.children.cycles-pp.asm_common_interrupt
      0.83 ± 61%      -0.7        0.18 ± 48%  perf-profile.children.cycles-pp.common_interrupt
      0.35 ± 74%      -0.3        0.06 ± 85%  perf-profile.children.cycles-pp.__common_interrupt
      0.35 ± 75%      -0.3        0.06 ± 85%  perf-profile.children.cycles-pp.handle_edge_irq
      0.30 ± 78%      -0.3        0.05 ± 81%  perf-profile.children.cycles-pp.handle_irq_event
      0.37 ± 18%      -0.2        0.18 ± 46%  perf-profile.children.cycles-pp.newidle_balance
      0.12 ± 32%      -0.1        0.04 ± 79%  perf-profile.children.cycles-pp.exit_mm
      0.12 ± 30%      -0.1        0.05 ± 73%  perf-profile.children.cycles-pp.native_apic_msr_eoi
      0.03 ±144%      +0.1        0.10 ± 38%  perf-profile.children.cycles-pp.dma_resv_iter_first_unlocked
      0.04 ±154%      +0.1        0.19 ± 40%  perf-profile.children.cycles-pp.i915_gem_busy_ioctl
     61.82 ± 23%     -20.2       41.63 ± 31%  perf-profile.self.cycles-pp.intel_idle_ibrs
      0.12 ± 26%      -0.1        0.05 ± 74%  perf-profile.self.cycles-pp.native_apic_msr_eoi
     31861            +5.9%      33748        proc-vmstat.nr_active_file
     29007            +7.6%      31200        proc-vmstat.nr_mapped
    183598            +1.6%     186520        proc-vmstat.nr_shmem
      2987 ±  3%     +93.9%       5793 ±  3%  proc-vmstat.nr_unevictable
     31861            +5.9%      33748        proc-vmstat.nr_zone_active_file
      2987 ±  3%     +93.9%       5793 ±  3%  proc-vmstat.nr_zone_unevictable
    721353 ±  2%     +13.3%     816958        proc-vmstat.numa_hit
    721406 ±  2%     +13.2%     816970        proc-vmstat.numa_local
     21301 ±  4%     +13.1%      24099        proc-vmstat.pgactivate
   2459642 ±  3%     +15.5%    2840123 ±  2%  proc-vmstat.pgalloc_normal
    530507 ±  3%      +8.0%     573060        proc-vmstat.pgfault
   2336660 ±  3%     +16.3%    2717873 ±  2%  proc-vmstat.pgfree
      9973 ±  2%    +143.6%      24292 ±  3%  proc-vmstat.unevictable_pgs_culled
      9875          +146.0%      24292 ±  3%  proc-vmstat.unevictable_pgs_rescued
      9876          +146.1%      24304 ±  3%  proc-vmstat.unevictable_pgs_scanned
 1.413e+09           +12.4%  1.588e+09        perf-stat.i.branch-instructions
      2.09            -0.1        1.99 ±  2%  perf-stat.i.branch-miss-rate%
  23305261            +1.8%   23715959        perf-stat.i.branch-misses
      4.57 ±  4%      +1.5        6.06 ±  2%  perf-stat.i.cache-miss-rate%
   4971151 ±  5%     +71.3%    8515308 ±  2%  perf-stat.i.cache-misses
 1.771e+08            +6.4%  1.884e+08        perf-stat.i.cache-references
      0.91            -3.5%       0.88 ±  2%  perf-stat.i.cpi
 4.936e+09            +7.8%  5.319e+09        perf-stat.i.cpu-cycles
     39.95 ±  3%     -15.1%      33.90 ±  2%  perf-stat.i.cpu-migrations
      1671 ±  4%     -34.5%       1095 ±  6%  perf-stat.i.cycles-between-cache-misses
 2.815e+09           +11.8%  3.147e+09        perf-stat.i.dTLB-loads
      0.02 ±  2%      -0.0        0.02 ±  4%  perf-stat.i.dTLB-store-miss-rate%
 6.478e+08           +10.4%  7.153e+08        perf-stat.i.dTLB-stores
     40.15 ±  2%      -2.9       37.29        perf-stat.i.iTLB-load-miss-rate%
 1.063e+10           +12.2%  1.192e+10        perf-stat.i.instructions
     40912 ±  4%     +25.5%      51361 ±  2%  perf-stat.i.instructions-per-iTLB-miss
      2.18            +2.7%       2.23        perf-stat.i.ipc
      0.41            +7.8%       0.44        perf-stat.i.metric.GHz
    421.03           +11.6%     469.88        perf-stat.i.metric.M/sec
      3358            +5.5%       3542        perf-stat.i.minor-faults
      0.00 ± 11%      -0.0        0.00 ±  6%  perf-stat.i.node-load-miss-rate%
      6.40 ±  8%     +17.6%       7.52 ±  5%  perf-stat.i.node-load-misses
    216221 ±  6%     +61.9%     350134        perf-stat.i.node-loads
      0.00 ± 16%      +0.1        0.12 ±220%  perf-stat.i.node-store-miss-rate%
      6.72 ±  6%  +6.9e+12%  4.615e+11 ±223%  perf-stat.i.node-store-misses
    694301 ±  9%     -14.3%     594881 ±  3%  perf-stat.i.node-stores
      3361            +5.5%       3545        perf-stat.i.page-faults
      0.47 ±  5%     +52.6%       0.71 ±  2%  perf-stat.overall.MPKI
      1.65            -0.2        1.49        perf-stat.overall.branch-miss-rate%
      2.81 ±  6%      +1.7        4.52 ±  2%  perf-stat.overall.cache-miss-rate%
      0.46            -4.0%       0.45        perf-stat.overall.cpi
    994.89 ±  5%     -37.2%     624.75 ±  2%  perf-stat.overall.cycles-between-cache-misses
      0.03            -0.0        0.02        perf-stat.overall.dTLB-load-miss-rate%
      0.01            -0.0        0.01        perf-stat.overall.dTLB-store-miss-rate%
     30.99 ±  3%      -2.3       28.68 ±  4%  perf-stat.overall.iTLB-load-miss-rate%
     25510 ± 11%     +20.2%      30672 ±  5%  perf-stat.overall.instructions-per-iTLB-miss
      2.15            +4.1%       2.24        perf-stat.overall.ipc
      0.00 ±  8%      -0.0        0.00 ±  5%  perf-stat.overall.node-load-miss-rate%
      0.00 ±  8%     +16.7       16.67 ±223%  perf-stat.overall.node-store-miss-rate%
 1.403e+09           +12.5%  1.578e+09        perf-stat.ps.branch-instructions
  23153579            +1.7%   23558574        perf-stat.ps.branch-misses
   4938806 ±  5%     +71.3%    8457880 ±  2%  perf-stat.ps.cache-misses
 1.758e+08            +6.4%  1.871e+08        perf-stat.ps.cache-references
 4.901e+09            +7.8%  5.282e+09        perf-stat.ps.cpu-cycles
     39.66 ±  3%     -15.1%      33.67 ±  2%  perf-stat.ps.cpu-migrations
 2.795e+09           +11.8%  3.125e+09        perf-stat.ps.dTLB-loads
 6.432e+08           +10.4%  7.103e+08        perf-stat.ps.dTLB-stores
 1.055e+10           +12.2%  1.184e+10        perf-stat.ps.instructions
      3336            +5.5%       3518        perf-stat.ps.minor-faults
      6.35 ±  8%     +17.6%       7.47 ±  5%  perf-stat.ps.node-load-misses
    214756 ±  6%     +61.9%     347761        perf-stat.ps.node-loads
      6.67 ±  6%  +6.9e+12%  4.577e+11 ±223%  perf-stat.ps.node-store-misses
    689455 ±  9%     -14.3%     590802 ±  3%  perf-stat.ps.node-stores
      3338            +5.5%       3521        perf-stat.ps.page-faults
 1.467e+12 ±  4%     +15.7%  1.698e+12 ±  2%  perf-stat.total.instructions



***************************************************************************************************
lkp-cfl-d2: 12 threads 1 sockets Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz (Coffee Lake) with 32G memory
=========================================================================================
compiler/cpufreq_governor/kconfig/need_x/option_a/rootfs/tbox_group/test/testcase:
  gcc-12/performance/x86_64-rhel-8.3/true/2560 x 1440/debian-x86_64-phoronix/lkp-cfl-d2/openarena-1.5.5/phoronix-test-suite

commit: 
  16a9359401 ("drm/i915: Implement transcoder LRR for TGL+")
  54fef7ea35 ("drm/i915/gem: Allow users to disable waitboost")

16a9359401edcbc0 54fef7ea35dadd66193b98805b0 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
      1459 ± 27%     +34.9%       1967 ± 13%  sched_debug.cpu.curr->pid.max
      1.63            -0.5        1.14 ±  2%  mpstat.cpu.all.iowait%
      1.17            -0.1        1.06        mpstat.cpu.all.irq%
    189.00 ± 11%     -53.6%      87.67 ± 17%  perf-c2c.DRAM.local
     20.67 ± 28%     -87.9%       2.50 ± 72%  perf-c2c.HITM.local
      6016            +5.7%       6358        vmstat.io.bi
     30168            -5.0%      28666        vmstat.system.cs
     27678            -2.7%      26933        vmstat.system.in
    337.58           +13.2%     382.30        phoronix-test-suite.openarena.2560x1440.frames_per_second
      2.95           -12.2%       2.59        phoronix-test-suite.openarena.2560x1440.milliseconds
    191369           +23.8%     236938        phoronix-test-suite.time.minor_page_faults
     51.00            +6.5%      54.33        phoronix-test-suite.time.percent_of_cpu_this_job_got
     56192           -63.1%      20758        phoronix-test-suite.time.voluntary_context_switches
     90640           -64.0%      32673 ±  7%  turbostat.C1
      0.68 ±  2%      -0.3        0.38 ±  5%  turbostat.C1%
     46385           -12.3%      40688 ±  2%  turbostat.C3
      1.72            -0.2        1.51 ±  2%  turbostat.C3%
      3.58 ±  6%     -17.6%       2.95 ± 11%  turbostat.CPU%c3
      5.71           -11.1%       5.08        turbostat.GFXWatt
     29983 ±  4%     -63.0%      11105 ± 24%  turbostat.POLL
      0.06 ±  8%      -0.0        0.01 ± 35%  turbostat.POLL%
      1.31 ± 15%     +70.2%       2.23 ±  7%  turbostat.Pkg%pc2
     29.56            -1.4%      29.14        turbostat.PkgWatt
      2123            -5.7%       2001        proc-vmstat.nr_active_anon
     37869            +2.8%      38912        proc-vmstat.nr_active_file
     22828            -5.8%      21501        proc-vmstat.nr_unevictable
      2123            -5.7%       2001        proc-vmstat.nr_zone_active_anon
     37869            +2.8%      38912        proc-vmstat.nr_zone_active_file
     22828            -5.8%      21501        proc-vmstat.nr_zone_unevictable
    624578            +4.2%     650937        proc-vmstat.numa_hit
    624578            +4.2%     650937        proc-vmstat.numa_local
    771624            +3.5%     798755        proc-vmstat.pgalloc_normal
    429393            +7.2%     460476        proc-vmstat.pgfault
    620059            +4.1%     645242        proc-vmstat.pgfree
      8.52 ±126%      -7.9        0.62 ±223%  perf-profile.calltrace.cycles-pp.wp_page_copy.__handle_mm_fault.handle_mm_fault.do_user_addr_fault.exc_page_fault
      6.09 ±114%      -5.4        0.67 ±223%  perf-profile.calltrace.cycles-pp.intel_idle.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call.do_idle
      3.76 ±110%      -3.8        0.00        perf-profile.calltrace.cycles-pp.error_entry
      8.52 ±126%      -7.9        0.62 ±223%  perf-profile.children.cycles-pp.wp_page_copy
      6.09 ±114%      -5.4        0.67 ±223%  perf-profile.children.cycles-pp.intel_idle
      5.79 ±100%      -4.6        1.24 ±223%  perf-profile.children.cycles-pp.do_filp_open
      5.79 ±100%      -4.6        1.24 ±223%  perf-profile.children.cycles-pp.path_openat
      3.76 ±110%      -3.8        0.00        perf-profile.children.cycles-pp.error_entry
      3.76 ±110%      -2.6        1.11 ±223%  perf-profile.children.cycles-pp.asm_sysvec_apic_timer_interrupt
      3.76 ±110%      -2.6        1.11 ±223%  perf-profile.children.cycles-pp.sysvec_apic_timer_interrupt
      3.76 ±110%      -2.6        1.11 ±223%  perf-profile.children.cycles-pp.__sysvec_apic_timer_interrupt
      3.76 ±110%      -2.6        1.11 ±223%  perf-profile.children.cycles-pp.hrtimer_interrupt
      6.09 ±114%      -5.4        0.67 ±223%  perf-profile.self.cycles-pp.intel_idle
      3.76 ±110%      -3.8        0.00        perf-profile.self.cycles-pp.error_entry
      4.97 ±109%      -2.2        2.78 ±223%  perf-profile.self.cycles-pp.zap_pte_range
  8.07e+08            +2.7%   8.29e+08        perf-stat.i.branch-instructions
  31612011            +4.7%   33095236        perf-stat.i.cache-misses
     31337            -4.0%      30085        perf-stat.i.context-switches
      1.57            +7.7%       1.69 ±  2%  perf-stat.i.cpi
 4.798e+09            +3.2%   4.95e+09        perf-stat.i.cpu-cycles
      0.14 ±  3%      +0.0        0.16 ±  3%  perf-stat.i.dTLB-load-miss-rate%
 1.131e+09            +4.1%  1.177e+09        perf-stat.i.dTLB-loads
      0.03 ±  2%      +0.0        0.03 ±  2%  perf-stat.i.dTLB-store-miss-rate%
    134590            +6.1%     142835        perf-stat.i.dTLB-store-misses
 5.907e+08            +4.5%  6.175e+08        perf-stat.i.dTLB-stores
   3858714            +6.5%    4109072        perf-stat.i.iTLB-loads
 4.646e+09            +3.6%  4.812e+09        perf-stat.i.instructions
      0.40            +3.2%       0.41        perf-stat.i.metric.GHz
    824.17            +6.4%     876.81        perf-stat.i.metric.K/sec
    224.04            +3.7%     232.30        perf-stat.i.metric.M/sec
      5577           +13.3%       6319        perf-stat.i.minor-faults
   1661010            +8.8%    1806501        perf-stat.i.node-loads
   4199874            +5.4%    4426929        perf-stat.i.node-stores
      5582           +13.3%       6325        perf-stat.i.page-faults
    151.84            -1.5%     149.61        perf-stat.overall.cycles-between-cache-misses
 7.953e+08            +2.6%   8.16e+08        perf-stat.ps.branch-instructions
  31131128            +4.6%   32563072        perf-stat.ps.cache-misses
     30863            -4.1%      29604        perf-stat.ps.context-switches
 4.727e+09            +3.1%  4.871e+09        perf-stat.ps.cpu-cycles
 1.114e+09            +4.0%  1.158e+09        perf-stat.ps.dTLB-loads
    132539            +6.0%     140536        perf-stat.ps.dTLB-store-misses
 5.818e+08            +4.4%  6.076e+08        perf-stat.ps.dTLB-stores
   3798756            +6.4%    4042216        perf-stat.ps.iTLB-loads
 4.577e+09            +3.5%  4.736e+09        perf-stat.ps.instructions
      5496           +13.2%       6220        perf-stat.ps.minor-faults
   1635550            +8.7%    1777301        perf-stat.ps.node-loads
   4135301            +5.3%    4355124        perf-stat.ps.node-stores
      5501           +13.2%       6226        perf-stat.ps.page-faults
 3.045e+11            -2.8%  2.961e+11        perf-stat.total.instructions





Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.
Vinay Belgaumkar Sept. 27, 2023, 7:34 p.m. UTC | #3
On 9/21/2023 3:41 AM, Tvrtko Ursulin wrote:
>
> On 20/09/2023 22:56, Vinay Belgaumkar wrote:
>> Provide a bit to disable waitboost while waiting on a gem object.
>> Waitboost results in increased power consumption by requesting RP0
>> while waiting for the request to complete. Add a bit in the gem_wait()
>> IOCTL where this can be disabled.
>>
>> This is related to the libva API change here -
>> Link: 
>> https://github.com/XinfengZhang/libva/commit/3d90d18c67609a73121bb71b20ee4776b54b61a7
>
> This link does not appear to lead to userspace code using this uapi?
We have asked Carl (cc'd) to post a patch for the same.
>
>>
>> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
>> Signed-off-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
>> ---
>>   drivers/gpu/drm/i915/gem/i915_gem_wait.c | 9 ++++++---
>>   drivers/gpu/drm/i915/i915_request.c      | 3 ++-
>>   drivers/gpu/drm/i915/i915_request.h      | 1 +
>>   include/uapi/drm/i915_drm.h              | 1 +
>>   4 files changed, 10 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_wait.c 
>> b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
>> index d4b918fb11ce..955885ec859d 100644
>> --- a/drivers/gpu/drm/i915/gem/i915_gem_wait.c
>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
>> @@ -72,7 +72,8 @@ i915_gem_object_wait_reservation(struct dma_resv 
>> *resv,
>>       struct dma_fence *fence;
>>       long ret = timeout ?: 1;
>>   -    i915_gem_object_boost(resv, flags);
>> +    if (!(flags & I915_WAITBOOST_DISABLE))
>> +        i915_gem_object_boost(resv, flags);
>>         dma_resv_iter_begin(&cursor, resv,
>>                   dma_resv_usage_rw(flags & I915_WAIT_ALL));
>> @@ -236,7 +237,7 @@ i915_gem_wait_ioctl(struct drm_device *dev, void 
>> *data, struct drm_file *file)
>>       ktime_t start;
>>       long ret;
>>   -    if (args->flags != 0)
>> +    if (args->flags != 0 || args->flags != I915_GEM_WAITBOOST_DISABLE)
>>           return -EINVAL;
>>         obj = i915_gem_object_lookup(file, args->bo_handle);
>> @@ -248,7 +249,9 @@ i915_gem_wait_ioctl(struct drm_device *dev, void 
>> *data, struct drm_file *file)
>>       ret = i915_gem_object_wait(obj,
>>                      I915_WAIT_INTERRUPTIBLE |
>>                      I915_WAIT_PRIORITY |
>> -                   I915_WAIT_ALL,
>> +                   I915_WAIT_ALL |
>> +                   (args->flags & I915_GEM_WAITBOOST_DISABLE ?
>> +                    I915_WAITBOOST_DISABLE : 0),
>>                      to_wait_timeout(args->timeout_ns));
>>         if (args->timeout_ns > 0) {
>> diff --git a/drivers/gpu/drm/i915/i915_request.c 
>> b/drivers/gpu/drm/i915/i915_request.c
>> index f59081066a19..2957409b4b2a 100644
>> --- a/drivers/gpu/drm/i915/i915_request.c
>> +++ b/drivers/gpu/drm/i915/i915_request.c
>> @@ -2044,7 +2044,8 @@ long i915_request_wait_timeout(struct 
>> i915_request *rq,
>>        * but at a cost of spending more power processing the workload
>>        * (bad for battery).
>>        */
>> -    if (flags & I915_WAIT_PRIORITY && !i915_request_started(rq))
>> +    if (!(flags & I915_WAITBOOST_DISABLE) && (flags & 
>> I915_WAIT_PRIORITY) &&
>> +        !i915_request_started(rq))
>>           intel_rps_boost(rq);
>>         wait.tsk = current;
>> diff --git a/drivers/gpu/drm/i915/i915_request.h 
>> b/drivers/gpu/drm/i915/i915_request.h
>> index 0ac55b2e4223..3cc00e8254dc 100644
>> --- a/drivers/gpu/drm/i915/i915_request.h
>> +++ b/drivers/gpu/drm/i915/i915_request.h
>> @@ -445,6 +445,7 @@ long i915_request_wait(struct i915_request *rq,
>>   #define I915_WAIT_INTERRUPTIBLE    BIT(0)
>>   #define I915_WAIT_PRIORITY    BIT(1) /* small priority bump for the 
>> request */
>>   #define I915_WAIT_ALL        BIT(2) /* used by 
>> i915_gem_object_wait() */
>> +#define I915_WAITBOOST_DISABLE    BIT(3) /* used by 
>> i915_gem_object_wait() */
>>     void i915_request_show(struct drm_printer *m,
>>                  const struct i915_request *rq,
>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>> index 7000e5910a1d..4adee70e39cf 100644
>> --- a/include/uapi/drm/i915_drm.h
>> +++ b/include/uapi/drm/i915_drm.h
>> @@ -1928,6 +1928,7 @@ struct drm_i915_gem_wait {
>>       /** Handle of BO we shall wait on */
>>       __u32 bo_handle;
>>       __u32 flags;
>> +#define I915_GEM_WAITBOOST_DISABLE      (1u<<0)
>
> Probably would be good to avoid mentioning waitboost in the uapi since 
> so far it wasn't an explicit feature/contract. Something like 
> I915_GEM_WAIT_BACKGROUND_PRIORITY? Low priority?
sure.
>
> I also wonder if there could be a possible angle to help Rob (+cc) 
> upstream the syncobj/fence deadline code if our media driver might 
> make use of that somehow.
>
> Like if either we could wire up the deadline into GEM_WAIT (in a 
> backward compatible manner), or if media could use sync fd wait 
> instead. Assuming they have an out fence already, which may not be true.

Makes sense. We could add a SET_DEADLINE flag or something similar and 
pass in the deadline when appropriate.

Thanks,

Vinay.

>
> Regards,
>
> Tvrtko
>
>>       /** Number of nanoseconds to wait, Returns time remaining. */
>>       __s64 timeout_ns;
>>   };
Tvrtko Ursulin Sept. 28, 2023, 12:48 p.m. UTC | #4
On 27/09/2023 20:34, Belgaumkar, Vinay wrote:
> 
> On 9/21/2023 3:41 AM, Tvrtko Ursulin wrote:
>>
>> On 20/09/2023 22:56, Vinay Belgaumkar wrote:
>>> Provide a bit to disable waitboost while waiting on a gem object.
>>> Waitboost results in increased power consumption by requesting RP0
>>> while waiting for the request to complete. Add a bit in the gem_wait()
>>> IOCTL where this can be disabled.
>>>
>>> This is related to the libva API change here -
>>> Link: 
>>> https://github.com/XinfengZhang/libva/commit/3d90d18c67609a73121bb71b20ee4776b54b61a7
>>
>> This link does not appear to lead to userspace code using this uapi?
> We have asked Carl (cc'd) to post a patch for the same.

Ack.

>>> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
>>> Signed-off-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/gem/i915_gem_wait.c | 9 ++++++---
>>>   drivers/gpu/drm/i915/i915_request.c      | 3 ++-
>>>   drivers/gpu/drm/i915/i915_request.h      | 1 +
>>>   include/uapi/drm/i915_drm.h              | 1 +
>>>   4 files changed, 10 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_wait.c 
>>> b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
>>> index d4b918fb11ce..955885ec859d 100644
>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_wait.c
>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
>>> @@ -72,7 +72,8 @@ i915_gem_object_wait_reservation(struct dma_resv 
>>> *resv,
>>>       struct dma_fence *fence;
>>>       long ret = timeout ?: 1;
>>>   -    i915_gem_object_boost(resv, flags);
>>> +    if (!(flags & I915_WAITBOOST_DISABLE))
>>> +        i915_gem_object_boost(resv, flags);
>>>         dma_resv_iter_begin(&cursor, resv,
>>>                   dma_resv_usage_rw(flags & I915_WAIT_ALL));
>>> @@ -236,7 +237,7 @@ i915_gem_wait_ioctl(struct drm_device *dev, void 
>>> *data, struct drm_file *file)
>>>       ktime_t start;
>>>       long ret;
>>>   -    if (args->flags != 0)
>>> +    if (args->flags != 0 || args->flags != I915_GEM_WAITBOOST_DISABLE)
>>>           return -EINVAL;
>>>         obj = i915_gem_object_lookup(file, args->bo_handle);
>>> @@ -248,7 +249,9 @@ i915_gem_wait_ioctl(struct drm_device *dev, void 
>>> *data, struct drm_file *file)
>>>       ret = i915_gem_object_wait(obj,
>>>                      I915_WAIT_INTERRUPTIBLE |
>>>                      I915_WAIT_PRIORITY |
>>> -                   I915_WAIT_ALL,
>>> +                   I915_WAIT_ALL |
>>> +                   (args->flags & I915_GEM_WAITBOOST_DISABLE ?
>>> +                    I915_WAITBOOST_DISABLE : 0),
>>>                      to_wait_timeout(args->timeout_ns));
>>>         if (args->timeout_ns > 0) {
>>> diff --git a/drivers/gpu/drm/i915/i915_request.c 
>>> b/drivers/gpu/drm/i915/i915_request.c
>>> index f59081066a19..2957409b4b2a 100644
>>> --- a/drivers/gpu/drm/i915/i915_request.c
>>> +++ b/drivers/gpu/drm/i915/i915_request.c
>>> @@ -2044,7 +2044,8 @@ long i915_request_wait_timeout(struct 
>>> i915_request *rq,
>>>        * but at a cost of spending more power processing the workload
>>>        * (bad for battery).
>>>        */
>>> -    if (flags & I915_WAIT_PRIORITY && !i915_request_started(rq))
>>> +    if (!(flags & I915_WAITBOOST_DISABLE) && (flags & 
>>> I915_WAIT_PRIORITY) &&
>>> +        !i915_request_started(rq))
>>>           intel_rps_boost(rq);
>>>         wait.tsk = current;
>>> diff --git a/drivers/gpu/drm/i915/i915_request.h 
>>> b/drivers/gpu/drm/i915/i915_request.h
>>> index 0ac55b2e4223..3cc00e8254dc 100644
>>> --- a/drivers/gpu/drm/i915/i915_request.h
>>> +++ b/drivers/gpu/drm/i915/i915_request.h
>>> @@ -445,6 +445,7 @@ long i915_request_wait(struct i915_request *rq,
>>>   #define I915_WAIT_INTERRUPTIBLE    BIT(0)
>>>   #define I915_WAIT_PRIORITY    BIT(1) /* small priority bump for the 
>>> request */
>>>   #define I915_WAIT_ALL        BIT(2) /* used by 
>>> i915_gem_object_wait() */
>>> +#define I915_WAITBOOST_DISABLE    BIT(3) /* used by 
>>> i915_gem_object_wait() */
>>>     void i915_request_show(struct drm_printer *m,
>>>                  const struct i915_request *rq,
>>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>>> index 7000e5910a1d..4adee70e39cf 100644
>>> --- a/include/uapi/drm/i915_drm.h
>>> +++ b/include/uapi/drm/i915_drm.h
>>> @@ -1928,6 +1928,7 @@ struct drm_i915_gem_wait {
>>>       /** Handle of BO we shall wait on */
>>>       __u32 bo_handle;
>>>       __u32 flags;
>>> +#define I915_GEM_WAITBOOST_DISABLE      (1u<<0)
>>
>> Probably would be good to avoid mentioning waitboost in the uapi since 
>> so far it wasn't an explicit feature/contract. Something like 
>> I915_GEM_WAIT_BACKGROUND_PRIORITY? Low priority?
> sure.
>>
>> I also wonder if there could be a possible angle to help Rob (+cc) 
>> upstream the syncobj/fence deadline code if our media driver might 
>> make use of that somehow.
>>
>> Like if either we could wire up the deadline into GEM_WAIT (in a 
>> backward compatible manner), or if media could use sync fd wait 
>> instead. Assuming they have an out fence already, which may not be true.
> 
> Makes sense. We could add a SET_DEADLINE flag or something similar and 
> pass in the deadline when appropriate.

Rob - do you have time and motivation to think about this angle at all 
currently? If not I guess we just proceed with the new flag for our 
GEM_WAIT.

Regards,

Tvrtko

> 
> Thanks,
> 
> Vinay.
> 
>>
>> Regards,
>>
>> Tvrtko
>>
>>>       /** Number of nanoseconds to wait, Returns time remaining. */
>>>       __s64 timeout_ns;
>>>   };
Rodrigo Vivi Oct. 13, 2023, 8:51 p.m. UTC | #5
On Thu, Sep 28, 2023 at 01:48:34PM +0100, Tvrtko Ursulin wrote:
> 
> On 27/09/2023 20:34, Belgaumkar, Vinay wrote:
> > 
> > On 9/21/2023 3:41 AM, Tvrtko Ursulin wrote:
> > > 
> > > On 20/09/2023 22:56, Vinay Belgaumkar wrote:
> > > > Provide a bit to disable waitboost while waiting on a gem object.
> > > > Waitboost results in increased power consumption by requesting RP0
> > > > while waiting for the request to complete. Add a bit in the gem_wait()
> > > > IOCTL where this can be disabled.
> > > > 
> > > > This is related to the libva API change here -
> > > > Link: https://github.com/XinfengZhang/libva/commit/3d90d18c67609a73121bb71b20ee4776b54b61a7
> > > 
> > > This link does not appear to lead to userspace code using this uapi?
> > We have asked Carl (cc'd) to post a patch for the same.
> 
> Ack.

I'm glad to see that we will have the end-to-end flow of the high-level API.

> 
> > > > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > > > Signed-off-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> > > > ---
> > > >   drivers/gpu/drm/i915/gem/i915_gem_wait.c | 9 ++++++---
> > > >   drivers/gpu/drm/i915/i915_request.c      | 3 ++-
> > > >   drivers/gpu/drm/i915/i915_request.h      | 1 +
> > > >   include/uapi/drm/i915_drm.h              | 1 +
> > > >   4 files changed, 10 insertions(+), 4 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_wait.c
> > > > b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
> > > > index d4b918fb11ce..955885ec859d 100644
> > > > --- a/drivers/gpu/drm/i915/gem/i915_gem_wait.c
> > > > +++ b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
> > > > @@ -72,7 +72,8 @@ i915_gem_object_wait_reservation(struct
> > > > dma_resv *resv,
> > > >       struct dma_fence *fence;
> > > >       long ret = timeout ?: 1;
> > > >   -    i915_gem_object_boost(resv, flags);
> > > > +    if (!(flags & I915_WAITBOOST_DISABLE))
> > > > +        i915_gem_object_boost(resv, flags);
> > > >         dma_resv_iter_begin(&cursor, resv,
> > > >                   dma_resv_usage_rw(flags & I915_WAIT_ALL));
> > > > @@ -236,7 +237,7 @@ i915_gem_wait_ioctl(struct drm_device *dev,
> > > > void *data, struct drm_file *file)
> > > >       ktime_t start;
> > > >       long ret;
> > > >   -    if (args->flags != 0)
> > > > +    if (args->flags != 0 || args->flags != I915_GEM_WAITBOOST_DISABLE)
> > > >           return -EINVAL;
> > > >         obj = i915_gem_object_lookup(file, args->bo_handle);
> > > > @@ -248,7 +249,9 @@ i915_gem_wait_ioctl(struct drm_device *dev,
> > > > void *data, struct drm_file *file)
> > > >       ret = i915_gem_object_wait(obj,
> > > >                      I915_WAIT_INTERRUPTIBLE |
> > > >                      I915_WAIT_PRIORITY |
> > > > -                   I915_WAIT_ALL,
> > > > +                   I915_WAIT_ALL |
> > > > +                   (args->flags & I915_GEM_WAITBOOST_DISABLE ?
> > > > +                    I915_WAITBOOST_DISABLE : 0),
> > > >                      to_wait_timeout(args->timeout_ns));
> > > >         if (args->timeout_ns > 0) {
> > > > diff --git a/drivers/gpu/drm/i915/i915_request.c
> > > > b/drivers/gpu/drm/i915/i915_request.c
> > > > index f59081066a19..2957409b4b2a 100644
> > > > --- a/drivers/gpu/drm/i915/i915_request.c
> > > > +++ b/drivers/gpu/drm/i915/i915_request.c
> > > > @@ -2044,7 +2044,8 @@ long i915_request_wait_timeout(struct
> > > > i915_request *rq,
> > > >        * but at a cost of spending more power processing the workload
> > > >        * (bad for battery).
> > > >        */
> > > > -    if (flags & I915_WAIT_PRIORITY && !i915_request_started(rq))
> > > > +    if (!(flags & I915_WAITBOOST_DISABLE) && (flags &
> > > > I915_WAIT_PRIORITY) &&
> > > > +        !i915_request_started(rq))
> > > >           intel_rps_boost(rq);
> > > >         wait.tsk = current;
> > > > diff --git a/drivers/gpu/drm/i915/i915_request.h
> > > > b/drivers/gpu/drm/i915/i915_request.h
> > > > index 0ac55b2e4223..3cc00e8254dc 100644
> > > > --- a/drivers/gpu/drm/i915/i915_request.h
> > > > +++ b/drivers/gpu/drm/i915/i915_request.h
> > > > @@ -445,6 +445,7 @@ long i915_request_wait(struct i915_request *rq,
> > > >   #define I915_WAIT_INTERRUPTIBLE    BIT(0)
> > > >   #define I915_WAIT_PRIORITY    BIT(1) /* small priority bump
> > > > for the request */
> > > >   #define I915_WAIT_ALL        BIT(2) /* used by
> > > > i915_gem_object_wait() */
> > > > +#define I915_WAITBOOST_DISABLE    BIT(3) /* used by

maybe name it I915_WAIT_NO_BOOST to align a bit better with the existent ones?

> > > > i915_gem_object_wait() */
> > > >     void i915_request_show(struct drm_printer *m,
> > > >                  const struct i915_request *rq,
> > > > diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> > > > index 7000e5910a1d..4adee70e39cf 100644
> > > > --- a/include/uapi/drm/i915_drm.h
> > > > +++ b/include/uapi/drm/i915_drm.h
> > > > @@ -1928,6 +1928,7 @@ struct drm_i915_gem_wait {
> > > >       /** Handle of BO we shall wait on */
> > > >       __u32 bo_handle;
> > > >       __u32 flags;
> > > > +#define I915_GEM_WAITBOOST_DISABLE      (1u<<0)
> > > 
> > > Probably would be good to avoid mentioning waitboost in the uapi
> > > since so far it wasn't an explicit feature/contract. Something like
> > > I915_GEM_WAIT_BACKGROUND_PRIORITY? Low priority?
> > sure.
> > > 
> > > I also wonder if there could be a possible angle to help Rob (+cc)
> > > upstream the syncobj/fence deadline code if our media driver might
> > > make use of that somehow.
> > > 
> > > Like if either we could wire up the deadline into GEM_WAIT (in a
> > > backward compatible manner), or if media could use sync fd wait
> > > instead. Assuming they have an out fence already, which may not be
> > > true.
> > 
> > Makes sense. We could add a SET_DEADLINE flag or something similar and
> > pass in the deadline when appropriate.
> 
> Rob - do you have time and motivation to think about this angle at all
> currently? If not I guess we just proceed with the new flag for our
> GEM_WAIT.

Well, this could be the first user for that uapi that Rob was proposing
indeed.

The downside is probably because we should implement the deadline in i915
and consider all the deadline as 0 (urgent) and boost, unless in this
case where before the gem_wait the UMD would use the set_deadline to
something higher (max?).

Well, if we have a clarity on how to proceed with the deadline we should
probably go there. But for simplicity I would be in favor of this proposed
gem_wait flag as is, because this already solves many real important cases.

> 
> Regards,
> 
> Tvrtko
> 
> > 
> > Thanks,
> > 
> > Vinay.
> > 
> > > 
> > > Regards,
> > > 
> > > Tvrtko
> > > 
> > > >       /** Number of nanoseconds to wait, Returns time remaining. */
> > > >       __s64 timeout_ns;
> > > >   };
Tvrtko Ursulin Oct. 16, 2023, 8:02 a.m. UTC | #6
On 13/10/2023 21:51, Rodrigo Vivi wrote:
> On Thu, Sep 28, 2023 at 01:48:34PM +0100, Tvrtko Ursulin wrote:
>>
>> On 27/09/2023 20:34, Belgaumkar, Vinay wrote:
>>>
>>> On 9/21/2023 3:41 AM, Tvrtko Ursulin wrote:
>>>>
>>>> On 20/09/2023 22:56, Vinay Belgaumkar wrote:
>>>>> Provide a bit to disable waitboost while waiting on a gem object.
>>>>> Waitboost results in increased power consumption by requesting RP0
>>>>> while waiting for the request to complete. Add a bit in the gem_wait()
>>>>> IOCTL where this can be disabled.
>>>>>
>>>>> This is related to the libva API change here -
>>>>> Link: https://github.com/XinfengZhang/libva/commit/3d90d18c67609a73121bb71b20ee4776b54b61a7
>>>>
>>>> This link does not appear to lead to userspace code using this uapi?
>>> We have asked Carl (cc'd) to post a patch for the same.
>>
>> Ack.
> 
> I'm glad to see that we will have the end-to-end flow of the high-level API.
> 
>>
>>>>> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
>>>>> Signed-off-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
>>>>> ---
>>>>>    drivers/gpu/drm/i915/gem/i915_gem_wait.c | 9 ++++++---
>>>>>    drivers/gpu/drm/i915/i915_request.c      | 3 ++-
>>>>>    drivers/gpu/drm/i915/i915_request.h      | 1 +
>>>>>    include/uapi/drm/i915_drm.h              | 1 +
>>>>>    4 files changed, 10 insertions(+), 4 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_wait.c
>>>>> b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
>>>>> index d4b918fb11ce..955885ec859d 100644
>>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_wait.c
>>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
>>>>> @@ -72,7 +72,8 @@ i915_gem_object_wait_reservation(struct
>>>>> dma_resv *resv,
>>>>>        struct dma_fence *fence;
>>>>>        long ret = timeout ?: 1;
>>>>>    -    i915_gem_object_boost(resv, flags);
>>>>> +    if (!(flags & I915_WAITBOOST_DISABLE))
>>>>> +        i915_gem_object_boost(resv, flags);
>>>>>          dma_resv_iter_begin(&cursor, resv,
>>>>>                    dma_resv_usage_rw(flags & I915_WAIT_ALL));
>>>>> @@ -236,7 +237,7 @@ i915_gem_wait_ioctl(struct drm_device *dev,
>>>>> void *data, struct drm_file *file)
>>>>>        ktime_t start;
>>>>>        long ret;
>>>>>    -    if (args->flags != 0)
>>>>> +    if (args->flags != 0 || args->flags != I915_GEM_WAITBOOST_DISABLE)
>>>>>            return -EINVAL;
>>>>>          obj = i915_gem_object_lookup(file, args->bo_handle);
>>>>> @@ -248,7 +249,9 @@ i915_gem_wait_ioctl(struct drm_device *dev,
>>>>> void *data, struct drm_file *file)
>>>>>        ret = i915_gem_object_wait(obj,
>>>>>                       I915_WAIT_INTERRUPTIBLE |
>>>>>                       I915_WAIT_PRIORITY |
>>>>> -                   I915_WAIT_ALL,
>>>>> +                   I915_WAIT_ALL |
>>>>> +                   (args->flags & I915_GEM_WAITBOOST_DISABLE ?
>>>>> +                    I915_WAITBOOST_DISABLE : 0),
>>>>>                       to_wait_timeout(args->timeout_ns));
>>>>>          if (args->timeout_ns > 0) {
>>>>> diff --git a/drivers/gpu/drm/i915/i915_request.c
>>>>> b/drivers/gpu/drm/i915/i915_request.c
>>>>> index f59081066a19..2957409b4b2a 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_request.c
>>>>> +++ b/drivers/gpu/drm/i915/i915_request.c
>>>>> @@ -2044,7 +2044,8 @@ long i915_request_wait_timeout(struct
>>>>> i915_request *rq,
>>>>>         * but at a cost of spending more power processing the workload
>>>>>         * (bad for battery).
>>>>>         */
>>>>> -    if (flags & I915_WAIT_PRIORITY && !i915_request_started(rq))
>>>>> +    if (!(flags & I915_WAITBOOST_DISABLE) && (flags &
>>>>> I915_WAIT_PRIORITY) &&
>>>>> +        !i915_request_started(rq))
>>>>>            intel_rps_boost(rq);
>>>>>          wait.tsk = current;
>>>>> diff --git a/drivers/gpu/drm/i915/i915_request.h
>>>>> b/drivers/gpu/drm/i915/i915_request.h
>>>>> index 0ac55b2e4223..3cc00e8254dc 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_request.h
>>>>> +++ b/drivers/gpu/drm/i915/i915_request.h
>>>>> @@ -445,6 +445,7 @@ long i915_request_wait(struct i915_request *rq,
>>>>>    #define I915_WAIT_INTERRUPTIBLE    BIT(0)
>>>>>    #define I915_WAIT_PRIORITY    BIT(1) /* small priority bump
>>>>> for the request */
>>>>>    #define I915_WAIT_ALL        BIT(2) /* used by
>>>>> i915_gem_object_wait() */
>>>>> +#define I915_WAITBOOST_DISABLE    BIT(3) /* used by
> 
> maybe name it I915_WAIT_NO_BOOST to align a bit better with the existent ones?

I thought it would be better to not mention wait boost in the uapi, but 
leave it as implementation detail.

My suggestion was along the lines of I915_GEM_WAIT_BACKGROUND/IDLE.

In essence saying allowing userspace to say this is not an important 
wait. Yes, it implies that other waits are (more) important, but I think 
this is still better than starting to mention wait boost in the uapi. 
Since that would kind of cement it exists, while we always just viewed 
it as an "go faster" driver internal heuristics and could freely decide 
not to employ it even for default waits.

Historically even we had a period when waits were getting elevated 
scheduling priority. We removed it, would have to dig through git and 
email history to remember exactly why, but probably along the lines that 
it is not always justified. Same as waitboost is not always justified 
and can be harmful.

So I think a generic name for the uapi leaves more freedom for the 
driver. Might be a wrong name that I am suggesting and should be 
something else, not sure.

[Comes back later.]

eec39e441c29 ("drm/i915: Remove wait priority boosting")

So it seems we only removed it because corner cases with the current 
scheduler were hard. Unfortunately improved deadline based scheduler 
never got in despite being ready so we can not restore this now.

>>>>> i915_gem_object_wait() */
>>>>>      void i915_request_show(struct drm_printer *m,
>>>>>                   const struct i915_request *rq,
>>>>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>>>>> index 7000e5910a1d..4adee70e39cf 100644
>>>>> --- a/include/uapi/drm/i915_drm.h
>>>>> +++ b/include/uapi/drm/i915_drm.h
>>>>> @@ -1928,6 +1928,7 @@ struct drm_i915_gem_wait {
>>>>>        /** Handle of BO we shall wait on */
>>>>>        __u32 bo_handle;
>>>>>        __u32 flags;
>>>>> +#define I915_GEM_WAITBOOST_DISABLE      (1u<<0)
>>>>
>>>> Probably would be good to avoid mentioning waitboost in the uapi
>>>> since so far it wasn't an explicit feature/contract. Something like
>>>> I915_GEM_WAIT_BACKGROUND_PRIORITY? Low priority?
>>> sure.
>>>>
>>>> I also wonder if there could be a possible angle to help Rob (+cc)
>>>> upstream the syncobj/fence deadline code if our media driver might
>>>> make use of that somehow.
>>>>
>>>> Like if either we could wire up the deadline into GEM_WAIT (in a
>>>> backward compatible manner), or if media could use sync fd wait
>>>> instead. Assuming they have an out fence already, which may not be
>>>> true.
>>>
>>> Makes sense. We could add a SET_DEADLINE flag or something similar and
>>> pass in the deadline when appropriate.
>>
>> Rob - do you have time and motivation to think about this angle at all
>> currently? If not I guess we just proceed with the new flag for our
>> GEM_WAIT.
> 
> Well, this could be the first user for that uapi that Rob was proposing
> indeed.
> 
> The downside is probably because we should implement the deadline in i915
> and consider all the deadline as 0 (urgent) and boost, unless in this
> case where before the gem_wait the UMD would use the set_deadline to
> something higher (max?).
> 
> Well, if we have a clarity on how to proceed with the deadline we should
> probably go there. But for simplicity I would be in favor of this proposed
> gem_wait flag as is, because this already solves many real important cases.

Yes I don't think we had consensus on the semantics of when no deadline 
is set, so it does sound better to proceed with i915 specific solution 
for now. The two wouldn't be incompatible if deadlines were later added.

Regards,

Tvrtko

> 
>>
>> Regards,
>>
>> Tvrtko
>>
>>>
>>> Thanks,
>>>
>>> Vinay.
>>>
>>>>
>>>> Regards,
>>>>
>>>> Tvrtko
>>>>
>>>>>        /** Number of nanoseconds to wait, Returns time remaining. */
>>>>>        __s64 timeout_ns;
>>>>>    };
Rodrigo Vivi Oct. 16, 2023, 5:58 p.m. UTC | #7
On Mon, Oct 16, 2023 at 09:02:38AM +0100, Tvrtko Ursulin wrote:
> 
> On 13/10/2023 21:51, Rodrigo Vivi wrote:
> > On Thu, Sep 28, 2023 at 01:48:34PM +0100, Tvrtko Ursulin wrote:
> > > 
> > > On 27/09/2023 20:34, Belgaumkar, Vinay wrote:
> > > > 
> > > > On 9/21/2023 3:41 AM, Tvrtko Ursulin wrote:
> > > > > 
> > > > > On 20/09/2023 22:56, Vinay Belgaumkar wrote:
> > > > > > Provide a bit to disable waitboost while waiting on a gem object.
> > > > > > Waitboost results in increased power consumption by requesting RP0
> > > > > > while waiting for the request to complete. Add a bit in the gem_wait()
> > > > > > IOCTL where this can be disabled.
> > > > > > 
> > > > > > This is related to the libva API change here -
> > > > > > Link: https://github.com/XinfengZhang/libva/commit/3d90d18c67609a73121bb71b20ee4776b54b61a7
> > > > > 
> > > > > This link does not appear to lead to userspace code using this uapi?
> > > > We have asked Carl (cc'd) to post a patch for the same.
> > > 
> > > Ack.
> > 
> > I'm glad to see that we will have the end-to-end flow of the high-level API.
> > 
> > > 
> > > > > > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > > > > > Signed-off-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> > > > > > ---
> > > > > >    drivers/gpu/drm/i915/gem/i915_gem_wait.c | 9 ++++++---
> > > > > >    drivers/gpu/drm/i915/i915_request.c      | 3 ++-
> > > > > >    drivers/gpu/drm/i915/i915_request.h      | 1 +
> > > > > >    include/uapi/drm/i915_drm.h              | 1 +
> > > > > >    4 files changed, 10 insertions(+), 4 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_wait.c
> > > > > > b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
> > > > > > index d4b918fb11ce..955885ec859d 100644
> > > > > > --- a/drivers/gpu/drm/i915/gem/i915_gem_wait.c
> > > > > > +++ b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
> > > > > > @@ -72,7 +72,8 @@ i915_gem_object_wait_reservation(struct
> > > > > > dma_resv *resv,
> > > > > >        struct dma_fence *fence;
> > > > > >        long ret = timeout ?: 1;
> > > > > >    -    i915_gem_object_boost(resv, flags);
> > > > > > +    if (!(flags & I915_WAITBOOST_DISABLE))
> > > > > > +        i915_gem_object_boost(resv, flags);
> > > > > >          dma_resv_iter_begin(&cursor, resv,
> > > > > >                    dma_resv_usage_rw(flags & I915_WAIT_ALL));
> > > > > > @@ -236,7 +237,7 @@ i915_gem_wait_ioctl(struct drm_device *dev,
> > > > > > void *data, struct drm_file *file)
> > > > > >        ktime_t start;
> > > > > >        long ret;
> > > > > >    -    if (args->flags != 0)
> > > > > > +    if (args->flags != 0 || args->flags != I915_GEM_WAITBOOST_DISABLE)
> > > > > >            return -EINVAL;
> > > > > >          obj = i915_gem_object_lookup(file, args->bo_handle);
> > > > > > @@ -248,7 +249,9 @@ i915_gem_wait_ioctl(struct drm_device *dev,
> > > > > > void *data, struct drm_file *file)
> > > > > >        ret = i915_gem_object_wait(obj,
> > > > > >                       I915_WAIT_INTERRUPTIBLE |
> > > > > >                       I915_WAIT_PRIORITY |
> > > > > > -                   I915_WAIT_ALL,
> > > > > > +                   I915_WAIT_ALL |
> > > > > > +                   (args->flags & I915_GEM_WAITBOOST_DISABLE ?
> > > > > > +                    I915_WAITBOOST_DISABLE : 0),
> > > > > >                       to_wait_timeout(args->timeout_ns));
> > > > > >          if (args->timeout_ns > 0) {
> > > > > > diff --git a/drivers/gpu/drm/i915/i915_request.c
> > > > > > b/drivers/gpu/drm/i915/i915_request.c
> > > > > > index f59081066a19..2957409b4b2a 100644
> > > > > > --- a/drivers/gpu/drm/i915/i915_request.c
> > > > > > +++ b/drivers/gpu/drm/i915/i915_request.c
> > > > > > @@ -2044,7 +2044,8 @@ long i915_request_wait_timeout(struct
> > > > > > i915_request *rq,
> > > > > >         * but at a cost of spending more power processing the workload
> > > > > >         * (bad for battery).
> > > > > >         */
> > > > > > -    if (flags & I915_WAIT_PRIORITY && !i915_request_started(rq))
> > > > > > +    if (!(flags & I915_WAITBOOST_DISABLE) && (flags &
> > > > > > I915_WAIT_PRIORITY) &&
> > > > > > +        !i915_request_started(rq))
> > > > > >            intel_rps_boost(rq);
> > > > > >          wait.tsk = current;
> > > > > > diff --git a/drivers/gpu/drm/i915/i915_request.h
> > > > > > b/drivers/gpu/drm/i915/i915_request.h
> > > > > > index 0ac55b2e4223..3cc00e8254dc 100644
> > > > > > --- a/drivers/gpu/drm/i915/i915_request.h
> > > > > > +++ b/drivers/gpu/drm/i915/i915_request.h
> > > > > > @@ -445,6 +445,7 @@ long i915_request_wait(struct i915_request *rq,
> > > > > >    #define I915_WAIT_INTERRUPTIBLE    BIT(0)
> > > > > >    #define I915_WAIT_PRIORITY    BIT(1) /* small priority bump
> > > > > > for the request */
> > > > > >    #define I915_WAIT_ALL        BIT(2) /* used by
> > > > > > i915_gem_object_wait() */
> > > > > > +#define I915_WAITBOOST_DISABLE    BIT(3) /* used by
> > 
> > maybe name it I915_WAIT_NO_BOOST to align a bit better with the existent ones?
> 
> I thought it would be better to not mention wait boost in the uapi, but
> leave it as implementation detail.
> 
> My suggestion was along the lines of I915_GEM_WAIT_BACKGROUND/IDLE.

good idea!

> 
> In essence saying allowing userspace to say this is not an important wait.
> Yes, it implies that other waits are (more) important, but I think this is
> still better than starting to mention wait boost in the uapi. Since that
> would kind of cement it exists, while we always just viewed it as an "go
> faster" driver internal heuristics and could freely decide not to employ it
> even for default waits.
> 
> Historically even we had a period when waits were getting elevated
> scheduling priority. We removed it, would have to dig through git and email
> history to remember exactly why, but probably along the lines that it is not
> always justified. Same as waitboost is not always justified and can be
> harmful.
> 
> So I think a generic name for the uapi leaves more freedom for the driver.
> Might be a wrong name that I am suggesting and should be something else, not
> sure.
> 
> [Comes back later.]
> 
> eec39e441c29 ("drm/i915: Remove wait priority boosting")
> 
> So it seems we only removed it because corner cases with the current
> scheduler were hard. Unfortunately improved deadline based scheduler never
> got in despite being ready so we can not restore this now.
> 
> > > > > > i915_gem_object_wait() */
> > > > > >      void i915_request_show(struct drm_printer *m,
> > > > > >                   const struct i915_request *rq,
> > > > > > diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> > > > > > index 7000e5910a1d..4adee70e39cf 100644
> > > > > > --- a/include/uapi/drm/i915_drm.h
> > > > > > +++ b/include/uapi/drm/i915_drm.h
> > > > > > @@ -1928,6 +1928,7 @@ struct drm_i915_gem_wait {
> > > > > >        /** Handle of BO we shall wait on */
> > > > > >        __u32 bo_handle;
> > > > > >        __u32 flags;
> > > > > > +#define I915_GEM_WAITBOOST_DISABLE      (1u<<0)
> > > > > 
> > > > > Probably would be good to avoid mentioning waitboost in the uapi
> > > > > since so far it wasn't an explicit feature/contract. Something like
> > > > > I915_GEM_WAIT_BACKGROUND_PRIORITY? Low priority?
> > > > sure.
> > > > > 
> > > > > I also wonder if there could be a possible angle to help Rob (+cc)
> > > > > upstream the syncobj/fence deadline code if our media driver might
> > > > > make use of that somehow.
> > > > > 
> > > > > Like if either we could wire up the deadline into GEM_WAIT (in a
> > > > > backward compatible manner), or if media could use sync fd wait
> > > > > instead. Assuming they have an out fence already, which may not be
> > > > > true.
> > > > 
> > > > Makes sense. We could add a SET_DEADLINE flag or something similar and
> > > > pass in the deadline when appropriate.
> > > 
> > > Rob - do you have time and motivation to think about this angle at all
> > > currently? If not I guess we just proceed with the new flag for our
> > > GEM_WAIT.
> > 
> > Well, this could be the first user for that uapi that Rob was proposing
> > indeed.
> > 
> > The downside is probably because we should implement the deadline in i915
> > and consider all the deadline as 0 (urgent) and boost, unless in this
> > case where before the gem_wait the UMD would use the set_deadline to
> > something higher (max?).
> > 
> > Well, if we have a clarity on how to proceed with the deadline we should
> > probably go there. But for simplicity I would be in favor of this proposed
> > gem_wait flag as is, because this already solves many real important cases.
> 
> Yes I don't think we had consensus on the semantics of when no deadline is
> set, so it does sound better to proceed with i915 specific solution for now.
> The two wouldn't be incompatible if deadlines were later added.
> 
> Regards,
> 
> Tvrtko
> 
> > 
> > > 
> > > Regards,
> > > 
> > > Tvrtko
> > > 
> > > > 
> > > > Thanks,
> > > > 
> > > > Vinay.
> > > > 
> > > > > 
> > > > > Regards,
> > > > > 
> > > > > Tvrtko
> > > > > 
> > > > > >        /** Number of nanoseconds to wait, Returns time remaining. */
> > > > > >        __s64 timeout_ns;
> > > > > >    };
kernel test robot Oct. 27, 2023, 5:30 a.m. UTC | #8
Hello,

kernel test robot noticed "assertion_failure" on:

commit: 54fef7ea35dadd66193b98805b0bc42ef2b279db ("[PATCH] drm/i915/gem: Allow users to disable waitboost")
url: https://github.com/intel-lab-lkp/linux/commits/Vinay-Belgaumkar/drm-i915-gem-Allow-users-to-disable-waitboost/20230921-060357
base: git://anongit.freedesktop.org/drm-intel for-linux-next
patch link: https://lore.kernel.org/all/20230920215624.3482244-1-vinay.belgaumkar@intel.com/
patch subject: [PATCH] drm/i915/gem: Allow users to disable waitboost

in testcase: igt
version: igt-x86_64-0f075441-1_20230520
with following parameters:

	group: group-23



compiler: gcc-12
test machine: 20 threads 1 sockets (Commet Lake) with 16G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202310271326.abb748d0-oliver.sang@intel.com



user  :notice: [   42.847071] Starting subtest: engines-cleanup

user  :notice: [   42.854776] Starting dynamic subtest: rcs0

user  :notice: [   42.863938] (gem_ctx_persistence:833) CRITICAL: Test assertion failure function test_nonpersistent_cleanup, file ../tests/i915/gem_ctx_persistence.c:283:

user  :notice: [   42.882029] (gem_ctx_persistence:833) CRITICAL: Failed assertion: gem_wait(i915, spin->handle, &timeout) == 0

user  :notice: [   42.895541] (gem_ctx_persistence:833) CRITICAL: error: -22 != 0



The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20231027/202310271326.abb748d0-oliver.sang@intel.com
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_wait.c b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
index d4b918fb11ce..955885ec859d 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_wait.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
@@ -72,7 +72,8 @@  i915_gem_object_wait_reservation(struct dma_resv *resv,
 	struct dma_fence *fence;
 	long ret = timeout ?: 1;
 
-	i915_gem_object_boost(resv, flags);
+	if (!(flags & I915_WAITBOOST_DISABLE))
+		i915_gem_object_boost(resv, flags);
 
 	dma_resv_iter_begin(&cursor, resv,
 			    dma_resv_usage_rw(flags & I915_WAIT_ALL));
@@ -236,7 +237,7 @@  i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	ktime_t start;
 	long ret;
 
-	if (args->flags != 0)
+	if (args->flags != 0 || args->flags != I915_GEM_WAITBOOST_DISABLE)
 		return -EINVAL;
 
 	obj = i915_gem_object_lookup(file, args->bo_handle);
@@ -248,7 +249,9 @@  i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	ret = i915_gem_object_wait(obj,
 				   I915_WAIT_INTERRUPTIBLE |
 				   I915_WAIT_PRIORITY |
-				   I915_WAIT_ALL,
+				   I915_WAIT_ALL |
+				   (args->flags & I915_GEM_WAITBOOST_DISABLE ?
+				    I915_WAITBOOST_DISABLE : 0),
 				   to_wait_timeout(args->timeout_ns));
 
 	if (args->timeout_ns > 0) {
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index f59081066a19..2957409b4b2a 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -2044,7 +2044,8 @@  long i915_request_wait_timeout(struct i915_request *rq,
 	 * but at a cost of spending more power processing the workload
 	 * (bad for battery).
 	 */
-	if (flags & I915_WAIT_PRIORITY && !i915_request_started(rq))
+	if (!(flags & I915_WAITBOOST_DISABLE) && (flags & I915_WAIT_PRIORITY) &&
+	    !i915_request_started(rq))
 		intel_rps_boost(rq);
 
 	wait.tsk = current;
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 0ac55b2e4223..3cc00e8254dc 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -445,6 +445,7 @@  long i915_request_wait(struct i915_request *rq,
 #define I915_WAIT_INTERRUPTIBLE	BIT(0)
 #define I915_WAIT_PRIORITY	BIT(1) /* small priority bump for the request */
 #define I915_WAIT_ALL		BIT(2) /* used by i915_gem_object_wait() */
+#define I915_WAITBOOST_DISABLE	BIT(3) /* used by i915_gem_object_wait() */
 
 void i915_request_show(struct drm_printer *m,
 		       const struct i915_request *rq,
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 7000e5910a1d..4adee70e39cf 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1928,6 +1928,7 @@  struct drm_i915_gem_wait {
 	/** Handle of BO we shall wait on */
 	__u32 bo_handle;
 	__u32 flags;
+#define I915_GEM_WAITBOOST_DISABLE      (1u<<0)
 	/** Number of nanoseconds to wait, Returns time remaining. */
 	__s64 timeout_ns;
 };