| Message ID | 20210429003410.69754-2-umesh.nerlige.ramappa@intel.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | Add support for querying engine cycles |
On 29/04/2021 03:34, Umesh Nerlige Ramappa wrote:
> Perf measurements rely on CPU and engine timestamps to correlate
> events of interest across these time domains. Current mechanisms get
> these timestamps separately, and the calculated delta between them
> lacks sufficient accuracy.
>
> To improve the accuracy of these time measurements to within a few us,
> add a query that returns the engine and cpu timestamps captured as
> close to each other as possible.
>
> v2: (Tvrtko)
> - document clock reference used
> - return cpu timestamp always
> - capture cpu time just before lower dword of cs timestamp
>
> v3: (Chris)
> - use uncore-rpm
> - use __query_cs_timestamp helper
>
> v4: (Lionel)
> - The kernel perf subsystem allows users to specify the clock id to be
>   used in perf_event_open. This clock id is used by the perf subsystem
>   to return the appropriate cpu timestamp in perf events. Similarly,
>   let the user pass the clockid to this query so that the cpu
>   timestamp corresponds to the clock id requested.
>
> v5: (Tvrtko)
> - Use normal ktime accessors instead of fast versions
> - Add more uApi documentation
>
> v6: (Lionel)
> - Move switch out of spinlock
>
> v7: (Chris)
> - cs_timestamp is a misnomer, use cs_cycles instead
> - return the cs cycle frequency as well in the query
>
> v8:
> - Add platform and engine specific checks
>
> v9: (Lionel)
> - Return 2 cpu timestamps in the query - captured before and after the
>   register read
>
> v10: (Chris)
> - Use local_clock() to measure time taken to read lower dword of
>   register and return it to user.
>
> v11: (Jani)
> - IS_GEN deprecated. Use GRAPHICS_VER instead.
>
> v12: (Jason)
> - Split cpu timestamp array into timestamp and delta for cleaner API
>
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>

Thanks for the update:

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>

> ---
> drivers/gpu/drm/i915/i915_query.c | 148 ++++++++++++++++++++++++++++++
> include/uapi/drm/i915_drm.h | 52 +++++++++++
> 2 files changed, 200 insertions(+)
>
> [snip — the full diff appears at the end of this page]
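For context on the v4 note above, this is the perf_event_open() behaviour that the query's clockid field mirrors: perf lets the caller choose which CPU clock its sample timestamps use. A minimal, illustrative sketch (not part of the patch; the dummy event type is just a placeholder):

```c
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

/*
 * Open a dummy perf event whose sample timestamps use
 * CLOCK_MONOTONIC_RAW. The query's clockid field follows the same
 * convention as attr.clockid here.
 */
static int open_dummy_event_monotonic_raw(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_DUMMY;
	attr.use_clockid = 1;			/* honour attr.clockid */
	attr.clockid = CLOCK_MONOTONIC_RAW;	/* clock for sample timestamps */

	/* pid = 0: this process, cpu = -1: any CPU */
	return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
}
```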
On Wed, Apr 28, 2021 at 7:34 PM Umesh Nerlige Ramappa
<umesh.nerlige.ramappa@intel.com> wrote:
>
> [snip]
>
> +/**
> + * struct drm_i915_query_cs_cycles
> + *
> + * The query returns the command streamer cycles and the frequency that can be
> + * used to calculate the command streamer timestamp. In addition the query
> + * returns a set of cpu timestamps that indicate when the command streamer cycle
> + * count was captured.
> + */
> +struct drm_i915_query_cs_cycles {
> +       /** Engine for which command streamer cycles is queried. */
> +       struct i915_engine_class_instance engine;

I've checked with HW engineers and they're claiming that all CS
timestamp registers should report the same time modulo minor drift.
You're CC'd on the internal e-mail. If this is really the case, then I
don't think we want to put an engine in this query.

--Jason

> [snip]
On Thu, Apr 29, 2021 at 02:07:58PM -0500, Jason Ekstrand wrote:
> On Wed, Apr 28, 2021 at 7:34 PM Umesh Nerlige Ramappa
> <umesh.nerlige.ramappa@intel.com> wrote:
>>
>> [snip]
>>
>> +struct drm_i915_query_cs_cycles {
>> +       /** Engine for which command streamer cycles is queried. */
>> +       struct i915_engine_class_instance engine;
>
> I've checked with HW engineers and they're claiming that all CS
> timestamp registers should report the same time modulo minor drift.
> You're CC'd on the internal e-mail. If this is really the case, then I
> don't think we want to put an engine in this query.
>
> --Jason

Looks like the engine can be dropped since all timestamps are in sync.
I just have one more question here. The timestamp itself is 36 bits.
Should the uapi also report the timestamp width to the user, or should
I just return the lower 32 bits of the timestamp?

Thanks,
Umesh
On Fri, 30 Apr 2021 15:26:09 -0700, Umesh Nerlige Ramappa wrote:
>
> Looks like the engine can be dropped since all timestamps are in sync.
> I just have one more question here. The timestamp itself is 36 bits.
> Should the uapi also report the timestamp width to the user, or should
> I just return the lower 32 bits of the timestamp?

How would exposing only the lower 32 bits of the timestamp work?

The way to avoid exposing the width would be to expose the timestamp as
a regular 64 bit value. In the kernel engine state, keep a variable for
the counter and keep accumulating it (on each query) to a full 64 bits
in spite of the 36 bit HW counter overflow.

So not exposing the width (i.e. exposing a 64 bit timestamp) is a
cleaner interface, but also more work in the kernel.
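A sketch of the accumulation scheme suggested above (names are illustrative, not from the patch): software-extend the 36-bit HW counter to 64 bits. This only stays correct if the counter is sampled at least once per wraparound period, so that at most one overflow occurs between successive samples.

```c
#include <linux/bits.h>
#include <linux/types.h>

struct cs_ts_state {
	u64 last_hw_ts;		/* previous 36-bit HW sample */
	u64 accumulated;	/* software-extended 64-bit value */
};

static u64 cs_ts_accumulate(struct cs_ts_state *state, u64 hw_ts)
{
	/* Modular subtraction absorbs a single 36-bit wraparound. */
	u64 delta = (hw_ts - state->last_hw_ts) & GENMASK_ULL(35, 0);

	state->last_hw_ts = hw_ts;
	state->accumulated += delta;
	return state->accumulated;
}
```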
On Fri, 30 Apr 2021 16:00:46 -0700, Dixit, Ashutosh wrote:
> On Fri, 30 Apr 2021 15:26:09 -0700, Umesh Nerlige Ramappa wrote:
>>
>> Looks like the engine can be dropped since all timestamps are in
>> sync. I just have one more question here. The timestamp itself is 36
>> bits. Should the uapi also report the timestamp width to the user, or
>> should I just return the lower 32 bits of the timestamp?
>
> How would exposing only the lower 32 bits of the timestamp work?

It would work, I guess, but it would overflow every few seconds. So if
the counters are sampled at a low frequency (once every few seconds) it
would yield misleading timestamps.

> The way to avoid exposing the width would be to expose the timestamp
> as a regular 64 bit value. In the kernel engine state, keep a variable
> for the counter and keep accumulating it (on each query) to a full 64
> bits in spite of the 36 bit HW counter overflow.
>
> So not exposing the width (i.e. exposing a 64 bit timestamp) is a
> cleaner interface, but also more work in the kernel.
On April 30, 2021 18:00:58 "Dixit, Ashutosh" <ashutosh.dixit@intel.com>
wrote:
> On Fri, 30 Apr 2021 15:26:09 -0700, Umesh Nerlige Ramappa wrote:
>>
>> Looks like the engine can be dropped since all timestamps are in
>> sync. I just have one more question here. The timestamp itself is 36
>> bits. Should the uapi also report the timestamp width to the user, or
>> should I just return the lower 32 bits of the timestamp?

Yeah, I think reporting the timestamp width is a good idea since we're
reporting the period/frequency here.

> How would exposing only the lower 32 bits of the timestamp work?
>
> The way to avoid exposing the width would be to expose the timestamp
> as a regular 64 bit value. In the kernel engine state, keep a variable
> for the counter and keep accumulating it (on each query) to a full 64
> bits in spite of the 36 bit HW counter overflow.

That doesn't actually work, since you can query the 64-bit timestamp
value from the GPU. The way this is handled in Vulkan is that the
number of timestamp bits is reported to the application as a queue
property.

--Jason
On Fri, Apr 30, 2021 at 07:35:41PM -0500, Jason Ekstrand wrote:
> On April 30, 2021 18:00:58 "Dixit, Ashutosh" <ashutosh.dixit@intel.com>
> wrote:
>> On Fri, 30 Apr 2021 15:26:09 -0700, Umesh Nerlige Ramappa wrote:
>>
>> Looks like the engine can be dropped since all timestamps are in
>> sync. I just have one more question here. The timestamp itself is 36
>> bits. Should the uapi also report the timestamp width to the user, or
>> should I just return the lower 32 bits of the timestamp?
>
> Yeah, I think reporting the timestamp width is a good idea since we're
> reporting the period/frequency here.

Actually, I forgot that we are handling the overflow before returning
the cs_cycles to the user, and overflow handling was the only reason I
thought the user should know the width. Would you still recommend
returning the width in the uapi?

Thanks,
Umesh

>> How would exposing only the lower 32 bits of the timestamp work?
>>
>> The way to avoid exposing the width would be to expose the timestamp
>> as a regular 64 bit value. In the kernel engine state, keep a
>> variable for the counter and keep accumulating it (on each query) to
>> a full 64 bits in spite of the 36 bit HW counter overflow.
>
> That doesn't actually work, since you can query the 64-bit timestamp
> value from the GPU. The way this is handled in Vulkan is that the
> number of timestamp bits is reported to the application as a queue
> property.
>
> --Jason
On Fri, 30 Apr 2021 19:19:59 -0700, Umesh Nerlige Ramappa wrote:
>
> On Fri, Apr 30, 2021 at 07:35:41PM -0500, Jason Ekstrand wrote:
>> Yeah, I think reporting the timestamp width is a good idea since
>> we're reporting the period/frequency here.
>
> Actually, I forgot that we are handling the overflow before returning
> the cs_cycles to the user, and overflow handling was the only reason I
> thought the user should know the width. Would you still recommend
> returning the width in the uapi?

The width is needed for userspace to figure out if an overflow has
occurred between two successive query calls. I don't think I see this
happening in the code.
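A hypothetical userspace helper illustrating the point above (not from the patch): with the counter width known, two successive query samples can be differenced modulo 2^width, so a single wraparound between samples still yields the correct elapsed tick count.

```c
#include <stdint.h>

/* Elapsed GPU ticks between two samples of a `width`-bit counter,
 * assuming at most one wraparound in between (36 bits here). */
static uint64_t cs_ticks_elapsed(uint64_t t0, uint64_t t1, unsigned int width)
{
	uint64_t mask = (width < 64) ? ((uint64_t)1 << width) - 1 : UINT64_MAX;

	return (t1 - t0) & mask;
}
```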
On April 30, 2021 23:01:44 "Dixit, Ashutosh" <ashutosh.dixit@intel.com>
wrote:
> On Fri, 30 Apr 2021 19:19:59 -0700, Umesh Nerlige Ramappa wrote:
>>
>> Actually, I forgot that we are handling the overflow before returning
>> the cs_cycles to the user, and overflow handling was the only reason
>> I thought the user should know the width. Would you still recommend
>> returning the width in the uapi?
>
> The width is needed for userspace to figure out if an overflow has
> occurred between two successive query calls. I don't think I see this
> happening in the code.

Right... We (UMDs) currently just hard-code it to 36 bits because
that's what we've had on all platforms since close enough to forever.
We bake in the frequency based on PCI ID. Returning the number of bits,
like I said, goes nicely with the frequency. It's not necessary,
assuming sufficiently smart userspace (neither is frequency), but it
seems to go with it. I guess I don't care much either way.

Coming back to the multi-tile issue we discussed internally, I think
that is something we should care about. Since this works by reading the
timestamp register on an engine, I think leaving the engine specifier
in there is fine. Userspace should know that there's actually only one
clock and just query one of them (probably RCS). For crazy multi-device
cases, we'll either query per logical device (read: tile) or we'll have
to make them look like a single device and sync the timestamps somehow
in the UMD by carrying around an offset factor.

As is, this patch is

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>

I still need to review the ANV patch before we can land this though.

--Jason
On Sat, May 01, 2021 at 10:27:03AM -0500, Jason Ekstrand wrote:
> [snip]
>
> As is, this patch is
>
> Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>

Thanks, I will add the width here and post the final version.

Regards,
Umesh

> I still need to review the ANV patch before we can land this though.
>
> --Jason
```diff
diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
index fed337ad7b68..357c44e8177c 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@
 
 #include <linux/nospec.h>
 
+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
 #include "i915_drv.h"
 #include "i915_perf.h"
 #include "i915_query.h"
@@ -90,6 +92,151 @@ static int query_topology_info(struct drm_i915_private *dev_priv,
 	return total_length;
 }
 
+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+	/*
+	 * Use logic same as the perf subsystem to allow user to select the
+	 * reference clock id to be used for timestamps.
+	 */
+	switch (clk_id) {
+	case CLOCK_MONOTONIC:
+		return &ktime_get_ns;
+	case CLOCK_MONOTONIC_RAW:
+		return &ktime_get_raw_ns;
+	case CLOCK_REALTIME:
+		return &ktime_get_real_ns;
+	case CLOCK_BOOTTIME:
+		return &ktime_get_boottime_ns;
+	case CLOCK_TAI:
+		return &ktime_get_clocktai_ns;
+	default:
+		return NULL;
+	}
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+		  i915_reg_t lower_reg,
+		  i915_reg_t upper_reg,
+		  u64 *cs_ts,
+		  u64 *cpu_ts,
+		  u64 *cpu_delta,
+		  __ktime_func_t cpu_clock)
+{
+	u32 upper, lower, old_upper, loop = 0;
+
+	upper = intel_uncore_read_fw(uncore, upper_reg);
+	do {
+		*cpu_delta = local_clock();
+		*cpu_ts = cpu_clock();
+		lower = intel_uncore_read_fw(uncore, lower_reg);
+		*cpu_delta = local_clock() - *cpu_delta;
+		old_upper = upper;
+		upper = intel_uncore_read_fw(uncore, upper_reg);
+	} while (upper != old_upper && loop++ < 2);
+
+	*cs_ts = (u64)upper << 32 | lower;
+
+	return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+		  u64 *cs_ts, u64 *cpu_ts, u64 *cpu_delta,
+		  __ktime_func_t cpu_clock)
+{
+	struct intel_uncore *uncore = engine->uncore;
+	enum forcewake_domains fw_domains;
+	u32 base = engine->mmio_base;
+	intel_wakeref_t wakeref;
+	int ret;
+
+	fw_domains = intel_uncore_forcewake_for_reg(uncore,
+						    RING_TIMESTAMP(base),
+						    FW_REG_READ);
+
+	with_intel_runtime_pm(uncore->rpm, wakeref) {
+		spin_lock_irq(&uncore->lock);
+		intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+		ret = __read_timestamps(uncore,
+					RING_TIMESTAMP(base),
+					RING_TIMESTAMP_UDW(base),
+					cs_ts,
+					cpu_ts,
+					cpu_delta,
+					cpu_clock);
+
+		intel_uncore_forcewake_put__locked(uncore, fw_domains);
+		spin_unlock_irq(&uncore->lock);
+	}
+
+	return ret;
+}
+
+static int
+query_cs_cycles(struct drm_i915_private *i915,
+		struct drm_i915_query_item *query_item)
+{
+	struct drm_i915_query_cs_cycles __user *query_ptr;
+	struct drm_i915_query_cs_cycles query;
+	struct intel_engine_cs *engine;
+	__ktime_func_t cpu_clock;
+	int ret;
+
+	if (GRAPHICS_VER(i915) < 6)
+		return -ENODEV;
+
+	query_ptr = u64_to_user_ptr(query_item->data_ptr);
+	ret = copy_query_item(&query, sizeof(query), sizeof(query), query_item);
+	if (ret != 0)
+		return ret;
+
+	if (query.flags)
+		return -EINVAL;
+
+	if (query.rsvd)
+		return -EINVAL;
+
+	cpu_clock = __clock_id_to_func(query.clockid);
+	if (!cpu_clock)
+		return -EINVAL;
+
+	engine = intel_engine_lookup_user(i915,
+					  query.engine.engine_class,
+					  query.engine.engine_instance);
+	if (!engine)
+		return -EINVAL;
+
+	if (GRAPHICS_VER(i915) == 6 &&
+	    query.engine.engine_class != I915_ENGINE_CLASS_RENDER)
+		return -ENODEV;
+
+	query.cs_frequency = engine->gt->clock_frequency;
+	ret = __query_cs_cycles(engine,
+				&query.cs_cycles,
+				&query.cpu_timestamp,
+				&query.cpu_delta,
+				cpu_clock);
+	if (ret)
+		return ret;
+
+	if (put_user(query.cs_frequency, &query_ptr->cs_frequency))
+		return -EFAULT;
+
+	if (put_user(query.cpu_timestamp, &query_ptr->cpu_timestamp))
+		return -EFAULT;
+
+	if (put_user(query.cpu_delta, &query_ptr->cpu_delta))
+		return -EFAULT;
+
+	if (put_user(query.cs_cycles, &query_ptr->cs_cycles))
+		return -EFAULT;
+
+	return sizeof(query);
+}
+
 static int
 query_engine_info(struct drm_i915_private *i915,
 		  struct drm_i915_query_item *query_item)
@@ -424,6 +571,7 @@ static int (* const i915_query_funcs[])(struct drm_i915_private *dev_priv,
 	query_topology_info,
 	query_engine_info,
 	query_perf_config,
+	query_cs_cycles,
 };
 
 int i915_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 6a34243a7646..0b4c27092d41 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -2230,6 +2230,10 @@ struct drm_i915_query_item {
 #define DRM_I915_QUERY_TOPOLOGY_INFO	1
 #define DRM_I915_QUERY_ENGINE_INFO	2
 #define DRM_I915_QUERY_PERF_CONFIG	3
+	/**
+	 * Query Command Streamer timestamp register.
+	 */
+#define DRM_I915_QUERY_CS_CYCLES	4
 /* Must be kept compact -- no holes and well documented */
 
 /**
@@ -2397,6 +2401,54 @@ struct drm_i915_engine_info {
 	__u64 rsvd1[4];
 };
 
+/**
+ * struct drm_i915_query_cs_cycles
+ *
+ * The query returns the command streamer cycles and the frequency that can be
+ * used to calculate the command streamer timestamp. In addition the query
+ * returns a set of cpu timestamps that indicate when the command streamer cycle
+ * count was captured.
+ */
+struct drm_i915_query_cs_cycles {
+	/** Engine for which command streamer cycles is queried. */
+	struct i915_engine_class_instance engine;
+
+	/** Must be zero. */
+	__u32 flags;
+
+	/**
+	 * Command streamer cycles as read from the command streamer
+	 * register at 0x358 offset.
+	 */
+	__u64 cs_cycles;
+
+	/** Frequency of the cs cycles in Hz. */
+	__u64 cs_frequency;
+
+	/**
+	 * CPU timestamp in ns. The timestamp is captured before reading the
+	 * cs_cycles register using the reference clockid set by the user.
+	 */
+	__u64 cpu_timestamp;
+
+	/**
+	 * Time delta in ns captured around reading the lower dword of the
+	 * cs_cycles register.
+	 */
+	__u64 cpu_delta;
+
+	/**
+	 * Reference clock id for CPU timestamp. For definition, see
+	 * clock_gettime(2) and perf_event_open(2). Supported clock ids are
+	 * CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, CLOCK_BOOTTIME,
+	 * CLOCK_TAI.
+	 */
+	__s32 clockid;
+
+	/** Must be zero. */
+	__u32 rsvd;
+};
+
 /**
  * struct drm_i915_query_engine_info
  *
```
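For reference, a hypothetical userspace sketch of how this uAPI revision would be used (not part of the series; assumes an open i915 DRM fd and headers containing the definitions above): sample the RCS timestamp together with CLOCK_MONOTONIC and derive an offset that maps GPU ticks onto the chosen CPU clock.

```c
#include <stdint.h>
#include <sys/ioctl.h>
#include <time.h>
#include <drm/i915_drm.h>

/* Issue DRM_I915_QUERY_CS_CYCLES on the render engine and compute a
 * GPU-to-CPU clock offset. Error handling trimmed for brevity. */
static int sample_cs_cycles(int fd, uint64_t *gpu_to_cpu_offset_ns)
{
	struct drm_i915_query_cs_cycles cycles = {
		.engine = {
			.engine_class = I915_ENGINE_CLASS_RENDER,
			.engine_instance = 0,
		},
		.clockid = CLOCK_MONOTONIC,	/* clock for cpu_timestamp */
	};
	struct drm_i915_query_item item = {
		.query_id = DRM_I915_QUERY_CS_CYCLES,
		.length = sizeof(cycles),
		.data_ptr = (uintptr_t)&cycles,
	};
	struct drm_i915_query query = {
		.num_items = 1,
		.items_ptr = (uintptr_t)&item,
	};

	if (ioctl(fd, DRM_IOCTL_I915_QUERY, &query) || item.length < 0)
		return -1;

	/*
	 * Split the ticks-to-ns conversion so that ticks * 1e9 cannot
	 * overflow 64 bits for large tick counts.
	 */
	uint64_t secs = cycles.cs_cycles / cycles.cs_frequency;
	uint64_t rem = cycles.cs_cycles % cycles.cs_frequency;
	uint64_t gpu_ns = secs * 1000000000ull +
			  rem * 1000000000ull / cycles.cs_frequency;

	*gpu_to_cpu_offset_ns = cycles.cpu_timestamp - gpu_ns;
	return 0;
}
```

The cpu_delta field returned by the query bounds how far apart the two clocks were sampled, so a tool can discard samples where the register read took unusually long.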