diff mbox series

[v3,5/6] drm/xe/hwmon: Expose hwmon energy attribute

Message ID 20230802135241.458855-6-badal.nilawar@intel.com (mailing list archive)
State Changes Requested
Headers show
Series Add HWMON support for DGFX | expand

Commit Message

Nilawar, Badal Aug. 2, 2023, 1:52 p.m. UTC
Expose hwmon energy attribute to show device level and gt
level energy usage

v2:
  - %s/hwm_/hwmon_/
  - %s/tile_/gt_
  - Convert enums to upper case
  - Print error info for hwmon_gt devices

Signed-off-by: Badal Nilawar <badal.nilawar@intel.com>
---
 .../ABI/testing/sysfs-driver-intel-xe-hwmon   |  12 +
 drivers/gpu/drm/xe/regs/xe_gt_regs.h          |   2 +
 drivers/gpu/drm/xe/regs/xe_mchbar_regs.h      |   3 +
 drivers/gpu/drm/xe/xe_hwmon.c                 | 216 +++++++++++++++++-
 4 files changed, 229 insertions(+), 4 deletions(-)

Comments

Guenter Roeck Aug. 2, 2023, 2:14 p.m. UTC | #1
On 8/2/23 06:52, Badal Nilawar wrote:
> Expose hwmon energy attribute to show device level and gt
> level energy usage
> 
> v2:
>    - %s/hwm_/hwmon_/
>    - %s/tile_/gt_
>    - Convert enums to upper case
>    - Print error info for hwmon_gt devices
> 
> Signed-off-by: Badal Nilawar <badal.nilawar@intel.com>
> ---
>   .../ABI/testing/sysfs-driver-intel-xe-hwmon   |  12 +
>   drivers/gpu/drm/xe/regs/xe_gt_regs.h          |   2 +
>   drivers/gpu/drm/xe/regs/xe_mchbar_regs.h      |   3 +
>   drivers/gpu/drm/xe/xe_hwmon.c                 | 216 +++++++++++++++++-
>   4 files changed, 229 insertions(+), 4 deletions(-)
> 
> diff --git a/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon b/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
> index 167bd9480602..4b2d6e1d0c7f 100644
> --- a/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
> +++ b/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
> @@ -52,3 +52,15 @@ Description:	RO. Current Voltage in millivolt.
>   
>   		Only supported for particular Intel xe graphics platforms.
>   
> +What:		/sys/devices/.../hwmon/hwmon<i>/energy1_input
> +Date:		August 2023
> +KernelVersion:	6.4
> +Contact:	intel-xe@lists.freedesktop.org
> +Description:	RO. Energy input of device or gt in microjoules.
> +
> +		For xe device level hwmon devices (name "xe") this
> +		reflects energy input for the entire device. For gt level
> +		hwmon devices (name "xe_gtN") this reflects energy input
> +		for the gt.
> +
> +		Only supported for particular Intel xe graphics platforms.
> diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> index cc452ec999fc..8819b934a592 100644
> --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> @@ -400,8 +400,10 @@
>   #define XEHPC_BCS5_BCS6_INTR_MASK		XE_REG(0x190118)
>   #define XEHPC_BCS7_BCS8_INTR_MASK		XE_REG(0x19011c)
>   
> +#define PVC_GT0_PACKAGE_ENERGY_STATUS		XE_REG(0x281004)
>   #define PVC_GT0_PACKAGE_RAPL_LIMIT		XE_REG(0x281008)
>   #define PVC_GT0_PACKAGE_POWER_SKU_UNIT		XE_REG(0x281068)
> +#define PVC_GT0_PLATFORM_ENERGY_STATUS		XE_REG(0x28106c)
>   #define PVC_GT0_PACKAGE_POWER_SKU		XE_REG(0x281080)
>   
>   #endif
> diff --git a/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h b/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h
> index cb2d49b5c8a9..473a44bd7c56 100644
> --- a/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h
> @@ -25,6 +25,9 @@
>   
>   #define PCU_CR_PACKAGE_POWER_SKU_UNIT		XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x5938)
>   #define   PKG_PWR_UNIT				REG_GENMASK(3, 0)
> +#define   PKG_ENERGY_UNIT			REG_GENMASK(12, 8)
> +
> +#define PCU_CR_PACKAGE_ENERGY_STATUS		XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x593c)
>   
>   #define PCU_CR_PACKAGE_RAPL_LIMIT		XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x59a0)
>   #define   PKG_PWR_LIM_1				REG_GENMASK(14, 0)
> diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c
> index 3e69cd79c1e2..a337edcebae5 100644
> --- a/drivers/gpu/drm/xe/xe_hwmon.c
> +++ b/drivers/gpu/drm/xe/xe_hwmon.c
> @@ -22,6 +22,8 @@ enum hwmon_reg_name {
>   	REG_PKG_POWER_SKU,
>   	REG_PKG_POWER_SKU_UNIT,
>   	REG_GT_PERF_STATUS,
> +	REG_ENERGY_STATUS_ALL,
> +	REG_ENERGY_STATUS_GT,
>   };
>   
>   enum hwmon_reg_operation {
> @@ -30,31 +32,50 @@ enum hwmon_reg_operation {
>   	REG_RMW,
>   };
>   
> +enum xe_hwmon_device_type {
> +	HWMON_GT,
> +	HWMON_DEVICE,
> +};
> +
>   /*
>    * SF_* - scale factors for particular quantities according to hwmon spec.
>    * - power  - microwatts
>    * - curr   - milliamperes
>    * - voltage  - millivolts
> + * - energy - microjoules
>    */
>   #define SF_POWER	1000000
>   #define SF_CURR		1000
>   #define SF_VOLTAGE	1000
> +#define SF_ENERGY	1000000
> +
> +struct hwmon_energy_info {
> +	u32 reg_val_prev;
> +	long accum_energy;		/* Accumulated energy for energy1_input */
> +};
>   
>   struct xe_hwmon_data {
>   	struct device *hwmon_dev;
>   	struct xe_gt *gt;
>   	char name[12];
> +	struct hwmon_energy_info ei;	/*  Energy info for energy1_input */
> +	enum xe_hwmon_device_type type;
>   };
>   
>   struct xe_hwmon {
>   	struct xe_hwmon_data ddat;
> +	struct xe_hwmon_data ddat_gt[XE_MAX_TILES_PER_DEVICE];
>   	struct mutex hwmon_lock; /* rmw operations*/
>   	bool reset_in_progress;
>   	wait_queue_head_t waitq;
>   	int scl_shift_power;
> +	int scl_shift_energy;
>   };
>   
> -#define ddat_to_xe_hwmon(ddat)	({ container_of(ddat, struct xe_hwmon, ddat); })
> +#define ddat_to_xe_hwmon(ddat)	\
> +	({ ddat->type == HWMON_GT ?	\
> +		container_of(ddat, struct xe_hwmon, ddat_gt[ddat->gt->info.id]) :	\
> +		container_of(ddat, struct xe_hwmon, ddat); })
>   
>   static u32 hwmon_get_reg(struct xe_hwmon_data *ddat, enum hwmon_reg_name reg_name)
>   {
> @@ -84,6 +105,16 @@ static u32 hwmon_get_reg(struct xe_hwmon_data *ddat, enum hwmon_reg_name reg_nam
>   		if (xe->info.platform == XE_DG2)
>   			reg = GT_PERF_STATUS;
>   		break;
> +	case REG_ENERGY_STATUS_ALL:
> +		if (xe->info.platform == XE_DG2)
> +			reg = PCU_CR_PACKAGE_ENERGY_STATUS;
> +		else if (xe->info.platform == XE_PVC)
> +			reg = PVC_GT0_PLATFORM_ENERGY_STATUS;
> +		break;
> +	case REG_ENERGY_STATUS_GT:
> +		if (xe->info.platform == XE_PVC)
> +			reg = PVC_GT0_PACKAGE_ENERGY_STATUS;
> +		break;
>   	default:
>   		XE_MISSING_CASE(reg_name);
>   		break;
> @@ -228,10 +259,69 @@ static int hwmon_power_rated_max_read(struct xe_hwmon_data *ddat, long *value)
>   	return 0;
>   }
>   
> +/*
> + * hwmon_energy_get - Obtain energy value
> + *
> + * The underlying energy hardware register is 32-bits and is subject to
> + * overflow. How long before overflow? For example, with an example
> + * scaling bit shift of 14 bits (see register *PACKAGE_POWER_SKU_UNIT) and
> + * a power draw of 1000 watts, the 32-bit counter will overflow in
> + * approximately 4.36 minutes.
> + *
> + * Examples:
> + *    1 watt:  (2^32 >> 14) /    1 W / (60 * 60 * 24) secs/day -> 3 days
> + * 1000 watts: (2^32 >> 14) / 1000 W / 60             secs/min -> 4.36 minutes
> + *
> + * The function significantly increases overflow duration (from 4.36
> + * minutes) by accumulating the energy register into a 'long' as allowed by
> + * the hwmon API. Using x86_64 128 bit arithmetic (see mul_u64_u32_shr()),
> + * a 'long' of 63 bits, SF_ENERGY of 1e6 (~20 bits) and
> + * hwmon->scl_shift_energy of 14 bits we have 57 (63 - 20 + 14) bits before
> + * energy1_input overflows. This at 1000 W is an overflow duration of 278 years.
> + */
> +static void
> +hwmon_energy_get(struct xe_hwmon_data *ddat, long *energy)
> +{
> +	struct xe_hwmon *hwmon = ddat_to_xe_hwmon(ddat);
> +	struct hwmon_energy_info *ei = &ddat->ei;
> +	u32 reg_val;
> +
> +	xe_device_mem_access_get(gt_to_xe(ddat->gt));
> +
> +	mutex_lock(&hwmon->hwmon_lock);
> +
> +	if (ddat->type == HWMON_GT)
> +		process_hwmon_reg(ddat, REG_ENERGY_STATUS_GT, REG_READ,
> +				  &reg_val, 0, 0);
> +	else
> +		process_hwmon_reg(ddat, REG_ENERGY_STATUS_ALL, REG_READ,
> +				  &reg_val, 0, 0);
> +
> +	if (reg_val >= ei->reg_val_prev)
> +		ei->accum_energy += reg_val - ei->reg_val_prev;
> +	else
> +		ei->accum_energy += UINT_MAX - ei->reg_val_prev + reg_val;
> +
> +	ei->reg_val_prev = reg_val;
> +
> +	*energy = mul_u64_u32_shr(ei->accum_energy, SF_ENERGY,
> +				  hwmon->scl_shift_energy);
> +
> +	mutex_unlock(&hwmon->hwmon_lock);
> +
> +	xe_device_mem_access_put(gt_to_xe(ddat->gt));
> +}
> +
>   static const struct hwmon_channel_info *hwmon_info[] = {
>   	HWMON_CHANNEL_INFO(power, HWMON_P_MAX | HWMON_P_RATED_MAX | HWMON_P_CRIT),
>   	HWMON_CHANNEL_INFO(curr, HWMON_C_CRIT),
>   	HWMON_CHANNEL_INFO(in, HWMON_I_INPUT),
> +	HWMON_CHANNEL_INFO(energy, HWMON_E_INPUT),
> +	NULL
> +};
> +
> +static const struct hwmon_channel_info *hwmon_gt_info[] = {
> +	HWMON_CHANNEL_INFO(energy, HWMON_E_INPUT),
>   	NULL
>   };
>   
> @@ -449,6 +539,32 @@ hwmon_in_read(struct xe_hwmon_data *ddat, u32 attr, long *val)
>   	return ret;
>   }
>   
> +static umode_t
> +hwmon_energy_is_visible(struct xe_hwmon_data *ddat, u32 attr)
> +{
> +	switch (attr) {
> +	case hwmon_energy_input:
> +		if (ddat->type == HWMON_GT)
> +			return hwmon_get_reg(ddat, REG_ENERGY_STATUS_GT) ? 0444 : 0;
> +		else
> +			return hwmon_get_reg(ddat, REG_ENERGY_STATUS_ALL) ? 0444 : 0;
> +	default:
> +		return 0;
> +	}
> +}
> +
> +static int
> +hwmon_energy_read(struct xe_hwmon_data *ddat, u32 attr, long *val)
> +{
> +	switch (attr) {
> +	case hwmon_energy_input:
> +		hwmon_energy_get(ddat, val);
> +		return 0;
> +	default:
> +		return -EOPNOTSUPP;
> +	}
> +}
> +
>   static umode_t
>   hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
>   		 u32 attr, int channel)
> @@ -468,6 +584,9 @@ hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
>   	case hwmon_in:
>   		ret = hwmon_in_is_visible(ddat, attr);
>   		break;
> +	case hwmon_energy:
> +		ret = hwmon_energy_is_visible(ddat, attr);
> +		break;
>   	default:
>   		ret = 0;
>   		break;
> @@ -497,6 +616,9 @@ hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32 attr,
>   	case hwmon_in:
>   		ret = hwmon_in_read(ddat, attr, val);
>   		break;
> +	case hwmon_energy:
> +		ret = hwmon_energy_read(ddat, attr, val);
> +		break;
>   	default:
>   		ret = -EOPNOTSUPP;
>   		break;
> @@ -544,12 +666,53 @@ static const struct hwmon_chip_info hwmon_chip_info = {
>   	.info = hwmon_info,
>   };
>   
> +static umode_t
> +hwmon_gt_is_visible(const void *drvdata, enum hwmon_sensor_types type,
> +		    u32 attr, int channel)
> +{
> +	struct xe_hwmon_data *ddat = (struct xe_hwmon_data *)drvdata;
> +
> +	switch (type) {
> +	case hwmon_energy:
> +		return hwmon_energy_is_visible(ddat, attr);
> +	default:
> +		return 0;
> +	}
> +}
> +
> +static int
> +hwmon_gt_read(struct device *dev, enum hwmon_sensor_types type, u32 attr,
> +	      int channel, long *val)
> +{
> +	struct xe_hwmon_data *ddat = dev_get_drvdata(dev);
> +
> +	switch (type) {
> +	case hwmon_energy:
> +		return hwmon_energy_read(ddat, attr, val);
> +	default:
> +		return -EOPNOTSUPP;
> +	}
> +}
> +
> +static const struct hwmon_ops hwmon_gt_ops = {
> +	.is_visible = hwmon_gt_is_visible,
> +	.read = hwmon_gt_read,
> +};
> +
> +static const struct hwmon_chip_info hwmon_gt_chip_info = {
> +	.ops = &hwmon_gt_ops,
> +	.info = hwmon_gt_info,
> +};
> +
>   static void
>   hwmon_get_preregistration_info(struct xe_device *xe)
>   {
>   	struct xe_hwmon *hwmon = xe->hwmon;
>   	struct xe_hwmon_data *ddat = &hwmon->ddat;
> +	struct xe_gt *gt;
> +	long energy;
>   	u32 val_sku_unit = 0;
> +	u8 id;
>   	int ret;
>   
>   	ret = process_hwmon_reg(ddat, REG_PKG_POWER_SKU_UNIT, REG_READ, &val_sku_unit, 0, 0);
> @@ -557,8 +720,22 @@ hwmon_get_preregistration_info(struct xe_device *xe)
>   	 * The contents of register PKG_POWER_SKU_UNIT do not change,
>   	 * so read it once and store the shift values.
>   	 */
> -	if (!ret)
> +	if (!ret) {
>   		hwmon->scl_shift_power = REG_FIELD_GET(PKG_PWR_UNIT, val_sku_unit);
> +		hwmon->scl_shift_energy = REG_FIELD_GET(PKG_ENERGY_UNIT, val_sku_unit);
> +	}
> +
> +	/*
> +	 * Initialize 'struct hwmon_energy_info', i.e. set fields to the
> +	 * first value of the energy register read
> +	 */
> +	if (hwmon_is_visible(ddat, hwmon_energy, hwmon_energy_input, 0))
> +		hwmon_energy_get(ddat, &energy);
> +
> +	for_each_gt(gt, xe, id)
> +		if (hwmon_gt_is_visible(&hwmon->ddat_gt[id], hwmon_energy,
> +					hwmon_energy_input, 0))
> +			hwmon_energy_get(&hwmon->ddat_gt[id], &energy);
>   }
>   
>   void xe_hwmon_register(struct xe_device *xe)
> @@ -567,6 +744,9 @@ void xe_hwmon_register(struct xe_device *xe)
>   	struct xe_hwmon *hwmon;
>   	struct device *hwmon_dev;
>   	struct xe_hwmon_data *ddat;
> +	struct xe_hwmon_data *ddat_gt;
> +	struct xe_gt *gt;
> +	u8 id;
>   
>   	/* hwmon is available only for dGfx */
>   	if (!IS_DGFX(xe))
> @@ -583,13 +763,21 @@ void xe_hwmon_register(struct xe_device *xe)
>   
>   	/* primary GT to access device level properties */
>   	ddat->gt = xe->tiles[0].primary_gt;
> +	ddat->type = HWMON_DEVICE;
>   
>   	snprintf(ddat->name, sizeof(ddat->name), "xe");
>   
> -	hwmon_get_preregistration_info(xe);
> -
>   	init_waitqueue_head(&hwmon->waitq);
>   
> +	for_each_gt(gt, xe, id) {
> +		ddat_gt = hwmon->ddat_gt + id;
> +		ddat_gt->gt = gt;
> +		snprintf(ddat_gt->name, sizeof(ddat_gt->name), "xe_gt%u", id);
> +		ddat_gt->type = HWMON_GT;
> +	}
> +
> +	hwmon_get_preregistration_info(xe);
> +
>   	drm_dbg(&xe->drm, "Register xe hwmon interface\n");
>   
>   	/* hwmon_dev points to device hwmon<i> */
> @@ -605,6 +793,26 @@ void xe_hwmon_register(struct xe_device *xe)
>   	}
>   
>   	ddat->hwmon_dev = hwmon_dev;
> +
> +	for_each_gt(gt, xe, id) {
> +		ddat_gt = hwmon->ddat_gt + id;
> +		/*
> +		 * Create per-gt directories only if a per-gt attribute is
> +		 * visible. Currently this is only energy
> +		 */
> +		if (!hwmon_gt_is_visible(ddat_gt, hwmon_energy, hwmon_energy_input, 0))
> +			continue;
> +
> +		hwmon_dev = devm_hwmon_device_register_with_info(dev, ddat_gt->name,
> +								 ddat_gt,
> +								 &hwmon_gt_chip_info,
> +								 NULL);
> +		if (IS_ERR(hwmon_dev))
> +			drm_warn(&xe->drm, "Fail to register xe_gt %d hwmon, Err:%ld\n",
> +				 id, PTR_ERR(hwmon_dev));
> +		else
> +			ddat_gt->hwmon_dev = hwmon_dev;
> +	}

There should be just one hardware monitoring device. Just use energyN
and reference the input with an appropriate sensor label.

Guenter

>   }
>   
>   void xe_hwmon_unregister(struct xe_device *xe)
Nilawar, Badal Aug. 3, 2023, 6:34 a.m. UTC | #2
Hi Guenter,

On 02-08-2023 19:44, Guenter Roeck wrote:
> On 8/2/23 06:52, Badal Nilawar wrote:
>> Expose hwmon energy attribute to show device level and gt
>> level energy usage
>>
>> v2:
>>    - %s/hwm_/hwmon_/
>>    - %s/tile_/gt_
>>    - Convert enums to upper case
>>    - Print error info for hwmon_gt devices
>>
>> Signed-off-by: Badal Nilawar <badal.nilawar@intel.com>
>> ---
>>   .../ABI/testing/sysfs-driver-intel-xe-hwmon   |  12 +
>>   drivers/gpu/drm/xe/regs/xe_gt_regs.h          |   2 +
>>   drivers/gpu/drm/xe/regs/xe_mchbar_regs.h      |   3 +
>>   drivers/gpu/drm/xe/xe_hwmon.c                 | 216 +++++++++++++++++-
>>   4 files changed, 229 insertions(+), 4 deletions(-)
>>
>> diff --git a/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon 
>> b/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
>> index 167bd9480602..4b2d6e1d0c7f 100644
>> --- a/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
>> +++ b/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
>> @@ -52,3 +52,15 @@ Description:    RO. Current Voltage in millivolt.
>>           Only supported for particular Intel xe graphics platforms.
>> +What:        /sys/devices/.../hwmon/hwmon<i>/energy1_input
>> +Date:        August 2023
>> +KernelVersion:    6.4
>> +Contact:    intel-xe@lists.freedesktop.org
>> +Description:    RO. Energy input of device or gt in microjoules.
>> +
>> +        For xe device level hwmon devices (name "xe") this
>> +        reflects energy input for the entire device. For gt level
>> +        hwmon devices (name "xe_gtN") this reflects energy input
>> +        for the gt.
>> +
>> +        Only supported for particular Intel xe graphics platforms.
>> diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h 
>> b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>> index cc452ec999fc..8819b934a592 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>> @@ -400,8 +400,10 @@
>>   #define XEHPC_BCS5_BCS6_INTR_MASK        XE_REG(0x190118)
>>   #define XEHPC_BCS7_BCS8_INTR_MASK        XE_REG(0x19011c)
>> +#define PVC_GT0_PACKAGE_ENERGY_STATUS        XE_REG(0x281004)
>>   #define PVC_GT0_PACKAGE_RAPL_LIMIT        XE_REG(0x281008)
>>   #define PVC_GT0_PACKAGE_POWER_SKU_UNIT        XE_REG(0x281068)
>> +#define PVC_GT0_PLATFORM_ENERGY_STATUS        XE_REG(0x28106c)
>>   #define PVC_GT0_PACKAGE_POWER_SKU        XE_REG(0x281080)
>>   #endif
>> diff --git a/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h 
>> b/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h
>> index cb2d49b5c8a9..473a44bd7c56 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h
>> @@ -25,6 +25,9 @@
>>   #define PCU_CR_PACKAGE_POWER_SKU_UNIT        
>> XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x5938)
>>   #define   PKG_PWR_UNIT                REG_GENMASK(3, 0)
>> +#define   PKG_ENERGY_UNIT            REG_GENMASK(12, 8)
>> +
>> +#define PCU_CR_PACKAGE_ENERGY_STATUS        
>> XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x593c)
>>   #define PCU_CR_PACKAGE_RAPL_LIMIT        
>> XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x59a0)
>>   #define   PKG_PWR_LIM_1                REG_GENMASK(14, 0)
>> diff --git a/drivers/gpu/drm/xe/xe_hwmon.c 
>> b/drivers/gpu/drm/xe/xe_hwmon.c
>> index 3e69cd79c1e2..a337edcebae5 100644
>> --- a/drivers/gpu/drm/xe/xe_hwmon.c
>> +++ b/drivers/gpu/drm/xe/xe_hwmon.c
>> @@ -22,6 +22,8 @@ enum hwmon_reg_name {
>>       REG_PKG_POWER_SKU,
>>       REG_PKG_POWER_SKU_UNIT,
>>       REG_GT_PERF_STATUS,
>> +    REG_ENERGY_STATUS_ALL,
>> +    REG_ENERGY_STATUS_GT,
>>   };
>>   enum hwmon_reg_operation {
>> @@ -30,31 +32,50 @@ enum hwmon_reg_operation {
>>       REG_RMW,
>>   };
>> +enum xe_hwmon_device_type {
>> +    HWMON_GT,
>> +    HWMON_DEVICE,
>> +};
>> +
>>   /*
>>    * SF_* - scale factors for particular quantities according to hwmon 
>> spec.
>>    * - power  - microwatts
>>    * - curr   - milliamperes
>>    * - voltage  - millivolts
>> + * - energy - microjoules
>>    */
>>   #define SF_POWER    1000000
>>   #define SF_CURR        1000
>>   #define SF_VOLTAGE    1000
>> +#define SF_ENERGY    1000000
>> +
>> +struct hwmon_energy_info {
>> +    u32 reg_val_prev;
>> +    long accum_energy;        /* Accumulated energy for energy1_input */
>> +};
>>   struct xe_hwmon_data {
>>       struct device *hwmon_dev;
>>       struct xe_gt *gt;
>>       char name[12];
>> +    struct hwmon_energy_info ei;    /*  Energy info for energy1_input */
>> +    enum xe_hwmon_device_type type;
>>   };
>>   struct xe_hwmon {
>>       struct xe_hwmon_data ddat;
>> +    struct xe_hwmon_data ddat_gt[XE_MAX_TILES_PER_DEVICE];
>>       struct mutex hwmon_lock; /* rmw operations*/
>>       bool reset_in_progress;
>>       wait_queue_head_t waitq;
>>       int scl_shift_power;
>> +    int scl_shift_energy;
>>   };
>> -#define ddat_to_xe_hwmon(ddat)    ({ container_of(ddat, struct 
>> xe_hwmon, ddat); })
>> +#define ddat_to_xe_hwmon(ddat)    \
>> +    ({ ddat->type == HWMON_GT ?    \
>> +        container_of(ddat, struct xe_hwmon, 
>> ddat_gt[ddat->gt->info.id]) :    \
>> +        container_of(ddat, struct xe_hwmon, ddat); })
>>   static u32 hwmon_get_reg(struct xe_hwmon_data *ddat, enum 
>> hwmon_reg_name reg_name)
>>   {
>> @@ -84,6 +105,16 @@ static u32 hwmon_get_reg(struct xe_hwmon_data 
>> *ddat, enum hwmon_reg_name reg_nam
>>           if (xe->info.platform == XE_DG2)
>>               reg = GT_PERF_STATUS;
>>           break;
>> +    case REG_ENERGY_STATUS_ALL:
>> +        if (xe->info.platform == XE_DG2)
>> +            reg = PCU_CR_PACKAGE_ENERGY_STATUS;
>> +        else if (xe->info.platform == XE_PVC)
>> +            reg = PVC_GT0_PLATFORM_ENERGY_STATUS;
>> +        break;
>> +    case REG_ENERGY_STATUS_GT:
>> +        if (xe->info.platform == XE_PVC)
>> +            reg = PVC_GT0_PACKAGE_ENERGY_STATUS;
>> +        break;
>>       default:
>>           XE_MISSING_CASE(reg_name);
>>           break;
>> @@ -228,10 +259,69 @@ static int hwmon_power_rated_max_read(struct 
>> xe_hwmon_data *ddat, long *value)
>>       return 0;
>>   }
>> +/*
>> + * hwmon_energy_get - Obtain energy value
>> + *
>> + * The underlying energy hardware register is 32-bits and is subject to
>> + * overflow. How long before overflow? For example, with an example
>> + * scaling bit shift of 14 bits (see register 
>> *PACKAGE_POWER_SKU_UNIT) and
>> + * a power draw of 1000 watts, the 32-bit counter will overflow in
>> + * approximately 4.36 minutes.
>> + *
>> + * Examples:
>> + *    1 watt:  (2^32 >> 14) /    1 W / (60 * 60 * 24) secs/day -> 3 days
>> + * 1000 watts: (2^32 >> 14) / 1000 W / 60             secs/min -> 
>> 4.36 minutes
>> + *
>> + * The function significantly increases overflow duration (from 4.36
>> + * minutes) by accumulating the energy register into a 'long' as 
>> allowed by
>> + * the hwmon API. Using x86_64 128 bit arithmetic (see 
>> mul_u64_u32_shr()),
>> + * a 'long' of 63 bits, SF_ENERGY of 1e6 (~20 bits) and
>> + * hwmon->scl_shift_energy of 14 bits we have 57 (63 - 20 + 14) bits 
>> before
>> + * energy1_input overflows. This at 1000 W is an overflow duration of 
>> 278 years.
>> + */
>> +static void
>> +hwmon_energy_get(struct xe_hwmon_data *ddat, long *energy)
>> +{
>> +    struct xe_hwmon *hwmon = ddat_to_xe_hwmon(ddat);
>> +    struct hwmon_energy_info *ei = &ddat->ei;
>> +    u32 reg_val;
>> +
>> +    xe_device_mem_access_get(gt_to_xe(ddat->gt));
>> +
>> +    mutex_lock(&hwmon->hwmon_lock);
>> +
>> +    if (ddat->type == HWMON_GT)
>> +        process_hwmon_reg(ddat, REG_ENERGY_STATUS_GT, REG_READ,
>> +                  &reg_val, 0, 0);
>> +    else
>> +        process_hwmon_reg(ddat, REG_ENERGY_STATUS_ALL, REG_READ,
>> +                  &reg_val, 0, 0);
>> +
>> +    if (reg_val >= ei->reg_val_prev)
>> +        ei->accum_energy += reg_val - ei->reg_val_prev;
>> +    else
>> +        ei->accum_energy += UINT_MAX - ei->reg_val_prev + reg_val;
>> +
>> +    ei->reg_val_prev = reg_val;
>> +
>> +    *energy = mul_u64_u32_shr(ei->accum_energy, SF_ENERGY,
>> +                  hwmon->scl_shift_energy);
>> +
>> +    mutex_unlock(&hwmon->hwmon_lock);
>> +
>> +    xe_device_mem_access_put(gt_to_xe(ddat->gt));
>> +}
>> +
>>   static const struct hwmon_channel_info *hwmon_info[] = {
>>       HWMON_CHANNEL_INFO(power, HWMON_P_MAX | HWMON_P_RATED_MAX | 
>> HWMON_P_CRIT),
>>       HWMON_CHANNEL_INFO(curr, HWMON_C_CRIT),
>>       HWMON_CHANNEL_INFO(in, HWMON_I_INPUT),
>> +    HWMON_CHANNEL_INFO(energy, HWMON_E_INPUT),
>> +    NULL
>> +};
>> +
>> +static const struct hwmon_channel_info *hwmon_gt_info[] = {
>> +    HWMON_CHANNEL_INFO(energy, HWMON_E_INPUT),
>>       NULL
>>   };
>> @@ -449,6 +539,32 @@ hwmon_in_read(struct xe_hwmon_data *ddat, u32 
>> attr, long *val)
>>       return ret;
>>   }
>> +static umode_t
>> +hwmon_energy_is_visible(struct xe_hwmon_data *ddat, u32 attr)
>> +{
>> +    switch (attr) {
>> +    case hwmon_energy_input:
>> +        if (ddat->type == HWMON_GT)
>> +            return hwmon_get_reg(ddat, REG_ENERGY_STATUS_GT) ? 0444 : 0;
>> +        else
>> +            return hwmon_get_reg(ddat, REG_ENERGY_STATUS_ALL) ? 0444 
>> : 0;
>> +    default:
>> +        return 0;
>> +    }
>> +}
>> +
>> +static int
>> +hwmon_energy_read(struct xe_hwmon_data *ddat, u32 attr, long *val)
>> +{
>> +    switch (attr) {
>> +    case hwmon_energy_input:
>> +        hwmon_energy_get(ddat, val);
>> +        return 0;
>> +    default:
>> +        return -EOPNOTSUPP;
>> +    }
>> +}
>> +
>>   static umode_t
>>   hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
>>            u32 attr, int channel)
>> @@ -468,6 +584,9 @@ hwmon_is_visible(const void *drvdata, enum 
>> hwmon_sensor_types type,
>>       case hwmon_in:
>>           ret = hwmon_in_is_visible(ddat, attr);
>>           break;
>> +    case hwmon_energy:
>> +        ret = hwmon_energy_is_visible(ddat, attr);
>> +        break;
>>       default:
>>           ret = 0;
>>           break;
>> @@ -497,6 +616,9 @@ hwmon_read(struct device *dev, enum 
>> hwmon_sensor_types type, u32 attr,
>>       case hwmon_in:
>>           ret = hwmon_in_read(ddat, attr, val);
>>           break;
>> +    case hwmon_energy:
>> +        ret = hwmon_energy_read(ddat, attr, val);
>> +        break;
>>       default:
>>           ret = -EOPNOTSUPP;
>>           break;
>> @@ -544,12 +666,53 @@ static const struct hwmon_chip_info 
>> hwmon_chip_info = {
>>       .info = hwmon_info,
>>   };
>> +static umode_t
>> +hwmon_gt_is_visible(const void *drvdata, enum hwmon_sensor_types type,
>> +            u32 attr, int channel)
>> +{
>> +    struct xe_hwmon_data *ddat = (struct xe_hwmon_data *)drvdata;
>> +
>> +    switch (type) {
>> +    case hwmon_energy:
>> +        return hwmon_energy_is_visible(ddat, attr);
>> +    default:
>> +        return 0;
>> +    }
>> +}
>> +
>> +static int
>> +hwmon_gt_read(struct device *dev, enum hwmon_sensor_types type, u32 
>> attr,
>> +          int channel, long *val)
>> +{
>> +    struct xe_hwmon_data *ddat = dev_get_drvdata(dev);
>> +
>> +    switch (type) {
>> +    case hwmon_energy:
>> +        return hwmon_energy_read(ddat, attr, val);
>> +    default:
>> +        return -EOPNOTSUPP;
>> +    }
>> +}
>> +
>> +static const struct hwmon_ops hwmon_gt_ops = {
>> +    .is_visible = hwmon_gt_is_visible,
>> +    .read = hwmon_gt_read,
>> +};
>> +
>> +static const struct hwmon_chip_info hwmon_gt_chip_info = {
>> +    .ops = &hwmon_gt_ops,
>> +    .info = hwmon_gt_info,
>> +};
>> +
>>   static void
>>   hwmon_get_preregistration_info(struct xe_device *xe)
>>   {
>>       struct xe_hwmon *hwmon = xe->hwmon;
>>       struct xe_hwmon_data *ddat = &hwmon->ddat;
>> +    struct xe_gt *gt;
>> +    long energy;
>>       u32 val_sku_unit = 0;
>> +    u8 id;
>>       int ret;
>>       ret = process_hwmon_reg(ddat, REG_PKG_POWER_SKU_UNIT, REG_READ, 
>> &val_sku_unit, 0, 0);
>> @@ -557,8 +720,22 @@ hwmon_get_preregistration_info(struct xe_device *xe)
>>        * The contents of register PKG_POWER_SKU_UNIT do not change,
>>        * so read it once and store the shift values.
>>        */
>> -    if (!ret)
>> +    if (!ret) {
>>           hwmon->scl_shift_power = REG_FIELD_GET(PKG_PWR_UNIT, 
>> val_sku_unit);
>> +        hwmon->scl_shift_energy = REG_FIELD_GET(PKG_ENERGY_UNIT, 
>> val_sku_unit);
>> +    }
>> +
>> +    /*
>> +     * Initialize 'struct hwmon_energy_info', i.e. set fields to the
>> +     * first value of the energy register read
>> +     */
>> +    if (hwmon_is_visible(ddat, hwmon_energy, hwmon_energy_input, 0))
>> +        hwmon_energy_get(ddat, &energy);
>> +
>> +    for_each_gt(gt, xe, id)
>> +        if (hwmon_gt_is_visible(&hwmon->ddat_gt[id], hwmon_energy,
>> +                    hwmon_energy_input, 0))
>> +            hwmon_energy_get(&hwmon->ddat_gt[id], &energy);
>>   }
>>   void xe_hwmon_register(struct xe_device *xe)
>> @@ -567,6 +744,9 @@ void xe_hwmon_register(struct xe_device *xe)
>>       struct xe_hwmon *hwmon;
>>       struct device *hwmon_dev;
>>       struct xe_hwmon_data *ddat;
>> +    struct xe_hwmon_data *ddat_gt;
>> +    struct xe_gt *gt;
>> +    u8 id;
>>       /* hwmon is available only for dGfx */
>>       if (!IS_DGFX(xe))
>> @@ -583,13 +763,21 @@ void xe_hwmon_register(struct xe_device *xe)
>>       /* primary GT to access device level properties */
>>       ddat->gt = xe->tiles[0].primary_gt;
>> +    ddat->type = HWMON_DEVICE;
>>       snprintf(ddat->name, sizeof(ddat->name), "xe");
>> -    hwmon_get_preregistration_info(xe);
>> -
>>       init_waitqueue_head(&hwmon->waitq);
>> +    for_each_gt(gt, xe, id) {
>> +        ddat_gt = hwmon->ddat_gt + id;
>> +        ddat_gt->gt = gt;
>> +        snprintf(ddat_gt->name, sizeof(ddat_gt->name), "xe_gt%u", id);
>> +        ddat_gt->type = HWMON_GT;
>> +    }
>> +
>> +    hwmon_get_preregistration_info(xe);
>> +
>>       drm_dbg(&xe->drm, "Register xe hwmon interface\n");
>>       /* hwmon_dev points to device hwmon<i> */
>> @@ -605,6 +793,26 @@ void xe_hwmon_register(struct xe_device *xe)
>>       }
>>       ddat->hwmon_dev = hwmon_dev;
>> +
>> +    for_each_gt(gt, xe, id) {
>> +        ddat_gt = hwmon->ddat_gt + id;
>> +        /*
>> +         * Create per-gt directories only if a per-gt attribute is
>> +         * visible. Currently this is only energy
>> +         */
>> +        if (!hwmon_gt_is_visible(ddat_gt, hwmon_energy, 
>> hwmon_energy_input, 0))
>> +            continue;
>> +
>> +        hwmon_dev = devm_hwmon_device_register_with_info(dev, 
>> ddat_gt->name,
>> +                                 ddat_gt,
>> +                                 &hwmon_gt_chip_info,
>> +                                 NULL);
>> +        if (IS_ERR(hwmon_dev))
>> +            drm_warn(&xe->drm, "Fail to register xe_gt %d hwmon, 
>> Err:%ld\n",
>> +                 id, PTR_ERR(hwmon_dev));
>> +        else
>> +            ddat_gt->hwmon_dev = hwmon_dev;
>> +    }
> 
> There should be just one hardware monitoring device. Just use energyN
> and reference the input with an appropriate sensor label.
The idea was to expose the energy attributes under separate hwmon folders
with device names xe_gtN. But with the channel/label approach it will look
like energyN_input (energy1_input - device, energy2_input/energy3_input -
gt0/gt1) with an appropriate energyN_label (energy1_label = "energy device",
energy2_label/energy3_label = "energy gt0/gt1"). With this approach we can
avoid using 2 structures, xe_hwmon and xe_hwmon_data.

Regards,
Badal
> 
> Guenter
> 
>>   }
>>   void xe_hwmon_unregister(struct xe_device *xe)
>
Guenter Roeck Aug. 3, 2023, 2:42 p.m. UTC | #3
On 8/2/23 23:34, Nilawar, Badal wrote:

>>
>> There should be just one hardware monitoring device. Just use energyN
>> and reference the input with an appropriate sensor label.
> The idea was to expose the energy attributes under separate hwmon folders with device names xe_gtN. But with the channel/label approach it will look like energyN_input (energy1_input - device, energy2_input/energy3_input - gt0/gt1) with an appropriate energyN_label (energy1_label = "energy device", energy2_label/energy3_label = "energy gt0/gt1"). With this approach we can avoid using 2 structures, xe_hwmon and xe_hwmon_data.
> 

There is really no such thing as "energy device". I'd suggest finding a better name,
such as "Package energy" or just "Package".

Anyway, your code is flawed: It claims to handle energy overflows, but it doesn't
really do that reliably. The returned numbers are only reliable if read at least
once within an overflow interval. Making it reliable would require introducing
a worker or kernel thread which reads the data from the chip in regular intervals
and accumulates independently of attribute reads.

Guenter
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon b/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
index 167bd9480602..4b2d6e1d0c7f 100644
--- a/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
+++ b/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
@@ -52,3 +52,15 @@  Description:	RO. Current Voltage in millivolt.
 
 		Only supported for particular Intel xe graphics platforms.
 
+What:		/sys/devices/.../hwmon/hwmon<i>/energy1_input
+Date:		August 2023
+KernelVersion:	6.4
+Contact:	intel-xe@lists.freedesktop.org
+Description:	RO. Energy input of device or gt in microjoules.
+
+		For xe device level hwmon devices (name "xe") this
+		reflects energy input for the entire device. For gt level
+		hwmon devices (name "xe_gtN") this reflects energy input
+		for the gt.
+
+		Only supported for particular Intel xe graphics platforms.
diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
index cc452ec999fc..8819b934a592 100644
--- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
@@ -400,8 +400,10 @@ 
 #define XEHPC_BCS5_BCS6_INTR_MASK		XE_REG(0x190118)
 #define XEHPC_BCS7_BCS8_INTR_MASK		XE_REG(0x19011c)
 
+#define PVC_GT0_PACKAGE_ENERGY_STATUS		XE_REG(0x281004)
 #define PVC_GT0_PACKAGE_RAPL_LIMIT		XE_REG(0x281008)
 #define PVC_GT0_PACKAGE_POWER_SKU_UNIT		XE_REG(0x281068)
+#define PVC_GT0_PLATFORM_ENERGY_STATUS		XE_REG(0x28106c)
 #define PVC_GT0_PACKAGE_POWER_SKU		XE_REG(0x281080)
 
 #endif
diff --git a/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h b/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h
index cb2d49b5c8a9..473a44bd7c56 100644
--- a/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h
@@ -25,6 +25,9 @@ 
 
 #define PCU_CR_PACKAGE_POWER_SKU_UNIT		XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x5938)
 #define   PKG_PWR_UNIT				REG_GENMASK(3, 0)
+#define   PKG_ENERGY_UNIT			REG_GENMASK(12, 8)
+
+#define PCU_CR_PACKAGE_ENERGY_STATUS		XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x593c)
 
 #define PCU_CR_PACKAGE_RAPL_LIMIT		XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x59a0)
 #define   PKG_PWR_LIM_1				REG_GENMASK(14, 0)
diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c
index 3e69cd79c1e2..a337edcebae5 100644
--- a/drivers/gpu/drm/xe/xe_hwmon.c
+++ b/drivers/gpu/drm/xe/xe_hwmon.c
@@ -22,6 +22,8 @@  enum hwmon_reg_name {
 	REG_PKG_POWER_SKU,
 	REG_PKG_POWER_SKU_UNIT,
 	REG_GT_PERF_STATUS,
+	REG_ENERGY_STATUS_ALL,
+	REG_ENERGY_STATUS_GT,
 };
 
 enum hwmon_reg_operation {
@@ -30,31 +32,50 @@  enum hwmon_reg_operation {
 	REG_RMW,
 };
 
+enum xe_hwmon_device_type {
+	HWMON_GT,
+	HWMON_DEVICE,
+};
+
 /*
  * SF_* - scale factors for particular quantities according to hwmon spec.
  * - power  - microwatts
  * - curr   - milliamperes
  * - voltage  - millivolts
+ * - energy - microjoules
  */
 #define SF_POWER	1000000
 #define SF_CURR		1000
 #define SF_VOLTAGE	1000
+#define SF_ENERGY	1000000
+
+struct hwmon_energy_info {
+	u32 reg_val_prev;
+	long accum_energy;		/* Accumulated energy for energy1_input */
+};
 
 struct xe_hwmon_data {
 	struct device *hwmon_dev;
 	struct xe_gt *gt;
 	char name[12];
+	struct hwmon_energy_info ei;	/*  Energy info for energy1_input */
+	enum xe_hwmon_device_type type;
 };
 
 struct xe_hwmon {
 	struct xe_hwmon_data ddat;
+	struct xe_hwmon_data ddat_gt[XE_MAX_TILES_PER_DEVICE];
 	struct mutex hwmon_lock; /* rmw operations*/
 	bool reset_in_progress;
 	wait_queue_head_t waitq;
 	int scl_shift_power;
+	int scl_shift_energy;
 };
 
-#define ddat_to_xe_hwmon(ddat)	({ container_of(ddat, struct xe_hwmon, ddat); })
+#define ddat_to_xe_hwmon(ddat)	\
+	({ ddat->type == HWMON_GT ?	\
+		container_of(ddat, struct xe_hwmon, ddat_gt[ddat->gt->info.id]) :	\
+		container_of(ddat, struct xe_hwmon, ddat); })
 
 static u32 hwmon_get_reg(struct xe_hwmon_data *ddat, enum hwmon_reg_name reg_name)
 {
@@ -84,6 +105,16 @@  static u32 hwmon_get_reg(struct xe_hwmon_data *ddat, enum hwmon_reg_name reg_nam
 		if (xe->info.platform == XE_DG2)
 			reg = GT_PERF_STATUS;
 		break;
+	case REG_ENERGY_STATUS_ALL:
+		if (xe->info.platform == XE_DG2)
+			reg = PCU_CR_PACKAGE_ENERGY_STATUS;
+		else if (xe->info.platform == XE_PVC)
+			reg = PVC_GT0_PLATFORM_ENERGY_STATUS;
+		break;
+	case REG_ENERGY_STATUS_GT:
+		if (xe->info.platform == XE_PVC)
+			reg = PVC_GT0_PACKAGE_ENERGY_STATUS;
+		break;
 	default:
 		XE_MISSING_CASE(reg_name);
 		break;
@@ -228,10 +259,69 @@  static int hwmon_power_rated_max_read(struct xe_hwmon_data *ddat, long *value)
 	return 0;
 }
 
+/*
+ * hwmon_energy_get - Obtain energy value
+ *
+ * The underlying energy hardware register is 32-bits and is subject to
+ * overflow. How long before overflow? For example, with an example
+ * scaling bit shift of 14 bits (see register *PACKAGE_POWER_SKU_UNIT) and
+ * a power draw of 1000 watts, the 32-bit counter will overflow in
+ * approximately 4.36 minutes.
+ *
+ * Examples:
+ *    1 watt:  (2^32 >> 14) /    1 W / (60 * 60 * 24) secs/day -> 3 days
+ * 1000 watts: (2^32 >> 14) / 1000 W / 60             secs/min -> 4.36 minutes
+ *
+ * The function significantly increases overflow duration (from 4.36
+ * minutes) by accumulating the energy register into a 'long' as allowed by
+ * the hwmon API. Using x86_64 128 bit arithmetic (see mul_u64_u32_shr()),
+ * a 'long' of 63 bits, SF_ENERGY of 1e6 (~20 bits) and
+ * hwmon->scl_shift_energy of 14 bits we have 57 (63 - 20 + 14) bits before
+ * energy1_input overflows. This at 1000 W is an overflow duration of 278 years.
+ *
+ * The accumulated value is only exact if this function runs at least once
+ * per hardware wrap interval; additional wraps between two reads are lost.
+ */
+static void
+hwmon_energy_get(struct xe_hwmon_data *ddat, long *energy)
+{
+	struct xe_hwmon *hwmon = ddat_to_xe_hwmon(ddat);
+	struct hwmon_energy_info *ei = &ddat->ei;
+	u32 reg_val;
+
+	xe_device_mem_access_get(gt_to_xe(ddat->gt));
+
+	mutex_lock(&hwmon->hwmon_lock);
+
+	if (ddat->type == HWMON_GT)
+		process_hwmon_reg(ddat, REG_ENERGY_STATUS_GT, REG_READ,
+				  &reg_val, 0, 0);
+	else
+		process_hwmon_reg(ddat, REG_ENERGY_STATUS_ALL, REG_READ,
+				  &reg_val, 0, 0);
+
+	/* u32 subtraction is modulo 2^32: exact across one counter wrap */
+	ei->accum_energy += (u32)(reg_val - ei->reg_val_prev);
+	ei->reg_val_prev = reg_val;
+
+	*energy = mul_u64_u32_shr(ei->accum_energy, SF_ENERGY,
+				  hwmon->scl_shift_energy);
+
+	mutex_unlock(&hwmon->hwmon_lock);
+
+	xe_device_mem_access_put(gt_to_xe(ddat->gt));
+}
+
 static const struct hwmon_channel_info *hwmon_info[] = {
 	HWMON_CHANNEL_INFO(power, HWMON_P_MAX | HWMON_P_RATED_MAX | HWMON_P_CRIT),
 	HWMON_CHANNEL_INFO(curr, HWMON_C_CRIT),
 	HWMON_CHANNEL_INFO(in, HWMON_I_INPUT),
+	HWMON_CHANNEL_INFO(energy, HWMON_E_INPUT),
+	NULL
+};
+
+static const struct hwmon_channel_info *hwmon_gt_info[] = {
+	HWMON_CHANNEL_INFO(energy, HWMON_E_INPUT),
 	NULL
 };
 
@@ -449,6 +539,32 @@  hwmon_in_read(struct xe_hwmon_data *ddat, u32 attr, long *val)
 	return ret;
 }
 
+static umode_t
+hwmon_energy_is_visible(struct xe_hwmon_data *ddat, u32 attr)
+{
+	switch (attr) {
+	case hwmon_energy_input:
+		if (ddat->type == HWMON_GT)
+			return hwmon_get_reg(ddat, REG_ENERGY_STATUS_GT) ? 0444 : 0;
+		else
+			return hwmon_get_reg(ddat, REG_ENERGY_STATUS_ALL) ? 0444 : 0;
+	default:
+		return 0;
+	}
+}
+
+static int
+hwmon_energy_read(struct xe_hwmon_data *ddat, u32 attr, long *val)
+{
+	switch (attr) {
+	case hwmon_energy_input:
+		hwmon_energy_get(ddat, val);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static umode_t
 hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
 		 u32 attr, int channel)
@@ -468,6 +584,9 @@  hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
 	case hwmon_in:
 		ret = hwmon_in_is_visible(ddat, attr);
 		break;
+	case hwmon_energy:
+		ret = hwmon_energy_is_visible(ddat, attr);
+		break;
 	default:
 		ret = 0;
 		break;
@@ -497,6 +616,9 @@  hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32 attr,
 	case hwmon_in:
 		ret = hwmon_in_read(ddat, attr, val);
 		break;
+	case hwmon_energy:
+		ret = hwmon_energy_read(ddat, attr, val);
+		break;
 	default:
 		ret = -EOPNOTSUPP;
 		break;
@@ -544,12 +666,53 @@  static const struct hwmon_chip_info hwmon_chip_info = {
 	.info = hwmon_info,
 };
 
+static umode_t
+hwmon_gt_is_visible(const void *drvdata, enum hwmon_sensor_types type,
+		    u32 attr, int channel)
+{
+	struct xe_hwmon_data *ddat = (struct xe_hwmon_data *)drvdata;
+
+	switch (type) {
+	case hwmon_energy:
+		return hwmon_energy_is_visible(ddat, attr);
+	default:
+		return 0;
+	}
+}
+
+static int
+hwmon_gt_read(struct device *dev, enum hwmon_sensor_types type, u32 attr,
+	      int channel, long *val)
+{
+	struct xe_hwmon_data *ddat = dev_get_drvdata(dev);
+
+	switch (type) {
+	case hwmon_energy:
+		return hwmon_energy_read(ddat, attr, val);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static const struct hwmon_ops hwmon_gt_ops = {
+	.is_visible = hwmon_gt_is_visible,
+	.read = hwmon_gt_read,
+};
+
+static const struct hwmon_chip_info hwmon_gt_chip_info = {
+	.ops = &hwmon_gt_ops,
+	.info = hwmon_gt_info,
+};
+
 static void
 hwmon_get_preregistration_info(struct xe_device *xe)
 {
 	struct xe_hwmon *hwmon = xe->hwmon;
 	struct xe_hwmon_data *ddat = &hwmon->ddat;
+	struct xe_gt *gt;
+	long energy;
 	u32 val_sku_unit = 0;
+	u8 id;
 	int ret;
 
 	ret = process_hwmon_reg(ddat, REG_PKG_POWER_SKU_UNIT, REG_READ, &val_sku_unit, 0, 0);
@@ -557,8 +720,22 @@  hwmon_get_preregistration_info(struct xe_device *xe)
 	 * The contents of register PKG_POWER_SKU_UNIT do not change,
 	 * so read it once and store the shift values.
 	 */
-	if (!ret)
+	if (!ret) {
 		hwmon->scl_shift_power = REG_FIELD_GET(PKG_PWR_UNIT, val_sku_unit);
+		hwmon->scl_shift_energy = REG_FIELD_GET(PKG_ENERGY_UNIT, val_sku_unit);
+	}
+
+	/*
+	 * Initialize 'struct hwmon_energy_info', i.e. set fields to the
+	 * first value of the energy register read
+	 */
+	if (hwmon_is_visible(ddat, hwmon_energy, hwmon_energy_input, 0))
+		hwmon_energy_get(ddat, &energy);
+
+	for_each_gt(gt, xe, id)
+		if (hwmon_gt_is_visible(&hwmon->ddat_gt[id], hwmon_energy,
+					hwmon_energy_input, 0))
+			hwmon_energy_get(&hwmon->ddat_gt[id], &energy);
 }
 
 void xe_hwmon_register(struct xe_device *xe)
@@ -567,6 +744,9 @@  void xe_hwmon_register(struct xe_device *xe)
 	struct xe_hwmon *hwmon;
 	struct device *hwmon_dev;
 	struct xe_hwmon_data *ddat;
+	struct xe_hwmon_data *ddat_gt;
+	struct xe_gt *gt;
+	u8 id;
 
 	/* hwmon is available only for dGfx */
 	if (!IS_DGFX(xe))
@@ -583,13 +763,21 @@  void xe_hwmon_register(struct xe_device *xe)
 
 	/* primary GT to access device level properties */
 	ddat->gt = xe->tiles[0].primary_gt;
+	ddat->type = HWMON_DEVICE;
 
 	snprintf(ddat->name, sizeof(ddat->name), "xe");
 
-	hwmon_get_preregistration_info(xe);
-
 	init_waitqueue_head(&hwmon->waitq);
 
+	for_each_gt(gt, xe, id) {
+		ddat_gt = hwmon->ddat_gt + id;
+		ddat_gt->gt = gt;
+		snprintf(ddat_gt->name, sizeof(ddat_gt->name), "xe_gt%u", id);
+		ddat_gt->type = HWMON_GT;
+	}
+
+	hwmon_get_preregistration_info(xe);
+
 	drm_dbg(&xe->drm, "Register xe hwmon interface\n");
 
 	/* hwmon_dev points to device hwmon<i> */
@@ -605,6 +793,26 @@  void xe_hwmon_register(struct xe_device *xe)
 	}
 
 	ddat->hwmon_dev = hwmon_dev;
+
+	for_each_gt(gt, xe, id) {
+		ddat_gt = hwmon->ddat_gt + id;
+		/*
+		 * Create per-gt directories only if a per-gt attribute is
+		 * visible. Currently this is only energy
+		 */
+		if (!hwmon_gt_is_visible(ddat_gt, hwmon_energy, hwmon_energy_input, 0))
+			continue;
+
+		hwmon_dev = devm_hwmon_device_register_with_info(dev, ddat_gt->name,
+								 ddat_gt,
+								 &hwmon_gt_chip_info,
+								 NULL);
+		if (IS_ERR(hwmon_dev))
+			drm_warn(&xe->drm, "Fail to register xe_gt %d hwmon, Err:%ld\n",
+				 id, PTR_ERR(hwmon_dev));
+		else
+			ddat_gt->hwmon_dev = hwmon_dev;
+	}
 }
 
 void xe_hwmon_unregister(struct xe_device *xe)