diff mbox series

[2/3] hwmon: surface_temp: Add support for sensor names

Message ID 20240330112409.3402943-3-luzmaximilian@gmail.com (mailing list archive)
State Changes Requested
Headers show
Series Add thermal sensor driver for Surface Aggregator | expand

Commit Message

Maximilian Luz March 30, 2024, 11:24 a.m. UTC
From: Ivor Wanders <ivor@iwanders.net>

The thermal subsystem of the Surface Aggregator Module allows us to
query the names of the respective thermal sensors. Forward those to
userspace.

Signed-off-by: Ivor Wanders <ivor@iwanders.net>
Co-developed-by: Maximilian Luz <luzmaximilian@gmail.com>
Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
---
 drivers/hwmon/surface_temp.c | 112 +++++++++++++++++++++++++++++------
 1 file changed, 95 insertions(+), 17 deletions(-)

Comments

Hans de Goede April 8, 2024, 1:34 p.m. UTC | #1
Hi,

On 3/30/24 12:24 PM, Maximilian Luz wrote:
> From: Ivor Wanders <ivor@iwanders.net>
> 
> The thermal subsystem of the Surface Aggregator Module allows us to
> query the names of the respective thermal sensors. Forward those to
> userspace.
> 
> Signed-off-by: Ivor Wanders <ivor@iwanders.net>
> Co-developed-by: Maximilian Luz <luzmaximilian@gmail.com>
> Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>

Thanks, patch looks good to me:

Reviewed-by: Hans de Goede <hdegoede@redhat.com>

Regards,

Hans



> ---
>  drivers/hwmon/surface_temp.c | 112 +++++++++++++++++++++++++++++------
>  1 file changed, 95 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/hwmon/surface_temp.c b/drivers/hwmon/surface_temp.c
> index 48c3e826713f6..7a2e1f638336c 100644
> --- a/drivers/hwmon/surface_temp.c
> +++ b/drivers/hwmon/surface_temp.c
> @@ -17,6 +17,27 @@
>  
>  /* -- SAM interface. -------------------------------------------------------- */
>  
> +/*
> + * Available sensors are indicated by a 16-bit bitfield, where a 1 marks the
> + * presence of a sensor. So we have at most 16 possible sensors/channels.
> + */
> +#define SSAM_TMP_SENSOR_MAX_COUNT 16
> +
> +/*
> + * All names observed so far are 6 characters long, but there's only
> + * zeros after the name, so perhaps they can be longer. This number reflects
> + * the maximum zero-padded space observed in the returned buffer.
> + */
> +#define SSAM_TMP_SENSOR_NAME_LENGTH 18
> +
> +struct ssam_tmp_get_name_rsp {
> +	__le16 unknown1;
> +	char unknown2;
> +	char name[SSAM_TMP_SENSOR_NAME_LENGTH];
> +} __packed;
> +
> +static_assert(sizeof(struct ssam_tmp_get_name_rsp) == 21);
> +
>  SSAM_DEFINE_SYNC_REQUEST_CL_R(__ssam_tmp_get_available_sensors, __le16, {
>  	.target_category = SSAM_SSH_TC_TMP,
>  	.command_id      = 0x04,
> @@ -27,6 +48,11 @@ SSAM_DEFINE_SYNC_REQUEST_MD_R(__ssam_tmp_get_temperature, __le16, {
>  	.command_id      = 0x01,
>  });
>  
> +SSAM_DEFINE_SYNC_REQUEST_MD_R(__ssam_tmp_get_name, struct ssam_tmp_get_name_rsp, {
> +	.target_category = SSAM_SSH_TC_TMP,
> +	.command_id      = 0x0e,
> +});
> +
>  static int ssam_tmp_get_available_sensors(struct ssam_device *sdev, s16 *sensors)
>  {
>  	__le16 sensors_le;
> @@ -54,12 +80,37 @@ static int ssam_tmp_get_temperature(struct ssam_device *sdev, u8 iid, long *temp
>  	return 0;
>  }
>  
> +static int ssam_tmp_get_name(struct ssam_device *sdev, u8 iid, char *buf, size_t buf_len)
> +{
> +	struct ssam_tmp_get_name_rsp name_rsp;
> +	int status;
> +
> +	status =  __ssam_tmp_get_name(sdev->ctrl, sdev->uid.target, iid, &name_rsp);
> +	if (status)
> +		return status;
> +
> +	/*
> +	 * This should not fail unless the name in the returned struct is not
> +	 * null-terminated or someone changed something in the struct
> +	 * definitions above, since our buffer and struct have the same
> +	 * capacity by design. So if this fails blow this up with a warning.
> +	 * Since the more likely cause is that the returned string isn't
> +	 * null-terminated, we might have received garbage (as opposed to just
> +	 * an incomplete string), so also fail the function.
> +	 */
> +	status = strscpy(buf, name_rsp.name, buf_len);
> +	WARN_ON(status < 0);
> +
> +	return status < 0 ? status : 0;
> +}
> +
>  
>  /* -- Driver.---------------------------------------------------------------- */
>  
>  struct ssam_temp {
>  	struct ssam_device *sdev;
>  	s16 sensors;
> +	char names[SSAM_TMP_SENSOR_MAX_COUNT][SSAM_TMP_SENSOR_NAME_LENGTH];
>  };
>  
>  static umode_t ssam_temp_hwmon_is_visible(const void *data,
> @@ -83,33 +134,47 @@ static int ssam_temp_hwmon_read(struct device *dev,
>  	return ssam_tmp_get_temperature(ssam_temp->sdev, channel + 1, value);
>  }
>  
> +static int ssam_temp_hwmon_read_string(struct device *dev,
> +				       enum hwmon_sensor_types type,
> +				       u32 attr, int channel, const char **str)
> +{
> +	const struct ssam_temp *ssam_temp = dev_get_drvdata(dev);
> +
> +	*str = ssam_temp->names[channel];
> +	return 0;
> +}
> +
>  static const struct hwmon_channel_info * const ssam_temp_hwmon_info[] = {
>  	HWMON_CHANNEL_INFO(chip,
>  			   HWMON_C_REGISTER_TZ),
> -	/* We have at most 16 thermal sensor channels. */
> +	/*
> +	 * We have at most SSAM_TMP_SENSOR_MAX_COUNT = 16 thermal sensor
> +	 * channels.
> +	 */
>  	HWMON_CHANNEL_INFO(temp,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT),
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL),
>  	NULL
>  };
>  
>  static const struct hwmon_ops ssam_temp_hwmon_ops = {
>  	.is_visible = ssam_temp_hwmon_is_visible,
>  	.read = ssam_temp_hwmon_read,
> +	.read_string = ssam_temp_hwmon_read_string,
>  };
>  
>  static const struct hwmon_chip_info ssam_temp_hwmon_chip_info = {
> @@ -122,6 +187,7 @@ static int ssam_temp_probe(struct ssam_device *sdev)
>  	struct ssam_temp *ssam_temp;
>  	struct device *hwmon_dev;
>  	s16 sensors;
> +	int channel;
>  	int status;
>  
>  	status = ssam_tmp_get_available_sensors(sdev, &sensors);
> @@ -135,6 +201,18 @@ static int ssam_temp_probe(struct ssam_device *sdev)
>  	ssam_temp->sdev = sdev;
>  	ssam_temp->sensors = sensors;
>  
> +	/* Retrieve the name for each available sensor. */
> +	for (channel = 0; channel < SSAM_TMP_SENSOR_MAX_COUNT; channel++) {
> +		if (!(sensors & BIT(channel)))
> +			continue;
> +
> +		status = ssam_tmp_get_name(sdev, channel + 1,
> +					   ssam_temp->names[channel],
> +					   SSAM_TMP_SENSOR_NAME_LENGTH);
> +		if (status)
> +			return status;
> +	}
> +
>  	hwmon_dev = devm_hwmon_device_register_with_info(&sdev->dev,
>  			"surface_thermal", ssam_temp, &ssam_temp_hwmon_chip_info,
>  			NULL);
Guenter Roeck April 16, 2024, 1:30 p.m. UTC | #2
On Sat, Mar 30, 2024 at 12:24:01PM +0100, Maximilian Luz wrote:
> From: Ivor Wanders <ivor@iwanders.net>
> 
> The thermal subsystem of the Surface Aggregator Module allows us to
> query the names of the respective thermal sensors. Forward those to
> userspace.
> 
> Signed-off-by: Ivor Wanders <ivor@iwanders.net>
> Co-developed-by: Maximilian Luz <luzmaximilian@gmail.com>
> Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
> Reviewed-by: Hans de Goede <hdegoede@redhat.com>
> ---
>  drivers/hwmon/surface_temp.c | 112 +++++++++++++++++++++++++++++------
>  1 file changed, 95 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/hwmon/surface_temp.c b/drivers/hwmon/surface_temp.c
> index 48c3e826713f6..7a2e1f638336c 100644
> --- a/drivers/hwmon/surface_temp.c
> +++ b/drivers/hwmon/surface_temp.c
> @@ -17,6 +17,27 @@
>  
>  /* -- SAM interface. -------------------------------------------------------- */
>  
> +/*
> + * Available sensors are indicated by a 16-bit bitfield, where a 1 marks the
> + * presence of a sensor. So we have at most 16 possible sensors/channels.
> + */
> +#define SSAM_TMP_SENSOR_MAX_COUNT 16

#define<space>DEFINITION<tab>value

> +
> +/*
> + * All names observed so far are 6 characters long, but there's only
> + * zeros after the name, so perhaps they can be longer. This number reflects
> + * the maximum zero-padded space observed in the returned buffer.
> + */
> +#define SSAM_TMP_SENSOR_NAME_LENGTH 18
> +
> +struct ssam_tmp_get_name_rsp {
> +	__le16 unknown1;
> +	char unknown2;
> +	char name[SSAM_TMP_SENSOR_NAME_LENGTH];
> +} __packed;
> +
> +static_assert(sizeof(struct ssam_tmp_get_name_rsp) == 21);
> +
>  SSAM_DEFINE_SYNC_REQUEST_CL_R(__ssam_tmp_get_available_sensors, __le16, {
>  	.target_category = SSAM_SSH_TC_TMP,
>  	.command_id      = 0x04,
> @@ -27,6 +48,11 @@ SSAM_DEFINE_SYNC_REQUEST_MD_R(__ssam_tmp_get_temperature, __le16, {
>  	.command_id      = 0x01,
>  });
>  
> +SSAM_DEFINE_SYNC_REQUEST_MD_R(__ssam_tmp_get_name, struct ssam_tmp_get_name_rsp, {
> +	.target_category = SSAM_SSH_TC_TMP,
> +	.command_id      = 0x0e,
> +});
> +
>  static int ssam_tmp_get_available_sensors(struct ssam_device *sdev, s16 *sensors)
>  {
>  	__le16 sensors_le;
> @@ -54,12 +80,37 @@ static int ssam_tmp_get_temperature(struct ssam_device *sdev, u8 iid, long *temp
>  	return 0;
>  }
>  
> +static int ssam_tmp_get_name(struct ssam_device *sdev, u8 iid, char *buf, size_t buf_len)
> +{
> +	struct ssam_tmp_get_name_rsp name_rsp;
> +	int status;
> +
> +	status =  __ssam_tmp_get_name(sdev->ctrl, sdev->uid.target, iid, &name_rsp);
> +	if (status)
> +		return status;
> +
> +	/*
> +	 * This should not fail unless the name in the returned struct is not
> +	 * null-terminated or someone changed something in the struct
> +	 * definitions above, since our buffer and struct have the same
> +	 * capacity by design. So if this fails blow this up with a warning.
> +	 * Since the more likely cause is that the returned string isn't
> +	 * null-terminated, we might have received garbage (as opposed to just
> +	 * an incomplete string), so also fail the function.
> +	 */
> +	status = strscpy(buf, name_rsp.name, buf_len);
> +	WARN_ON(status < 0);

Not acceptable. From include/asm-generic/bug.h:

 * Do not use these macros when checking for invalid external inputs
 * (e.g. invalid system call arguments, or invalid data coming from
 * network/devices), and on transient conditions like ENOMEM or EAGAIN.
 * These macros should be used for recoverable kernel issues only.

> +
> +	return status < 0 ? status : 0;
> +}
> +
>  
>  /* -- Driver.---------------------------------------------------------------- */
>  
>  struct ssam_temp {
>  	struct ssam_device *sdev;
>  	s16 sensors;
> +	char names[SSAM_TMP_SENSOR_MAX_COUNT][SSAM_TMP_SENSOR_NAME_LENGTH];
>  };
>  
>  static umode_t ssam_temp_hwmon_is_visible(const void *data,
> @@ -83,33 +134,47 @@ static int ssam_temp_hwmon_read(struct device *dev,
>  	return ssam_tmp_get_temperature(ssam_temp->sdev, channel + 1, value);
>  }
>  
> +static int ssam_temp_hwmon_read_string(struct device *dev,
> +				       enum hwmon_sensor_types type,
> +				       u32 attr, int channel, const char **str)
> +{
> +	const struct ssam_temp *ssam_temp = dev_get_drvdata(dev);
> +
> +	*str = ssam_temp->names[channel];
> +	return 0;
> +}
> +
>  static const struct hwmon_channel_info * const ssam_temp_hwmon_info[] = {
>  	HWMON_CHANNEL_INFO(chip,
>  			   HWMON_C_REGISTER_TZ),
> -	/* We have at most 16 thermal sensor channels. */
> +	/*
> +	 * We have at most SSAM_TMP_SENSOR_MAX_COUNT = 16 thermal sensor
> +	 * channels.
> +	 */

Pointless comment. Already explained above, and perfect example showing
why it has no value separating this and the previous patch.

>  	HWMON_CHANNEL_INFO(temp,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT,
> -			   HWMON_T_INPUT),
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL,
> +			   HWMON_T_INPUT | HWMON_T_LABEL),

Another example. Why have me review the previous patch
just to change the code here ?

[ ... ]

> +	/* Retrieve the name for each available sensor. */
> +	for (channel = 0; channel < SSAM_TMP_SENSOR_MAX_COUNT; channel++) {
> +		if (!(sensors & BIT(channel)))
> +			continue;
> +
> +		status = ssam_tmp_get_name(sdev, channel + 1,
> +					   ssam_temp->names[channel],
> +					   SSAM_TMP_SENSOR_NAME_LENGTH);
> +		if (status)
> +			return status;

Your call to fail probe in this case just because it can not find
a sensor name. I personally find that quite aggressive.

Guenter
Maximilian Luz April 16, 2024, 7 p.m. UTC | #3
On 4/16/24 3:30 PM, Guenter Roeck wrote:
> On Sat, Mar 30, 2024 at 12:24:01PM +0100, Maximilian Luz wrote:

[...]

>> +static int ssam_tmp_get_name(struct ssam_device *sdev, u8 iid, char *buf, size_t buf_len)
>> +{
>> +	struct ssam_tmp_get_name_rsp name_rsp;
>> +	int status;
>> +
>> +	status =  __ssam_tmp_get_name(sdev->ctrl, sdev->uid.target, iid, &name_rsp);
>> +	if (status)
>> +		return status;
>> +
>> +	/*
>> +	 * This should not fail unless the name in the returned struct is not
>> +	 * null-terminated or someone changed something in the struct
>> +	 * definitions above, since our buffer and struct have the same
>> +	 * capacity by design. So if this fails blow this up with a warning.
>> +	 * Since the more likely cause is that the returned string isn't
>> +	 * null-terminated, we might have received garbage (as opposed to just
>> +	 * an incomplete string), so also fail the function.
>> +	 */
>> +	status = strscpy(buf, name_rsp.name, buf_len);
>> +	WARN_ON(status < 0);
> 
> Not acceptable. From include/asm-generic/bug.h:
> 
>   * Do not use these macros when checking for invalid external inputs
>   * (e.g. invalid system call arguments, or invalid data coming from
>   * network/devices), and on transient conditions like ENOMEM or EAGAIN.
>   * These macros should be used for recoverable kernel issues only.
>

Hmm, I always interpreted that as "do not use for checking user-defined
input", which this is not.

The reason I added/requested it here was to check for "bugs" in how we
think the interface behaves (and our definitions related to it) as the
interface was reverse-engineered. Generally, when this fails I expect
that we made some mistake in our code (or the things we assume about the
interface), which likely causes us to interpret the received data as
"garbage" (and not the EC sending corrupted data, which it is generally
not due to CRC checking and validation in the SAM driver). Hence, I
personally would prefer if this blows up in a big warning with a trace
attached to it, so that an end-user can easily report this to us and
that we can appropriately deal with it. As opposed to some one-line
error message that will likely get overlooked or not taken as seriously.

If you still insist, I could change that to a dev_err() message. Or
maybe make the comment a bit clearer.

>> +
>> +	return status < 0 ? status : 0;
>> +}
>> +
>>   
>>   /* -- Driver.---------------------------------------------------------------- */
>>   
>>   struct ssam_temp {
>>   	struct ssam_device *sdev;
>>   	s16 sensors;
>> +	char names[SSAM_TMP_SENSOR_MAX_COUNT][SSAM_TMP_SENSOR_NAME_LENGTH];
>>   };
>>   
>>   static umode_t ssam_temp_hwmon_is_visible(const void *data,
>> @@ -83,33 +134,47 @@ static int ssam_temp_hwmon_read(struct device *dev,
>>   	return ssam_tmp_get_temperature(ssam_temp->sdev, channel + 1, value);
>>   }
>>   
>> +static int ssam_temp_hwmon_read_string(struct device *dev,
>> +				       enum hwmon_sensor_types type,
>> +				       u32 attr, int channel, const char **str)
>> +{
>> +	const struct ssam_temp *ssam_temp = dev_get_drvdata(dev);
>> +
>> +	*str = ssam_temp->names[channel];
>> +	return 0;
>> +}
>> +
>>   static const struct hwmon_channel_info * const ssam_temp_hwmon_info[] = {
>>   	HWMON_CHANNEL_INFO(chip,
>>   			   HWMON_C_REGISTER_TZ),
>> -	/* We have at most 16 thermal sensor channels. */
>> +	/*
>> +	 * We have at most SSAM_TMP_SENSOR_MAX_COUNT = 16 thermal sensor
>> +	 * channels.
>> +	 */
> 
> Pointless comment. Already explained above, and perfect example showing
> why it has no value separating this and the previous patch.

I can remove the comment.

The reason for it being two separate patches is to retain proper
attribution. I am sorry that this has created more work for you.

In short, Ivor reverse-engineered the interface calls to get the sensor
names and wrote the vast majority of this patch (I only changed some
smaller things and gave advice, hence the co-developed-by). Therefore I
wanted to give proper attribution to Ivor for his work and not squash it
into a single patch.

>>   	HWMON_CHANNEL_INFO(temp,
>> -			   HWMON_T_INPUT,
>> -			   HWMON_T_INPUT,
>> -			   HWMON_T_INPUT,
>> -			   HWMON_T_INPUT,
>> -			   HWMON_T_INPUT,
>> -			   HWMON_T_INPUT,
>> -			   HWMON_T_INPUT,
>> -			   HWMON_T_INPUT,
>> -			   HWMON_T_INPUT,
>> -			   HWMON_T_INPUT,
>> -			   HWMON_T_INPUT,
>> -			   HWMON_T_INPUT,
>> -			   HWMON_T_INPUT,
>> -			   HWMON_T_INPUT,
>> -			   HWMON_T_INPUT,
>> -			   HWMON_T_INPUT),
>> +			   HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_INPUT | HWMON_T_LABEL),
> 
> Another example. Why have me review the previous patch
> just to change the code here ?

See above.

> 
> [ ... ]
> 
>> +	/* Retrieve the name for each available sensor. */
>> +	for (channel = 0; channel < SSAM_TMP_SENSOR_MAX_COUNT; channel++) {
>> +		if (!(sensors & BIT(channel)))
>> +			continue;
>> +
>> +		status = ssam_tmp_get_name(sdev, channel + 1,
>> +					   ssam_temp->names[channel],
>> +					   SSAM_TMP_SENSOR_NAME_LENGTH);
>> +		if (status)
>> +			return status;
> 
> Your call to fail probe in this case just because it can not find
> a sensor name. I personally find that quite aggressive.

We generally do not expect this to fail. And I think if this fails, it
should either be something wrong with the EC communication (in a way
that breaks other things like reading temperature values as well) or in
our assumptions of how the interface works. So similar to above, I
personally would prefer this to fail in a more noisy way, so that people
are more likely to tell us about the failure.

As an example: We expect sensor names and the interface for it to be
present. However, maybe some device (either one we couldn't test or some
future one) does not implement the interface call. Usually, an
unsupported call results in a timeout error. So if we would just ignore
that, the driver would load slow (as for each name it would wait for the
timeout), but users might not notice as the temperature sensors can
still be accessed normally after that. I do see that as an easily
detectable bug. Letting it continue to load IMHO just creates a more
subtle bug.

As above, if you prefer I can change that. Just let me know.

Thank you for your review.

Best regards,
Max
Guenter Roeck April 16, 2024, 9:08 p.m. UTC | #4
On Tue, Apr 16, 2024 at 09:00:05PM +0200, Maximilian Luz wrote:
> On 4/16/24 3:30 PM, Guenter Roeck wrote:
> > On Sat, Mar 30, 2024 at 12:24:01PM +0100, Maximilian Luz wrote:
> 
> [...]
> 
> > > +static int ssam_tmp_get_name(struct ssam_device *sdev, u8 iid, char *buf, size_t buf_len)
> > > +{
> > > +	struct ssam_tmp_get_name_rsp name_rsp;
> > > +	int status;
> > > +
> > > +	status =  __ssam_tmp_get_name(sdev->ctrl, sdev->uid.target, iid, &name_rsp);
> > > +	if (status)
> > > +		return status;
> > > +
> > > +	/*
> > > +	 * This should not fail unless the name in the returned struct is not
> > > +	 * null-terminated or someone changed something in the struct
> > > +	 * definitions above, since our buffer and struct have the same
> > > +	 * capacity by design. So if this fails blow this up with a warning.
> > > +	 * Since the more likely cause is that the returned string isn't
> > > +	 * null-terminated, we might have received garbage (as opposed to just
> > > +	 * an incomplete string), so also fail the function.
> > > +	 */
> > > +	status = strscpy(buf, name_rsp.name, buf_len);
> > > +	WARN_ON(status < 0);
> > 
> > Not acceptable. From include/asm-generic/bug.h:
> > 
> >   * Do not use these macros when checking for invalid external inputs
> >   * (e.g. invalid system call arguments, or invalid data coming from
> >   * network/devices), and on transient conditions like ENOMEM or EAGAIN.
> >   * These macros should be used for recoverable kernel issues only.
> > 
> 
> Hmm, I always interpreted that as "do not use for checking user-defined
> input", which this is not.
> 

"invalid data coming from network/devices" is not user-defined input.

> The reason I added/requested it here was to check for "bugs" in how we
> think the interface behaves (and our definitions related to it) as the
> interface was reverse-engineered. Generally, when this fails I expect
> that we made some mistake in our code (or the things we assume about the
> interface), which likely causes us to interpret the received data as
> "garbage" (and not the EC sending corrupted data, which it is generally
> not due to CRC checking and validation in the SAM driver). Hence, I
> personally would prefer if this blows up in a big warning with a trace
> attached to it, so that an end-user can easily report this to us and
> that we can appropriately deal with it. As opposed to some one-line
> error message that will likely get overlooked or not taken as seriously.
> 

I have heard the "This backtrace is absolutely essential" argument before,
including the "will be fixed" part. Chromebooks report more than 500,000
warning backtraces _every single day_. None of them is getting fixed.

> If you still insist, I could change that to a dev_err() message. Or
> maybe make the comment a bit clearer.

dev_err() would be acceptable. WARN() or WARN_ON() are no-go.

Guenter
Ivor Wanders April 16, 2024, 9:34 p.m. UTC | #5
> Ivor reverse-engineered the interface calls to get the sensor
> names and wrote the vast majority of this patch (I only changed some
> smaller things and gave advice, hence the co-developed-by). Therefore I
> wanted to give proper attribution to Ivor for his work and not squash it
> into a single patch.

By all means squash them in the next patch revision to make things simpler
to review, I don't think it's a large enough piece of work to worry too
much about attribution if it makes review more difficult.

~Ivor
Maximilian Luz May 2, 2024, 8:04 p.m. UTC | #6
On 4/16/24 11:34 PM, Ivor Wanders wrote:
>> Ivor reverse-engineered the interface calls to get the sensor
>> names and wrote the vast majority of this patch (I only changed some
>> smaller things and gave advice, hence the co-developed-by). Therefore I
>> wanted to give proper attribution to Ivor for his work and not squash it
>> into a single patch.
> 
> By all means squash them in the next patch revision to make things simpler
> to review, I don't think it's a large enough piece of work to worry too
> much about attribution if it makes review more difficult.

Alright, I will do that then.

Thanks Ivor!

Best regards,
Max
Maximilian Luz May 2, 2024, 8:05 p.m. UTC | #7
Hi,

On 4/16/24 11:08 PM, Guenter Roeck wrote:
> On Tue, Apr 16, 2024 at 09:00:05PM +0200, Maximilian Luz wrote:
>> On 4/16/24 3:30 PM, Guenter Roeck wrote:
>>> On Sat, Mar 30, 2024 at 12:24:01PM +0100, Maximilian Luz wrote:
>>
>> [...]
>>
>>>> +static int ssam_tmp_get_name(struct ssam_device *sdev, u8 iid, char *buf, size_t buf_len)
>>>> +{
>>>> +	struct ssam_tmp_get_name_rsp name_rsp;
>>>> +	int status;
>>>> +
>>>> +	status =  __ssam_tmp_get_name(sdev->ctrl, sdev->uid.target, iid, &name_rsp);
>>>> +	if (status)
>>>> +		return status;
>>>> +
>>>> +	/*
>>>> +	 * This should not fail unless the name in the returned struct is not
>>>> +	 * null-terminated or someone changed something in the struct
>>>> +	 * definitions above, since our buffer and struct have the same
>>>> +	 * capacity by design. So if this fails blow this up with a warning.
>>>> +	 * Since the more likely cause is that the returned string isn't
>>>> +	 * null-terminated, we might have received garbage (as opposed to just
>>>> +	 * an incomplete string), so also fail the function.
>>>> +	 */
>>>> +	status = strscpy(buf, name_rsp.name, buf_len);
>>>> +	WARN_ON(status < 0);
>>>
>>> Not acceptable. From include/asm-generic/bug.h:
>>>
>>>    * Do not use these macros when checking for invalid external inputs
>>>    * (e.g. invalid system call arguments, or invalid data coming from
>>>    * network/devices), and on transient conditions like ENOMEM or EAGAIN.
>>>    * These macros should be used for recoverable kernel issues only.
>>>
>>
>> Hmm, I always interpreted that as "do not use for checking user-defined
>> input", which this is not.
>>
> 
> "invalid data coming from network/devices" is not user-defined input.
> 
>> The reason I added/requested it here was to check for "bugs" in how we
>> think the interface behaves (and our definitions related to it) as the
>> interface was reverse-engineered. Generally, when this fails I expect
>> that we made some mistake in our code (or the things we assume about the
>> interface), which likely causes us to interpret the received data as
>> "garbage" (and not the EC sending corrupted data, which it is generally
>> not due to CRC checking and validation in the SAM driver). Hence, I
>> personally would prefer if this blows up in a big warning with a trace
>> attached to it, so that an end-user can easily report this to us and
>> that we can appropriately deal with it. As opposed to some one-line
>> error message that will likely get overlooked or not taken as seriously.
>>
> 
> I have heard the "This backtrace is absolutely essential" argument before,
> including the "will be fixed" part. Chromebooks report more than 500,000
> warning backtraces _every single day_. None of them is getting fixed.
> 
>> If you still insist, I could change that to a dev_err() message. Or
>> maybe make the comment a bit clearer.
> 
> dev_err() would be acceptable. WARN() or WARN_ON() are no-go.

Sorry for the delayed response. I will change this to a dev_err() then
and try to re-spin the patches this weekend.

Best regards,
Max
diff mbox series

Patch

diff --git a/drivers/hwmon/surface_temp.c b/drivers/hwmon/surface_temp.c
index 48c3e826713f6..7a2e1f638336c 100644
--- a/drivers/hwmon/surface_temp.c
+++ b/drivers/hwmon/surface_temp.c
@@ -17,6 +17,27 @@ 
 
 /* -- SAM interface. -------------------------------------------------------- */
 
+/*
+ * Available sensors are indicated by a 16-bit bitfield, where a 1 marks the
+ * presence of a sensor. So we have at most 16 possible sensors/channels.
+ */
+#define SSAM_TMP_SENSOR_MAX_COUNT 16
+
+/*
+ * All names observed so far are 6 characters long, but there's only
+ * zeros after the name, so perhaps they can be longer. This number reflects
+ * the maximum zero-padded space observed in the returned buffer.
+ */
+#define SSAM_TMP_SENSOR_NAME_LENGTH 18
+
+struct ssam_tmp_get_name_rsp {
+	__le16 unknown1;
+	char unknown2;
+	char name[SSAM_TMP_SENSOR_NAME_LENGTH];
+} __packed;
+
+static_assert(sizeof(struct ssam_tmp_get_name_rsp) == 21);
+
 SSAM_DEFINE_SYNC_REQUEST_CL_R(__ssam_tmp_get_available_sensors, __le16, {
 	.target_category = SSAM_SSH_TC_TMP,
 	.command_id      = 0x04,
@@ -27,6 +48,11 @@  SSAM_DEFINE_SYNC_REQUEST_MD_R(__ssam_tmp_get_temperature, __le16, {
 	.command_id      = 0x01,
 });
 
+SSAM_DEFINE_SYNC_REQUEST_MD_R(__ssam_tmp_get_name, struct ssam_tmp_get_name_rsp, {
+	.target_category = SSAM_SSH_TC_TMP,
+	.command_id      = 0x0e,
+});
+
 static int ssam_tmp_get_available_sensors(struct ssam_device *sdev, s16 *sensors)
 {
 	__le16 sensors_le;
@@ -54,12 +80,37 @@  static int ssam_tmp_get_temperature(struct ssam_device *sdev, u8 iid, long *temp
 	return 0;
 }
 
+static int ssam_tmp_get_name(struct ssam_device *sdev, u8 iid, char *buf, size_t buf_len)
+{
+	struct ssam_tmp_get_name_rsp name_rsp;
+	int status;
+
+	status =  __ssam_tmp_get_name(sdev->ctrl, sdev->uid.target, iid, &name_rsp);
+	if (status)
+		return status;
+
+	/*
+	 * This should not fail unless the name in the returned struct is not
+	 * null-terminated or someone changed something in the struct
+	 * definitions above, since our buffer and struct have the same
+	 * capacity by design. So if this fails blow this up with a warning.
+	 * Since the more likely cause is that the returned string isn't
+	 * null-terminated, we might have received garbage (as opposed to just
+	 * an incomplete string), so also fail the function.
+	 */
+	status = strscpy(buf, name_rsp.name, buf_len);
+	WARN_ON(status < 0);
+
+	return status < 0 ? status : 0;
+}
+
 
 /* -- Driver.---------------------------------------------------------------- */
 
 struct ssam_temp {
 	struct ssam_device *sdev;
 	s16 sensors;
+	char names[SSAM_TMP_SENSOR_MAX_COUNT][SSAM_TMP_SENSOR_NAME_LENGTH];
 };
 
 static umode_t ssam_temp_hwmon_is_visible(const void *data,
@@ -83,33 +134,47 @@  static int ssam_temp_hwmon_read(struct device *dev,
 	return ssam_tmp_get_temperature(ssam_temp->sdev, channel + 1, value);
 }
 
+static int ssam_temp_hwmon_read_string(struct device *dev,
+				       enum hwmon_sensor_types type,
+				       u32 attr, int channel, const char **str)
+{
+	const struct ssam_temp *ssam_temp = dev_get_drvdata(dev);
+
+	*str = ssam_temp->names[channel];
+	return 0;
+}
+
 static const struct hwmon_channel_info * const ssam_temp_hwmon_info[] = {
 	HWMON_CHANNEL_INFO(chip,
 			   HWMON_C_REGISTER_TZ),
-	/* We have at most 16 thermal sensor channels. */
+	/*
+	 * We have at most SSAM_TMP_SENSOR_MAX_COUNT = 16 thermal sensor
+	 * channels.
+	 */
 	HWMON_CHANNEL_INFO(temp,
-			   HWMON_T_INPUT,
-			   HWMON_T_INPUT,
-			   HWMON_T_INPUT,
-			   HWMON_T_INPUT,
-			   HWMON_T_INPUT,
-			   HWMON_T_INPUT,
-			   HWMON_T_INPUT,
-			   HWMON_T_INPUT,
-			   HWMON_T_INPUT,
-			   HWMON_T_INPUT,
-			   HWMON_T_INPUT,
-			   HWMON_T_INPUT,
-			   HWMON_T_INPUT,
-			   HWMON_T_INPUT,
-			   HWMON_T_INPUT,
-			   HWMON_T_INPUT),
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL),
 	NULL
 };
 
 static const struct hwmon_ops ssam_temp_hwmon_ops = {
 	.is_visible = ssam_temp_hwmon_is_visible,
 	.read = ssam_temp_hwmon_read,
+	.read_string = ssam_temp_hwmon_read_string,
 };
 
 static const struct hwmon_chip_info ssam_temp_hwmon_chip_info = {
@@ -122,6 +187,7 @@  static int ssam_temp_probe(struct ssam_device *sdev)
 	struct ssam_temp *ssam_temp;
 	struct device *hwmon_dev;
 	s16 sensors;
+	int channel;
 	int status;
 
 	status = ssam_tmp_get_available_sensors(sdev, &sensors);
@@ -135,6 +201,18 @@  static int ssam_temp_probe(struct ssam_device *sdev)
 	ssam_temp->sdev = sdev;
 	ssam_temp->sensors = sensors;
 
+	/* Retrieve the name for each available sensor. */
+	for (channel = 0; channel < SSAM_TMP_SENSOR_MAX_COUNT; channel++) {
+		if (!(sensors & BIT(channel)))
+			continue;
+
+		status = ssam_tmp_get_name(sdev, channel + 1,
+					   ssam_temp->names[channel],
+					   SSAM_TMP_SENSOR_NAME_LENGTH);
+		if (status)
+			return status;
+	}
+
 	hwmon_dev = devm_hwmon_device_register_with_info(&sdev->dev,
 			"surface_thermal", ssam_temp, &ssam_temp_hwmon_chip_info,
 			NULL);